|
1 """defines a validating HTML parser used in web application tests""" |
|
2 |
|
3 import re |
|
4 from StringIO import StringIO |
|
5 |
|
6 from lxml import etree |
|
7 from lxml.builder import E |
|
8 |
|
9 from cubicweb.common.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE, CW_XHTML_EXTENSIONS |
|
10 |
|
# Rebind both imported doctype templates to their final form: the CubicWeb
# XHTML extensions are substituted in, the result is coerced with str() and
# surrounding whitespace is stripped.  A generator expression is used so no
# loop variable ends up in the module namespace.
STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE = (
    str(template % CW_XHTML_EXTENSIONS).strip()
    for template in (STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
)

# module-level counter, initialised to zero (not used by the code shown here)
ERR_COUNT = 0
|
15 |
|
class Validator(object):
    """Base class of the markup validators.

    `parse_string` reads ``self.parser`` but this class never sets it:
    subclasses are expected to assign the lxml parser to use.
    """

    def parse_string(self, data, sysid=None):
        """Parse `data` and return a `PageInfo` describing the result.

        :param data: the markup to parse; it is first passed through
          `preprocess_data`
        :param sysid: not used by this implementation
        :raise AssertionError: when lxml raises ``XMLSyntaxError``; the new
          exception carries the original error's ``position`` attribute
        """
        try:
            data = self.preprocess_data(data)
            return PageInfo(data, etree.fromstring(data, self.parser))
        except etree.XMLSyntaxError as exc:
            # NOTE(review): this helper is never called from the code; it is
            # presumably meant to be invoked by hand from a post-mortem
            # debugger to dump the offending markup -- confirm or remove.
            def save_in(fname=''):
                """Write the markup that failed to parse into `fname`."""
                with open(fname, 'w') as stream:
                    stream.write(data)
            new_exc = AssertionError(u'invalid xml %s' % exc)
            new_exc.position = exc.position
            raise new_exc

    def preprocess_data(self, data):
        """Hook called before parsing; the base version returns `data` as is."""
        return data
|
31 |
|
32 |
|
class DTDValidator(Validator):
    """Validator whose XML parser is created with ``dtd_validation=True``."""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser(dtd_validation=True)

    def preprocess_data(self, data):
        """Strip every <blockquote> from documents using the strict doctype.

        Used to fix potential blockquote mess generated by docutils.

        Data that does not contain ``STRICT_DOCTYPE`` is returned untouched.
        Otherwise the document is parsed against the transitional DTD, all
        <blockquote> elements are removed, and the tree is serialized again
        behind an XML declaration and the strict doctype.

        :param data: the markup about to be validated
        :return: the markup to hand to the validating parser
        """
        if STRICT_DOCTYPE not in data:
            return data
        # parse using transitional DTD
        transitional = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
        tree = etree.fromstring(transitional, self.parser)
        namespace = tree.nsmap.get(None)
        if namespace:
            path = './/{%s}blockquote' % namespace
        else:
            path = './/blockquote'
        # Quick and dirty approach: remove all blockquotes.  A finer-grained
        # alternative (wrapping in a <div> the children that are not allowed
        # directly inside <blockquote>) was sketched here but never enabled.
        # NOTE(review): lxml's remove() takes the removed element's tail text
        # along with it, so text immediately following a blockquote is
        # dropped as well -- confirm this is acceptable.
        for blockquote in tree.findall(path):
            blockquote.getparent().remove(blockquote)
        data = etree.tostring(tree)
        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (STRICT_DOCTYPE, data)
|
74 |
|
75 |
|
class SaxOnlyValidator(Validator):
    """Validator parsing with an ``etree.XMLParser`` built with default options."""

    def __init__(self):
        Validator.__init__(self)
        xml_parser = etree.XMLParser()
        self.parser = xml_parser
|
81 |
|
class HTMLValidator(Validator):
    """Validator parsing with an ``etree.HTMLParser`` built with default options."""

    def __init__(self):
        Validator.__init__(self)
        html_parser = etree.HTMLParser()
        self.parser = html_parser
|
87 |
|
88 |
|
89 |
|
class PageInfo(object):
    """Hold various pieces of information about a view's output.

    Attributes set by the constructor:

    * ``source``: the markup given to the constructor
    * ``etree``: the root element given to the constructor
    * ``raw_text``: concatenation of everything ``//text()`` selects
    * ``namespace``: the root's ``nsmap``; ``default_ns`` is its entry
      for the ``None`` prefix, if any
    * ``a_tags`` and ``input_tags``: lists of ``(text, attrib)`` pairs
    * ``h1_tags`` .. ``h4_tags``: lists of heading texts
    * ``title_tags``: the four heading lists, ``h1`` first
    """

    def __init__(self, source, root):
        self.source = source
        self.etree = root
        self.raw_text = u''.join(root.xpath('//text()'))
        self.namespace = self.etree.nsmap
        self.default_ns = self.namespace.get(None)
        self.a_tags = self.find_tag('a')
        self.h1_tags = self.find_tag('h1')
        self.h2_tags = self.find_tag('h2')
        self.h3_tags = self.find_tag('h3')
        self.h4_tags = self.find_tag('h4')
        self.input_tags = self.find_tag('input')
        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]

    def find_tag(self, tag):
        """Return a list describing every `tag` element below the root.

        For ``a`` and ``input`` each item is an ``(elt.text, elt.attrib)``
        pair; for any other tag it is the concatenation of all the text
        nodes found under the element.  The lookup is qualified with the
        default namespace when the document declares one.
        """
        if self.default_ns is None:
            iterstr = ".//%s" % tag
        else:
            iterstr = ".//{%s}%s" % (self.default_ns, tag)
        if tag in ('a', 'input'):
            return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
        return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]

    def appears(self, text):
        """Return True if `text` appears in the page's text content."""
        return text in self.raw_text

    def __contains__(self, text):
        """Return True if `text` appears in the page's source markup."""
        return text in self.source

    def _title_lists(self, level):
        """Return the heading lists to search for the given `level`.

        ``None`` selects every level; otherwise `level` is 1-based and
        indexes ``title_tags`` (1 for h1, 2 for h2, etc.).
        """
        if level is None:
            return self.title_tags
        return [self.title_tags[level - 1]]

    @staticmethod
    def _href_matches(attrs, url):
        """Return True if `url` is None or equals the ``href`` of `attrs`."""
        return url is None or attrs.get('href') == url

    def has_title(self, text, level=None):
        """Return True if <h?>text</h?> was found in the page.

        :param level: the title's level (1 for h1, 2 for h2, etc.); when
          None, every level is searched
        """
        return any(text in hlist for hlist in self._title_lists(level))

    def has_title_regexp(self, pattern, level=None):
        """Return True if a title matching `pattern` was found in the page.

        The pattern is applied with ``match``, hence anchored at the
        beginning of the title.

        :param level: the title's level (1 for h1, 2 for h2, etc.); when
          None, every level is searched
        """
        sre = re.compile(pattern)
        return any(sre.match(title)
                   for hlist in self._title_lists(level)
                   for title in hlist)

    def has_link(self, text, url=None):
        """Return True if <a href=url>text</a> was found in the page.

        When `url` is None only the link text is compared.
        """
        for link_text, attrs in self.a_tags:
            if text == link_text and self._href_matches(attrs, url):
                return True
        return False

    def has_link_regexp(self, pattern, url=None):
        """Return True if <a href=url>pattern</a> was found in the page.

        The pattern is applied with ``match``, hence anchored at the
        beginning of the link text.  When `url` is None only the link text
        is checked.
        """
        sre = re.compile(pattern)
        for link_text, attrs in self.a_tags:
            # an anchor without direct text (e.g. wrapping only an image) has
            # None as text: treat it as empty instead of letting match() fail
            if sre.match(link_text or u'') and self._href_matches(attrs, url):
                return True
        return False