1 # copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved. |
|
2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr |
|
3 # |
|
4 # This file is part of CubicWeb. |
|
5 # |
|
6 # CubicWeb is free software: you can redistribute it and/or modify it under the |
|
7 # terms of the GNU Lesser General Public License as published by the Free |
|
8 # Software Foundation, either version 2.1 of the License, or (at your option) |
|
9 # any later version. |
|
10 # |
|
11 # CubicWeb is distributed in the hope that it will be useful, but WITHOUT |
|
12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
|
13 # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more |
|
14 # details. |
|
15 # |
|
16 # You should have received a copy of the GNU Lesser General Public License along |
|
17 # with CubicWeb. If not, see <http://www.gnu.org/licenses/>. |
|
18 """defines a validating HTML parser used in web application tests""" |
|
19 |
|
20 import re |
|
21 import sys |
|
22 from xml import sax |
|
23 from io import BytesIO |
|
24 |
|
25 from lxml import etree |
|
26 |
|
27 from logilab.common.deprecation import class_deprecated, class_renamed |
|
28 |
|
29 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE |
|
30 |
|
# make sure the doctypes coming from cubicweb.view are plain str, so they
# can be searched for / substituted in the (possibly non-unicode) output
STRICT_DOCTYPE = str(STRICT_DOCTYPE)
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)

# NOTE(review): never updated within this module; presumably maintained by
# callers importing it -- TODO confirm
ERR_COUNT = 0
|
35 |
|
36 _REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M|re.I|re.S) |
|
37 def _remove_script_tags(data): |
|
38 """Remove the script (usually javascript) tags to help the lxml |
|
39 XMLParser / HTMLParser do their job. Without that, they choke on |
|
40 tags embedded in JS strings. |
|
41 """ |
|
42 # Notice we may want to use lxml cleaner, but it's far too intrusive: |
|
43 # |
|
44 # cleaner = Cleaner(scripts=True, |
|
45 # javascript=False, |
|
46 # comments=False, |
|
47 # style=False, |
|
48 # links=False, |
|
49 # meta=False, |
|
50 # page_structure=False, |
|
51 # processing_instructions=False, |
|
52 # embedded=False, |
|
53 # frames=False, |
|
54 # forms=False, |
|
55 # annoying_tags=False, |
|
56 # remove_tags=(), |
|
57 # remove_unknown_tags=False, |
|
58 # safe_attrs_only=False, |
|
59 # add_nofollow=False) |
|
60 # >>> cleaner.clean_html('<body></body>') |
|
61 # '<span></span>' |
|
62 # >>> cleaner.clean_html('<!DOCTYPE html><body></body>') |
|
63 # '<html><body></body></html>' |
|
64 # >>> cleaner.clean_html('<body><div/></body>') |
|
65 # '<div></div>' |
|
66 # >>> cleaner.clean_html('<html><body><div/><br></body><html>') |
|
67 # '<html><body><div></div><br></body></html>' |
|
68 # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>') |
|
69 # '<html><body><div></div><br><span></span></body></html>' |
|
70 # |
|
71 # using that, we'll miss most actual validation error we want to |
|
72 # catch. For now, use dumb regexp |
|
73 return _REM_SCRIPT_RGX.sub(b'', data) |
|
74 |
|
75 |
|
class Validator(object):
    """base validator API: subclasses set `parser` and may override
    :meth:`preprocess_data`"""
    # lxml parser instance, provided by subclasses
    parser = None

    def parse_string(self, source):
        """validate `source` and return a :class:`PageInfo` wrapping both
        the raw source and the parsed tree"""
        tree = self._parse(self.preprocess_data(source))
        return PageInfo(source, tree)

    def preprocess_data(self, data):
        """hook to massage `data` before parsing; default is a no-op"""
        return data

    def _parse(self, pdata):
        """parse preprocessed data, turning syntax errors into
        AssertionError (with a `position` attribute) for test frameworks"""
        try:
            return etree.fromstring(pdata, self.parser)
        except etree.XMLSyntaxError as exc:
            new_exc = AssertionError(u'invalid document: %s' % exc)
            new_exc.position = exc.position
            raise new_exc
|
94 |
|
95 |
|
class DTDValidator(Validator):
    """validator performing actual DTD validation through lxml"""

    def __init__(self):
        Validator.__init__(self)
        # XXX understand what's happening under windows
        # (DTD validation is simply disabled there)
        self.parser = etree.XMLParser(dtd_validation=sys.platform != 'win32')

    def preprocess_data(self, data):
        """used to fix potential blockquote mess generated by docutils"""
        # NOTE(review): STRICT_DOCTYPE is a str while `data` may be bytes;
        # the membership test / replace below assume matching types -- confirm
        if STRICT_DOCTYPE not in data:
            return data
        # parse using transitional DTD, which tolerates the docutils output
        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
        tree = self._parse(data)
        # default (unprefixed) namespace, if the document declares one
        namespace = tree.nsmap.get(None)
        # this is the list of authorized child tags for <blockquote> nodes
        # NOTE(review): `expected` is computed but never used below --
        # presumably a leftover from a finer-grained cleanup attempt
        expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
            'fieldset table form noscript ins del script'.split()
        if namespace:
            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
            expected = ['{%s}%s' % (namespace, tag) for tag in expected]
        else:
            blockquotes = tree.findall('.//blockquote')
        # quick and dirty approach: remove all blockquotes
        for blockquote in blockquotes:
            parent = blockquote.getparent()
            parent.remove(blockquote)
        data = etree.tostring(tree)
        # re-emit with the strict doctype so the caller validates strictly
        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
            STRICT_DOCTYPE, data)
|
125 |
|
126 |
|
class XMLValidator(Validator):
    """XML validator, checks that XML is well-formed and used XMLNS are defined"""

    def __init__(self):
        super(XMLValidator, self).__init__()
        # default lxml parser: well-formedness + namespace checks only
        self.parser = etree.XMLParser()
|
133 |
|
# backward-compatibility alias: using it emits a deprecation warning
# directing users to XMLValidator (class renamed in 3.17)
SaxOnlyValidator = class_renamed('SaxOnlyValidator',
                                 XMLValidator,
                                 '[3.17] you should use the '
                                 'XMLValidator class instead')
|
138 |
|
139 |
|
class XMLSyntaxValidator(Validator):
    """XML syntax validator, check XML is well-formed"""

    class MySaxErrorHandler(sax.ErrorHandler):
        """override default handler to avoid choking because of unknown entity"""
        def fatalError(self, exception):
            # XXX check entity in htmlentitydefs
            # swallow only "undefined entity" errors; anything else is fatal
            if not str(exception).endswith('undefined entity'):
                raise exception
    # class-level (shared across instances) SAX parser used solely for the
    # well-formedness pre-check in _parse
    _parser = sax.make_parser()
    _parser.setContentHandler(sax.handler.ContentHandler())
    _parser.setErrorHandler(MySaxErrorHandler())

    def __init__(self):
        super(XMLSyntaxValidator, self).__init__()
        # XMLParser() wants xml namespaces defined
        # XMLParser(recover=True) will accept almost anything
        #
        # -> use the later but preprocess will check xml well-formness using a
        # dumb SAX parser
        self.parser = etree.XMLParser(recover=True)

    def preprocess_data(self, data):
        # strip scripts: markup-like text in JS strings confuses the parsers
        return _remove_script_tags(data)

    def _parse(self, data):
        # first pass: strict well-formedness check with the SAX parser
        inpsrc = sax.InputSource()
        inpsrc.setByteStream(BytesIO(data))
        try:
            self._parser.parse(inpsrc)
        except sax.SAXParseException as exc:
            new_exc = AssertionError(u'invalid document: %s' % exc)
            # NOTE(review): relies on SAXParseException's private
            # _linenum/_colnum attributes -- confirm across Python versions
            new_exc.position = (exc._linenum, exc._colnum)
            raise new_exc
        # second pass: build the tree with the recovering lxml parser
        return super(XMLSyntaxValidator, self)._parse(data)
|
175 |
|
176 |
|
class HTMLValidator(Validator):
    """HTML validator relying on lxml's non-recovering HTML parser"""

    def __init__(self):
        super(HTMLValidator, self).__init__()
        self.parser = etree.HTMLParser(recover=False)

    def preprocess_data(self, data):
        """strip <script> sections, which would otherwise confuse the parser"""
        return _remove_script_tags(data)
|
185 |
|
186 |
|
class PageInfo(object):
    """holds various information on the view's output (raw source, parsed
    tree, and pre-extracted link/title/input elements)"""

    def __init__(self, source, root):
        self.source = source
        self.etree = root
        # concatenation of every text node in the document
        self.raw_text = u''.join(root.xpath('//text()'))
        self.namespace = self.etree.nsmap
        self.default_ns = self.namespace.get(None)
        self.a_tags = self.find_tag('a')
        # h1_tags .. h4_tags attributes, also collected in title_tags
        self.title_tags = []
        for level in (1, 2, 3, 4):
            tags = self.find_tag('h%s' % level)
            setattr(self, 'h%s_tags' % level, tags)
            self.title_tags.append(tags)
        self.input_tags = self.find_tag('input')

    def _iterstr(self, tag):
        """build an ElementPath expression for `tag`, qualified with the
        document's default namespace when there is one"""
        ns = self.default_ns
        return ".//%s" % tag if ns is None else ".//{%s}%s" % (ns, tag)

    def matching_nodes(self, tag, **attrs):
        """yield `tag` elements whose attributes all match `attrs`"""
        for node in self.etree.iterfind(self._iterstr(tag)):
            node_attrs = node.attrib
            if all(name in node_attrs and node_attrs[name] == value
                   for name, value in attrs.items()):
                yield node

    def has_tag(self, tag, nboccurs=1, **attrs):
        """return True if a `tag` element with matching `attrs` appears at
        least `nboccurs` times in the page (any number of times if None)
        """
        remaining = nboccurs
        for _ in self.matching_nodes(tag, **attrs):
            if remaining is None:
                return True
            if not remaining:  # more occurrences than requested
                return False
            remaining -= 1
            if not remaining:  # requested number of occurrences reached
                return True
        return False  # no matching tag/attrs

    def find_tag(self, tag, gettext=True):
        """return a list describing every `tag` element: text content by
        default, or (text, attributes) pairs for 'a'/'input' tags and when
        `gettext` is false
        """
        elements = self.etree.iterfind(self._iterstr(tag))
        if gettext and tag not in ('a', 'input'):
            return [u''.join(elt.xpath('.//text()')) for elt in elements]
        return [(elt.text, elt.attrib) for elt in elements]

    def appears(self, text):
        """return True if `text` appears in the page's text content"""
        return text in self.raw_text

    def __contains__(self, text):
        # membership against the raw (unparsed) source
        return text in self.source

    def has_title(self, text, level=None):
        """return True if `text` is the content of some <h?> title

        :param level: the title's level (1 for h1, 2 for h2, etc.)
        """
        if level is not None:
            return text in self.title_tags[level - 1]
        return any(text in hlist for hlist in self.title_tags)

    def has_title_regexp(self, pattern, level=None):
        """return True if some <h?> title matches `pattern`"""
        match = re.compile(pattern).match
        if level is None:
            titles = [title for hlist in self.title_tags for title in hlist]
        else:
            titles = self.title_tags[level - 1]
        return any(match(title) for title in titles)

    def has_link(self, text, url=None):
        """return True if <a href=url>text</a> was found in the page"""
        for link_text, attrs in self.a_tags:
            if link_text == text:
                if url is None:
                    return True
                if attrs.get('href') == url:
                    return True
        return False

    def has_link_regexp(self, pattern, url=None):
        """return True if <a href=url>pattern</a> was found in the page"""
        match = re.compile(pattern).match
        for link_text, attrs in self.a_tags:
            if match(link_text):
                if url is None:
                    return True
                if attrs.get('href') == url:
                    return True
        return False
|
309 |
|
# map a validation keyword (as used by test configurations) to the
# validator class to instantiate; None means "no validation"
VALMAP = {None: None,
          'dtd': DTDValidator,
          'xml': XMLValidator,
          'html': HTMLValidator,
          }
|