17 # with CubicWeb. If not, see <http://www.gnu.org/licenses/>. |
17 # with CubicWeb. If not, see <http://www.gnu.org/licenses/>. |
18 """defines a validating HTML parser used in web application tests""" |
18 """defines a validating HTML parser used in web application tests""" |
19 |
19 |
20 import re |
20 import re |
21 import sys |
21 import sys |
|
22 from xml import sax |
|
23 from cStringIO import StringIO |
22 |
24 |
23 from lxml import etree |
25 from lxml import etree |
24 |
26 |
25 from logilab.common.deprecation import class_deprecated |
27 from logilab.common.deprecation import class_deprecated, class_renamed |
26 |
28 |
27 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE |
29 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE |
28 |
30 |
29 STRICT_DOCTYPE = str(STRICT_DOCTYPE) |
31 STRICT_DOCTYPE = str(STRICT_DOCTYPE) |
30 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE) |
32 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE) |
31 |
33 |
32 ERR_COUNT = 0 |
34 ERR_COUNT = 0 |
33 |
35 |
|
36 _REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S) |
|
37 def _remove_script_tags(data): |
|
38 """Remove the script (usually javascript) tags to help the lxml |
|
39 XMLParser / HTMLParser do their job. Without that, they choke on |
|
40 tags embedded in JS strings. |
|
41 """ |
|
42 # Notice we may want to use lxml cleaner, but it's far too intrusive: |
|
43 # |
|
44 # cleaner = Cleaner(scripts=True, |
|
45 # javascript=False, |
|
46 # comments=False, |
|
47 # style=False, |
|
48 # links=False, |
|
49 # meta=False, |
|
50 # page_structure=False, |
|
51 # processing_instructions=False, |
|
52 # embedded=False, |
|
53 # frames=False, |
|
54 # forms=False, |
|
55 # annoying_tags=False, |
|
56 # remove_tags=(), |
|
57 # remove_unknown_tags=False, |
|
58 # safe_attrs_only=False, |
|
59 # add_nofollow=False) |
|
60 # >>> cleaner.clean_html('<body></body>') |
|
61 # '<span></span>' |
|
62 # >>> cleaner.clean_html('<!DOCTYPE html><body></body>') |
|
63 # '<html><body></body></html>' |
|
64 # >>> cleaner.clean_html('<body><div/></body>') |
|
65 # '<div></div>' |
|
66 # >>> cleaner.clean_html('<html><body><div/><br></body><html>') |
|
67 # '<html><body><div></div><br></body></html>' |
|
68 # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>') |
|
69 # '<html><body><div></div><br><span></span></body></html>' |
|
70 # |
|
71 # using that, we'll miss most actual validation error we want to |
|
72 # catch. For now, use dumb regexp |
|
73 return _REM_SCRIPT_RGX.sub('', data) |
|
74 |
|
75 |
class Validator(object):
    """Base validator API.

    Subclasses assign `parser` (handed to `etree.fromstring`) and may
    override `preprocess_data` and/or `_parse`.
    """
    # parser given to etree.fromstring(); None here, subclasses set their own
    parser = None

    def parse_string(self, source):
        """Parse `source` and return a `PageInfo` built from it.

        The preprocessed data is what gets parsed, while the `PageInfo`
        receives the original, unprocessed `source`.

        An `AssertionError` is raised by `_parse` when the document can not
        be parsed.
        """
        # named `tree` so the local does not shadow the lxml `etree` module
        tree = self._parse(self.preprocess_data(source))
        return PageInfo(source, tree)

    def preprocess_data(self, data):
        """Hook called on the data before parsing; returns it unchanged here."""
        return data

    def _parse(self, pdata):
        """Parse preprocessed data `pdata` with `self.parser`.

        Return the value of `etree.fromstring`. On `etree.XMLSyntaxError`,
        raise an `AssertionError` carrying the original error's `position`.
        """
        try:
            return etree.fromstring(pdata, self.parser)
        except etree.XMLSyntaxError as exc:
            # NOTE(review): this helper is never called from here; presumably
            # kept as a debugging aid to dump the offending data -- confirm.
            # It must reference `pdata` (there is no `data` name in scope).
            def save_in(fname=''):
                with open(fname, 'w') as stream:
                    stream.write(pdata)
            new_exc = AssertionError(u'invalid document: %s' % exc)
            new_exc.position = exc.position
            raise new_exc
|
49 |
96 |
50 |
97 |
51 class DTDValidator(Validator): |
98 class DTDValidator(Validator): |
52 def __init__(self): |
99 def __init__(self): |
53 Validator.__init__(self) |
100 Validator.__init__(self) |
58 """used to fix potential blockquote mess generated by docutils""" |
105 """used to fix potential blockquote mess generated by docutils""" |
59 if STRICT_DOCTYPE not in data: |
106 if STRICT_DOCTYPE not in data: |
60 return data |
107 return data |
61 # parse using transitional DTD |
108 # parse using transitional DTD |
62 data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE) |
109 data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE) |
63 tree = etree.fromstring(data, self.parser) |
110 tree = self._parse(data) |
64 namespace = tree.nsmap.get(None) |
111 namespace = tree.nsmap.get(None) |
65 # this is the list of authorized child tags for <blockquote> nodes |
112 # this is the list of authorized child tags for <blockquote> nodes |
66 expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \ |
113 expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \ |
67 'fieldset table form noscript ins del script'.split() |
114 'fieldset table form noscript ins del script'.split() |
68 if namespace: |
115 if namespace: |
77 data = etree.tostring(tree) |
124 data = etree.tostring(tree) |
78 return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % ( |
125 return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % ( |
79 STRICT_DOCTYPE, data) |
126 STRICT_DOCTYPE, data) |
80 |
127 |
81 |
128 |
class XMLValidator(Validator):
    """XML validator: checks that the XML is well-formed and that the XML
    namespaces it uses are defined.
    """

    def __init__(self):
        super(XMLValidator, self).__init__()
        # plain (non-recovering) lxml XML parser
        self.parser = etree.XMLParser()


# backward compatibility alias for the former class name
SaxOnlyValidator = class_renamed(
    'SaxOnlyValidator', XMLValidator,
    '[3.17] you should use the '
    'XMLValidator class instead')
|
140 |
|
141 |
|
class XMLSyntaxValidator(Validator):
    """XML syntax validator: checks that the XML is well-formed.

    Script elements are removed before parsing. Well-formedness is checked
    with a SAX parser, then the tree is built with a recovering lxml parser.
    """

    class MySaxErrorHandler(sax.ErrorHandler):
        """override default handler to avoid choking because of unknown entity"""
        def fatalError(self, exception):
            # XXX check entity in htmlentitydefs
            # every fatal error but "undefined entity" is propagated
            if not str(exception).endswith('undefined entity'):
                raise exception

    # SAX parser shared by all instances (created once, at class definition)
    _parser = sax.make_parser()
    _parser.setContentHandler(sax.handler.ContentHandler())
    _parser.setErrorHandler(MySaxErrorHandler())

    def __init__(self):
        super(XMLSyntaxValidator, self).__init__()
        # XMLParser() wants xml namespaces defined
        # XMLParser(recover=True) will accept almost anything
        #
        # -> use the latter, but _parse first checks xml well-formedness
        #    using a dumb SAX parser
        self.parser = etree.XMLParser(recover=True)

    def preprocess_data(self, data):
        """Return `data` stripped from its script elements."""
        return _remove_script_tags(data)

    def _parse(self, data):
        """Check `data` is well-formed, then parse it with the lxml parser.

        Raise an `AssertionError` whose `position` attribute is a
        `(line, column)` tuple when the SAX parser reports a parse error.
        """
        inpsrc = sax.InputSource()
        inpsrc.setByteStream(StringIO(data))
        try:
            self._parser.parse(inpsrc)
        except sax.SAXParseException as exc:
            new_exc = AssertionError(u'invalid document: %s' % exc)
            # use the public locator accessors rather than private attributes
            new_exc.position = (exc.getLineNumber(), exc.getColumnNumber())
            raise new_exc
        return super(XMLSyntaxValidator, self)._parse(data)
|
177 |
|
178 |
|
179 class XMLDemotingValidator(XMLValidator): |
90 """ some views produce html instead of xhtml, using demote_to_html |
180 """ some views produce html instead of xhtml, using demote_to_html |
91 |
181 |
92 this is typically related to the use of external dependencies |
182 this is typically related to the use of external dependencies |
93 which do not produce valid xhtml (google maps, ...) |
183 which do not produce valid xhtml (google maps, ...) |
94 """ |
184 """ |
95 __metaclass__ = class_deprecated |
185 __metaclass__ = class_deprecated |
|
186 __deprecation_warning__ = '[3.10] this is now handled in testlib.py' |
96 |
187 |
97 def preprocess_data(self, data): |
188 def preprocess_data(self, data): |
98 if data.startswith('<?xml'): |
189 if data.startswith('<?xml'): |
99 self.parser = etree.XMLParser() |
190 self.parser = etree.XMLParser() |
100 else: |
191 else: |
104 |
195 |
class HTMLValidator(Validator):
    """HTML validator relying on a non-recovering lxml HTML parser."""

    def __init__(self):
        super(HTMLValidator, self).__init__()
        self.parser = etree.HTMLParser(recover=False)

    def preprocess_data(self, data):
        """Return `data` stripped from its script elements."""
        stripped = _remove_script_tags(data)
        return stripped
111 |
204 |
112 |
205 |
113 class PageInfo(object): |
206 class PageInfo(object): |
114 """holds various informations on the view's output""" |
207 """holds various informations on the view's output""" |
115 def __init__(self, source, root): |
208 def __init__(self, source, root): |
116 self.source = source |
209 self.source = source |
117 self.etree = root |
210 self.etree = root |
118 self.source = source |
|
119 self.raw_text = u''.join(root.xpath('//text()')) |
211 self.raw_text = u''.join(root.xpath('//text()')) |
120 self.namespace = self.etree.nsmap |
212 self.namespace = self.etree.nsmap |
121 self.default_ns = self.namespace.get(None) |
213 self.default_ns = self.namespace.get(None) |
122 self.a_tags = self.find_tag('a') |
214 self.a_tags = self.find_tag('a') |
123 self.h1_tags = self.find_tag('h1') |
215 self.h1_tags = self.find_tag('h1') |