17 # with CubicWeb. If not, see <http://www.gnu.org/licenses/>. |
17 # with CubicWeb. If not, see <http://www.gnu.org/licenses/>. |
18 """defines a validating HTML parser used in web application tests""" |
18 """defines a validating HTML parser used in web application tests""" |
19 |
19 |
20 import re |
20 import re |
21 import sys |
21 import sys |
|
22 from xml import sax |
|
23 from cStringIO import StringIO |
22 |
24 |
23 from lxml import etree |
25 from lxml import etree |
24 |
26 |
25 from logilab.common.deprecation import class_deprecated, class_renamed |
27 from logilab.common.deprecation import class_deprecated, class_renamed |
26 |
28 |
29 STRICT_DOCTYPE = str(STRICT_DOCTYPE) |
31 STRICT_DOCTYPE = str(STRICT_DOCTYPE) |
30 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE) |
32 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE) |
31 |
33 |
32 ERR_COUNT = 0 |
34 ERR_COUNT = 0 |
33 |
35 |
|
36 _REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S) |
|
37 def _remove_script_tags(data): |
|
38 """Remove the script (usually javascript) tags to help the lxml |
|
39 XMLParser / HTMLParser do their job. Without that, they choke on |
|
40 tags embedded in JS strings. |
|
41 """ |
|
42 # Notice we may want to use lxml cleaner, but it's far too intrusive: |
|
43 # |
|
44 # cleaner = Cleaner(scripts=True, |
|
45 # javascript=False, |
|
46 # comments=False, |
|
47 # style=False, |
|
48 # links=False, |
|
49 # meta=False, |
|
50 # page_structure=False, |
|
51 # processing_instructions=False, |
|
52 # embedded=False, |
|
53 # frames=False, |
|
54 # forms=False, |
|
55 # annoying_tags=False, |
|
56 # remove_tags=(), |
|
57 # remove_unknown_tags=False, |
|
58 # safe_attrs_only=False, |
|
59 # add_nofollow=False) |
|
60 # >>> cleaner.clean_html('<body></body>') |
|
61 # '<span></span>' |
|
62 # >>> cleaner.clean_html('<!DOCTYPE html><body></body>') |
|
63 # '<html><body></body></html>' |
|
64 # >>> cleaner.clean_html('<body><div/></body>') |
|
65 # '<div></div>' |
|
66 # >>> cleaner.clean_html('<html><body><div/><br></body><html>') |
|
67 # '<html><body><div></div><br></body></html>' |
|
68 # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>') |
|
69 # '<html><body><div></div><br><span></span></body></html>' |
|
70 # |
|
71 # using that, we'll miss most actual validation error we want to |
|
72 # catch. For now, use dumb regexp |
|
73 return _REM_SCRIPT_RGX.sub('', data) |
|
74 |
|
75 |
# NOTE(review): the span below is diff/merge residue -- each line appears
# twice (old and new revision) with an embedded line number and a trailing
# '|' marker.  It shows the base `Validator` class (`parser` attribute +
# `parse_string`, which preprocesses the source, parses it and wraps the
# result in a PageInfo), but the methods between `parse_string` and the
# dangling `return '<?xml ...'` fragment are elided (line numbers jump
# 39 -> 82 / 82 -> 125); that fragment's `def` line is missing, so it
# presumably belongs to a DOCTYPE-prepending preprocess_data of another
# validator -- confirm against the full file before reconstructing.
34 class Validator(object): |
76 class Validator(object): |
|
77     """ base validator API """ |
35     parser = None |
78     parser = None |
36  |
79  |
37     def parse_string(self, source): |
80     def parse_string(self, source): |
38         etree = self._parse(self.preprocess_data(source)) |
81         etree = self._parse(self.preprocess_data(source)) |
39         return PageInfo(source, etree) |
82         return PageInfo(source, etree) |
82     return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % ( |
125         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % ( |
83         STRICT_DOCTYPE, data) |
126         STRICT_DOCTYPE, data) |
84 |
127 |
85 |
128 |
class XMLValidator(Validator):
    """XML validator, checks that XML is well-formed and used XMLNS are defined"""

    def __init__(self):
        Validator.__init__(self)
        # default (non-recovering) XMLParser: malformed XML or undefined
        # namespaces make parsing fail
        self.parser = etree.XMLParser()


# backward-compatibility alias, deprecated since 3.17
SaxOnlyValidator = class_renamed('SaxOnlyValidator',
                                 XMLValidator,
                                 '[3.17] you should use the '
                                 'XMLValidator class instead')
|
140 |
|
141 |
|
class XMLSyntaxValidator(Validator):
    """XML syntax validator, check XML is well-formed"""

    class MySaxErrorHandler(sax.ErrorHandler):
        """override default handler to avoid choking because of unknown entity"""
        def fatalError(self, exception):
            # XXX check entity in htmlentitydefs
            if not str(exception).endswith('undefined entity'):
                raise exception

    # class-level SAX parser, shared by all instances and used only to
    # check well-formedness in _parse below
    _parser = sax.make_parser()
    _parser.setContentHandler(sax.handler.ContentHandler())
    _parser.setErrorHandler(MySaxErrorHandler())

    def __init__(self):
        super(XMLSyntaxValidator, self).__init__()
        # XMLParser() wants xml namespaces defined
        # XMLParser(recover=True) will accept almost anything
        #
        # -> use the later but preprocess will check xml well-formness using a
        #    dumb SAX parser
        self.parser = etree.XMLParser(recover=True)

    def preprocess_data(self, data):
        """strip <script> tags whose content would choke the parsers"""
        return _remove_script_tags(data)

    def _parse(self, data):
        """check well-formedness with the SAX parser, then delegate the
        actual tree building to the (recovering) lxml parser"""
        inpsrc = sax.InputSource()
        inpsrc.setByteStream(StringIO(data))
        try:
            self._parser.parse(inpsrc)
        except sax.SAXParseException as exc:  # 'as' form: Python >= 2.6/3 compatible
            new_exc = AssertionError(u'invalid document: %s' % exc)
            # expose (line, column) of the syntax error to callers
            new_exc.position = (exc._linenum, exc._colnum)
            raise new_exc
        return super(XMLSyntaxValidator, self)._parse(data)
|
177 |
97 |
178 |
# NOTE(review): diff/merge residue -- duplicated old/new revision lines with
# embedded line numbers and trailing '|' markers.  The middle of this class
# is elided (line numbers jump 101 -> 110): the end of the docstring and the
# `def preprocess_data(...)` line with its `if` branch are missing, leaving
# only the dangling `else:` / `return data` tail.  Presumably the missing
# branch picks etree.XMLParser for xml content -- confirm against the full
# file; not reconstructed here.
98 class XMLDemotingValidator(XMLValidator): |
179 class XMLDemotingValidator(XMLValidator): |
99     """ some views produce html instead of xhtml, using demote_to_html |
180     """ some views produce html instead of xhtml, using demote_to_html |
100  |
181  |
101     this is typically related to the use of external dependencies |
182     this is typically related to the use of external dependencies |
110     else: |
191     else: |
111         self.parser = etree.HTMLParser() |
192         self.parser = etree.HTMLParser() |
112     return data |
193     return data |
113 |
194 |
114 |
195 |
class HTMLValidator(Validator):
    """HTML validator based on lxml's strict (non-recovering) HTMLParser"""

    def __init__(self):
        Validator.__init__(self)
        # recover=False: malformed HTML raises instead of being silently fixed
        self.parser = etree.HTMLParser(recover=False)

    def preprocess_data(self, data):
        """strip <script> tags to help the HTMLParser do its job: it chokes
        on tags embedded in JS strings (see _remove_script_tags)"""
        return _remove_script_tags(data)
|
159 |
204 |
160 |
205 |
161 class PageInfo(object): |
206 class PageInfo(object): |
162 """holds various informations on the view's output""" |
207 """holds various informations on the view's output""" |
163 def __init__(self, source, root): |
208 def __init__(self, source, root): |