106 else: |
106 else: |
107 self.parser = etree.HTMLParser() |
107 self.parser = etree.HTMLParser() |
108 return data |
108 return data |
109 |
109 |
110 |
110 |
|
# Matches a whole <script>...</script> element so it can be stripped before
# the data is handed to the HTML parser.
#   * ``\b`` after the tag name keeps tags that merely start with "script"
#     (e.g. a custom ``<scripted>`` element) from opening a match;
#   * ``\s*`` before the final ``>`` accepts the legal ``</script >`` form;
#   * ``.*?`` is non-greedy so markup between two scripts is preserved;
#   * DOTALL lets the script body span several lines, IGNORECASE handles
#     ``<SCRIPT>``.
REM_SCRIPT_RGX = re.compile(r"<script\b[^>]*>.*?</script\s*>",
                            re.U | re.I | re.S)
|
112 |
class HTMLValidator(Validator):
    """Validator that parses page output with lxml's HTML parser."""

    def __init__(self):
        Validator.__init__(self)
        # recover=False: ask lxml not to repair broken markup on its own
        # (see the lxml HTMLParser documentation for the `recover` option).
        self.parser = etree.HTMLParser(recover=False)

    def preprocess_data(self, data):
        """Return `data` with every match of REM_SCRIPT_RGX removed.

        The javascript tags are wiped to help the HTMLParser do its job:
        without that, it chokes on tags embedded in JS strings.
        """
        # The lxml Cleaner could be used instead, but it is far too
        # intrusive:
        #
        #  cleaner = Cleaner(scripts=True,
        #                    javascript=False,
        #                    comments=False,
        #                    style=False,
        #                    links=False,
        #                    meta=False,
        #                    page_structure=False,
        #                    processing_instructions=False,
        #                    embedded=False,
        #                    frames=False,
        #                    forms=False,
        #                    annoying_tags=False,
        #                    remove_tags=(),
        #                    remove_unknown_tags=False,
        #                    safe_attrs_only=False,
        #                    add_nofollow=False)
        #  >>> cleaner.clean_html('<body></body>')
        #  '<span></span>'
        #  >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
        #  '<html><body></body></html>'
        #  >>> cleaner.clean_html('<body><div/></body>')
        #  '<div></div>'
        #  >>> cleaner.clean_html('<html><body><div/><br></body><html>')
        #  '<html><body><div></div><br></body></html>'
        #  >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
        #  '<html><body><div></div><br><span></span></body></html>'
        #
        # With the Cleaner, most of the actual validation errors we want to
        # catch would be missed. For now, rely on a dumb regexp.
        without_scripts = REM_SCRIPT_RGX.sub('', data)
        return without_scripts
117 |
155 |
118 |
156 |
119 class PageInfo(object): |
157 class PageInfo(object): |
120 """holds various informations on the view's output""" |
158 """holds various informations on the view's output""" |
121 def __init__(self, validator, source): |
159 def __init__(self, validator, source): |