145 """tidy html soup by allowing some element tags and return the result |
145 """tidy html soup by allowing some element tags and return the result |
146 """ |
146 """ |
147 # remove spurious </body> and </html> tags, then normalize line breaks |
147 # remove spurious </body> and </html> tags, then normalize line breaks |
148 # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1) |
148 # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1) |
149 data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines())) |
149 data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines())) |
150 # XXX lxml 1.1 support still needed ? |
|
151 xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data)) |
150 xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data)) |
152 # NOTE: lxml 1.1 (etch platforms) doesn't recognize |
151 # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt) |
153 # the encoding=unicode parameter (lxml 2.0 does), this is |
152 # tried I got weird results (lxml 2.2.8) |
154 # why we specify an encoding and re-decode to unicode later |
|
155 body = etree.tostring(xmltree[0], encoding=encoding) |
153 body = etree.tostring(xmltree[0], encoding=encoding) |
156 # remove <body> and </body> and decode to unicode |
154 # remove <body> and </body> and decode to unicode |
157 snippet = body[6:-7].decode(encoding) |
155 snippet = body[6:-7].decode(encoding) |
158 # take care of bad xhtml (for instance starting with </div>) which |
156 # take care of bad xhtml (for instance starting with </div>) which |
159 # may mess with the <div> we added above. Only remove it if it's |
157 # may mess with the <div> we added above. Only remove it if it's |