update note stable
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Fri, 05 Nov 2010 17:00:41 +0100
branch stable
changeset 6688 51ddb4842c56
parent 6687 0b1f5c14646e
child 6689 b00f31b3b045
child 6690 7d68948015ba
update note
uilib.py
--- a/uilib.py	Fri Nov 05 15:08:24 2010 +0100
+++ b/uilib.py	Fri Nov 05 17:00:41 2010 +0100
@@ -147,11 +147,9 @@
         # remove spurious </body> and </html> tags, then normalize line break
         # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
         data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
-        # XXX lxml 1.1 support still needed ?
         xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
-        # NOTE: lxml 1.1 (etch platforms) doesn't recognize
-        #       the encoding=unicode parameter (lxml 2.0 does), this is
-        #       why we specify an encoding and re-decode to unicode later
+        # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
+        # tried I got weird results (lxml 2.2.8)
         body = etree.tostring(xmltree[0], encoding=encoding)
         # remove <body> and </body> and decode to unicode
         snippet = body[6:-7].decode(encoding)