uilib.py
branch stable
changeset 6688 51ddb4842c56
parent 6685 eeedb3575d25
child 7058 ea22892e82d4
equal deleted inserted replaced
6687:0b1f5c14646e 6688:51ddb4842c56
   145         """tidy html soup by allowing some element tags and return the result
   145         """tidy html soup by allowing some element tags and return the result
   146         """
   146         """
   147         # remove spurious </body> and </html> tags, then normalize line breaks
   147         # remove spurious </body> and </html> tags, then normalize line breaks
   148         # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
   148         # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
   149         data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
   149         data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
   150         # XXX lxml 1.1 support still needed ?
       
   151         xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
   150         xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
   152         # NOTE: lxml 1.1 (etch platforms) doesn't recognize
   151         # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
   153         #       the encoding=unicode parameter (lxml 2.0 does), this is
   152         # tried I got weird results (lxml 2.2.8)
   154         #       why we specify an encoding and re-decode to unicode later
       
   155         body = etree.tostring(xmltree[0], encoding=encoding)
   153         body = etree.tostring(xmltree[0], encoding=encoding)
   156         # remove <body> and </body> and decode to unicode
   154         # remove <body> and </body> and decode to unicode
   157         snippet = body[6:-7].decode(encoding)
   155         snippet = body[6:-7].decode(encoding)
   158         # take care of bad xhtml (for instance starting with </div>) which
   156         # take care of bad xhtml (for instance starting with </div>) which
   159         # may mess with the <div> we added below. Only remove it if it's
   157         # may mess with the <div> we added below. Only remove it if it's