uilib.py
branch: stable
changeset: 7879:9aae456abab5
parent: 7815:2a164a9cf81c
child: 7913:d0c6a7993cec
--- a/uilib.py	Tue Sep 27 16:04:30 2011 +0200
+++ b/uilib.py	Wed Sep 28 09:27:42 2011 +0200
@@ -161,94 +161,84 @@
 
 REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
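For context (an illustration, not part of the changeset): this regex only strips stray closing root tags, leaving any opening <body>/<html> alone. A quick check:

import re

REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)

print(REM_ROOT_HTML_TAGS.sub('', u'<p>hi</p></body></html>'))
# -> u'<p>hi</p>'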
 
-try:
-    from lxml import etree, html
-    from lxml.html import clean, defs
+from lxml import etree, html
+from lxml.html import clean, defs
 
-    ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
-                    defs.phrase_tags | defs.font_style_tags |
-                    set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
-                    )
+ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
+                defs.phrase_tags | defs.font_style_tags |
+                set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
+                )
 
-    CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
-                            style=True, safe_attrs_only=True,
-                            add_nofollow=False,
-                            )
+CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
+                        style=True, safe_attrs_only=True,
+                        add_nofollow=False,
+                        )
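A sketch of what this Cleaner configuration does in practice (assuming lxml is installed; the exact serialization may vary between lxml versions):

from lxml.html import clean, defs

ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
                defs.phrase_tags | defs.font_style_tags |
                set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup')))
CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
                        style=True, safe_attrs_only=True, add_nofollow=False)

# <script> is removed together with its content (Cleaner default), the
# unlisted <blink> tag is dropped but its text is kept (allow_tags with
# remove_unknown_tags=False), and the event handler vanishes
# (safe_attrs_only=True)
print(CLEANER.clean_html(u'<div><script>evil()</script>'
                         u'<blink onclick="x()">hi</blink></div>'))
# -> roughly u'<div>hi</div>'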
 
-    def soup2xhtml(data, encoding):
-        """tidy html soup by allowing some element tags and return the result
-        """
-        # remove spurious </body> and </html> tags, then normalize line breaks
-        # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
-        data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
-        xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
-        # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
-        # tried I got weird results (lxml 2.2.8)
-        body = etree.tostring(xmltree[0], encoding=encoding)
-        # remove <body> and </body> and decode to unicode
-        snippet = body[6:-7].decode(encoding)
-        # take care of bad xhtml (for instance starting with </div>) which
-        # may mess with the <div> we added above. Only remove it if it's
-        # still there...
-        if snippet.startswith('<div>') and snippet.endswith('</div>'):
-            snippet = snippet[5:-6]
-        return snippet
+def soup2xhtml(data, encoding):
+    """tidy html soup by allowing some element tags and return the result
+    """
+    # remove spurious </body> and </html> tags, then normalize line breaks
+    # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
+    data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
+    xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
+    # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
+    # tried I got weird results (lxml 2.2.8)
+    body = etree.tostring(xmltree[0], encoding=encoding)
+    # remove <body> and </body> and decode to unicode
+    snippet = body[6:-7].decode(encoding)
+    # take care of bad xhtml (for instance starting with </div>) which
+    # may mess with the <div> we added above. Only remove it if it's
+    # still there...
+    if snippet.startswith('<div>') and snippet.endswith('</div>'):
+        snippet = snippet[5:-6]
+    return snippet
 
-        # lxml.Cleaner envelops text elements by internal logic (not accessible)
-        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-        # TODO drop attributes in elements
-        # TODO add policy configuration (content only, embedded content, ...)
-        # XXX this is buggy for "<p>text1</p><p>text2</p>"...
-        # XXX drop these two snippet-stripping actions and follow the lxml behaviour
-        # XXX (tests need to be updated)
-        # if snippet.startswith('<div>') and snippet.endswith('</div>'):
-        #     snippet = snippet[5:-6]
-        # if snippet.startswith('<p>') and snippet.endswith('</p>'):
-        #     snippet = snippet[3:-4]
-        return snippet.decode(encoding)
-
-except (ImportError, AttributeError):
-    # gae environment: lxml not available
-    # fallback implementation
-    def soup2xhtml(data, encoding):
-        # normalize line break
-        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-        return u'\n'.join(data.splitlines())
-else:
-
-    if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
+    # lxml.Cleaner envelops text elements by internal logic (not accessible)
+    # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+    # TODO drop attributes in elements
+    # TODO add policy configuration (content only, embedded content, ...)
+    # XXX this is buggy for "<p>text1</p><p>text2</p>"...
+    # XXX drop these two snippet-stripping actions and follow the lxml behaviour
+    # XXX (tests need to be updated)
+    # if snippet.startswith('<div>') and snippet.endswith('</div>'):
+    #     snippet = snippet[5:-6]
+    # if snippet.startswith('<p>') and snippet.endswith('</p>'):
+    #     snippet = snippet[3:-4]
+    return snippet.decode(encoding)
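For reference, a hypothetical round trip through soup2xhtml (a sketch; the cubicweb.uilib import path is an assumption, and the exact whitespace depends on the lxml version):

from cubicweb.uilib import soup2xhtml

# stray </body> and </html> tags are removed by REM_ROOT_HTML_TAGS, and the
# temporary <div> wrapper is stripped again on the way out
print(soup2xhtml(u'<p>hello</p></body></html>', 'utf-8'))
# -> u'<p>hello</p>'

# unsafe attributes are dropped by the Cleaner (safe_attrs_only=True)
print(soup2xhtml(u'<p onclick="evil()">hi</p>', 'utf-8'))
# -> u'<p>hi</p>'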
 
-        def safe_cut(text, length):
-            """returns an html document of length <length> based on <text>,
-            cut if necessary.
-            """
-            if text is None:
-                return u''
-            dom = etree.HTML(text)
-            curlength = 0
-            add_ellipsis = False
-            for element in dom.iter():
-                if curlength >= length:
-                    parent = element.getparent()
-                    parent.remove(element)
-                    if curlength == length and (element.text or element.tail):
-                        add_ellipsis = True
-                else:
-                    if element.text is not None:
-                        element.text = cut(element.text, length - curlength)
-                        curlength += len(element.text)
-                    if element.tail is not None:
-                        if curlength < length:
-                            element.tail = cut(element.tail, length - curlength)
-                            curlength += len(element.tail)
-                        elif curlength == length:
-                            element.tail = '...'
-                        else:
-                            element.tail = ''
-            text = etree.tounicode(dom[0])[6:-7] # remove wrapping <body></body>
-            if add_ellipsis:
-                return text + u'...'
-            return text
+if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
+    # pylint: disable=E0102
+    def safe_cut(text, length):
+        """returns an html document of length <length> based on <text>,
+        cut if necessary.
+        """
+        if text is None:
+            return u''
+        dom = etree.HTML(text)
+        curlength = 0
+        add_ellipsis = False
+        for element in dom.iter():
+            if curlength >= length:
+                parent = element.getparent()
+                parent.remove(element)
+                if curlength == length and (element.text or element.tail):
+                    add_ellipsis = True
+            else:
+                if element.text is not None:
+                    element.text = cut(element.text, length - curlength)
+                    curlength += len(element.text)
+                if element.tail is not None:
+                    if curlength < length:
+                        element.tail = cut(element.tail, length - curlength)
+                        curlength += len(element.tail)
+                    elif curlength == length:
+                        element.tail = '...'
+                    else:
+                        element.tail = ''
+        text = etree.tounicode(dom[0])[6:-7] # remove wrapping <body></body>
+        if add_ellipsis:
+            return text + u'...'
+        return text
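And a hypothetical safe_cut session (a sketch; cut() is the plain-text truncation helper this function calls above, so the exact ellipsis placement follows its behaviour):

from cubicweb.uilib import safe_cut

# the markup survives; text nodes are truncated through cut() once the
# character budget is spent
print(safe_cut(u'<p>some quite long paragraph</p>', 10))
# -> roughly u'<p>some quite...</p>'

print(safe_cut(None, 10))
# -> u'' (None maps to the empty unicode string)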
 
 def text_cut(text, nbwords=30, gotoperiod=True):
     """from the given plain text, return a text with at least <nbwords> words,