uilib.py
branch stable
changeset 6685 eeedb3575d25
parent 6683 68cfebd3b9f3
child 6688 51ddb4842c56
--- a/uilib.py	Fri Nov 05 14:28:22 2010 +0100
+++ b/uilib.py	Fri Oct 22 16:14:48 2010 +0200
@@ -109,12 +109,6 @@
         return u''
     return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text)
 
-# fallback implementation, nicer one defined below if lxml is available
-def soup2xhtml(data, encoding):
-    # normalize line break
-    # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-    return u'\n'.join(data.splitlines())
-
 # fallback implementation, nicer one defined below if lxml > 2.0 is available
 def safe_cut(text, length):
     """returns a string of length <length> based on <text>, removing any html
@@ -132,24 +126,29 @@
 fallback_safe_cut = safe_cut
 
 REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
+
 try:
-    from lxml import etree
-except (ImportError, AttributeError):
-    # gae environment: lxml not available
-    pass
-else:
+    from lxml import etree, html
+    from lxml.html import clean, defs
+
+    ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
+                    defs.phrase_tags | defs.font_style_tags |
+                    set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
+                    )
+
+    CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
+                            style=True, safe_attrs_only=True,
+                            add_nofollow=False,
+                            )
 
     def soup2xhtml(data, encoding):
-        """tidy (at least try) html soup and return the result
-
-        Note: the function considers a string with no surrounding tag as valid
-              if <div>`data`</div> can be parsed by an XML parser
+        """tidy html soup by allowing some element tags and return the result
         """
         # remove spurious </body> and </html> tags, then normalize line break
         # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
         data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
         # XXX lxml 1.1 support still needed?
-        xmltree = etree.HTML('<div>%s</div>' % data)
+        xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
         # NOTE: lxml 1.1 (etch platforms) doesn't recognize
         #       the encoding=unicode parameter (lxml 2.0 does), this is
         #       why we specify an encoding and re-decode to unicode later
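
To make the new sanitizing path concrete, here is a sketch (not part of the changeset; the exact serialization may vary across lxml versions): a Cleaner configured as above kills <script> elements together with their content, strips event-handler attributes such as onclick (safe_attrs_only), and drops tags outside ALLOWED_TAGS while keeping their text:

    from lxml.html import clean, defs

    ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
                    defs.phrase_tags | defs.font_style_tags |
                    set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup')))
    CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
                            style=True, safe_attrs_only=True, add_nofollow=False)

    # <script> is removed with its content, onclick is stripped, and the
    # unknown <blink> tag is dropped while its text is kept
    dirty = ('<div><script>alert("xss")</script>'
             '<p onclick="evil()">hi</p><blink>there</blink></div>')
    print CLEANER.clean_html(dirty)
    # expected, modulo lxml version: <div><p>hi</p>there</div>
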
@@ -163,7 +162,27 @@
             snippet = snippet[5:-6]
-        return snippet
+        # lxml.Cleaner wraps text elements by internal logic (not accessible)
+        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+        # TODO drop attributes in elements
+        # TODO add policy configuration (content only, embedded content, ...)
+        # XXX this is buggy for "<p>text1</p><p>text2</p>"...
+        # XXX drop these two snippet-stripping actions and follow the lxml
+        # XXX behaviour (tests need to be updated)
+        # if snippet.startswith('<div>') and snippet.endswith('</div>'):
+        #     snippet = snippet[5:-6]
+        # if snippet.startswith('<p>') and snippet.endswith('</p>'):
+        #     snippet = snippet[3:-4]
+        return snippet.decode(encoding)
 
-    if hasattr(etree.HTML('<div>test</div>'), 'iter'):
+except (ImportError, AttributeError):
+    # gae environment: lxml not available
+    # fallback implementation
+    def soup2xhtml(data, encoding):
+        # normalize line break
+        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+        return u'\n'.join(data.splitlines())
+else:
+
+    if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
 
         def safe_cut(text, length):
             """returns an html document of length <length> based on <text>,