[uilib] soup2xhtml now uses lxml.html.Cleaner stable
authorJulien Jehannet <julien.jehannet@logilab.fr>
Fri, 22 Oct 2010 16:14:48 +0200
branchstable
changeset 6685 eeedb3575d25
parent 6684 b8bd0ecced2e
child 6686 070e3b23160d
[uilib] soup2xhtml now uses lxml.html.Cleaner The lxml cleaner class lets you more easily configure the allowed tag elements in parsed content. Caveats: - attributes in elements are not dropped (and html layout can be broken if css classes are reused) - root tag element in snippet is buggy
test/unittest_entity.py
test/unittest_uilib.py
uilib.py
--- a/test/unittest_entity.py	Fri Nov 05 14:28:22 2010 +0100
+++ b/test/unittest_entity.py	Fri Oct 22 16:14:48 2010 +0200
@@ -24,6 +24,8 @@
 from cubicweb.devtools.testlib import CubicWebTC
 from cubicweb.mttransforms import HAS_TAL
 from cubicweb.entities import fetch_config
+from cubicweb.uilib import soup2xhtml
+
 
 class EntityTC(CubicWebTC):
 
@@ -412,24 +414,14 @@
         self.assertEqual(e.printable_value('content'), u'hop\nhop\nhip\nmomo')
 
     def test_printable_value_bad_html_ms(self):
-        self.skipTest('fix soup2xhtml to handle this test')
         req = self.request()
         e = req.create_entity('Card', title=u'bad html', content=u'<div>R&D<br>',
                             content_format=u'text/html')
         tidy = lambda x: x.replace('\n', '')
         e.cw_attr_cache['content'] = u'<div x:foo="bar">ms orifice produces weird html</div>'
-        self.assertEqual(tidy(e.printable_value('content')),
-                          u'<div>ms orifice produces weird html</div>')
-        import tidy as tidymod # apt-get install python-tidy
-        tidy = lambda x: str(tidymod.parseString(x.encode('utf-8'),
-                                                 **{'drop_proprietary_attributes': True,
-                                                    'output_xhtml': True,
-                                                    'show_body_only' : True,
-                                                    'quote-nbsp' : False,
-                                                    'char_encoding' : 'utf8'})).decode('utf-8').strip()
-        self.assertEqual(tidy(e.printable_value('content')),
-                          u'<div>ms orifice produces weird html</div>')
-
+        # Caution! current implementation of soup2xhtml strips first div element
+        content = soup2xhtml(e.printable_value('content'), 'utf-8')
+        self.assertMultiLineEqual(content, u'<div>ms orifice produces weird html</div>')
 
     def test_fulltextindex(self):
         e = self.vreg['etypes'].etype_class('File')(self.request())
--- a/test/unittest_uilib.py	Fri Nov 05 14:28:22 2010 +0100
+++ b/test/unittest_uilib.py	Fri Oct 22 16:14:48 2010 +0200
@@ -20,10 +20,16 @@
 
 __docformat__ = "restructuredtext en"
 
+
+import pkg_resources
 from logilab.common.testlib import TestCase, unittest_main
+from unittest2 import skipIf
 
 from cubicweb import uilib
 
+lxml_version = pkg_resources.get_distribution('lxml').version.split('.')
+
+
 class UILIBTC(TestCase):
 
     def test_remove_tags(self):
@@ -91,7 +97,15 @@
             got = uilib.text_cut(text, 30)
             self.assertEqual(got, expected)
 
+    def test_soup2xhtml_0(self):
+        self.assertEqual(uilib.soup2xhtml('hop\r\nhop', 'ascii'),
+                          'hop\nhop')
+
     def test_soup2xhtml_1_1(self):
+        self.assertEqual(uilib.soup2xhtml('hop', 'ascii'),
+                          'hop')
+        self.assertEqual(uilib.soup2xhtml('hop<div>', 'ascii'),
+                          'hop<div/>')
         self.assertEqual(uilib.soup2xhtml('hop <div>', 'ascii'),
                           'hop <div/>')
         self.assertEqual(uilib.soup2xhtml('<div> hop', 'ascii'),
@@ -115,11 +129,14 @@
         self.assertEqual(uilib.soup2xhtml('hop <body> hop', 'ascii'),
                           'hop  hop')
 
-    def test_soup2xhtml_2_2(self):
+    def test_soup2xhtml_2_2a(self):
         self.assertEqual(uilib.soup2xhtml('hop </body>', 'ascii'),
                           'hop ')
         self.assertEqual(uilib.soup2xhtml('</body> hop', 'ascii'),
                           ' hop')
+
+    @skipIf(lxml_version < ['2', '2'], 'expected behaviour on recent version of lxml only')
+    def test_soup2xhtml_2_2b(self):
         self.assertEqual(uilib.soup2xhtml('hop </body> hop', 'ascii'),
                           'hop  hop')
 
@@ -139,6 +156,10 @@
         self.assertEqual(uilib.soup2xhtml('hop </html> hop', 'ascii'),
                           'hop  hop')
 
+    def test_soup2xhtml_3_3(self):
+        self.assertEqual(uilib.soup2xhtml('<script>test</script> hop ', 'ascii'),
+                          ' hop ')
+
     def test_js(self):
         self.assertEqual(str(uilib.js.pouet(1, "2")),
                           'pouet(1,"2")')
@@ -147,6 +168,23 @@
         self.assertEqual(str(uilib.js.cw.pouet(1, "2").pouet(None)),
                           'cw.pouet(1,"2").pouet(null)')
 
+    def test_embedded_css(self):
+        incoming = u"""voir le ticket <style type="text/css">@font-face { font-family: "Cambria"; }p.MsoNormal, li.MsoNormal, div.MsoNormal { margin: 0cm 0cm 10pt; font-size: 12pt; font-family: "Times New Roman"; }a:link, span.MsoHyperlink { color: blue; text-decoration: underline; }a:visited, span.MsoHyperlinkFollowed { color: purple; text-decoration: underline; }div.Section1 { page: Section1; }</style></p><p class="MsoNormal">text</p>"""
+        expected = 'voir le ticket <p class="MsoNormal">text</p>'
+        self.assertMultiLineEqual(uilib.soup2xhtml(incoming, 'ascii'), expected)
+
+    def test_unknown_namespace(self):
+        incoming = '''<table cellspacing="0" cellpadding="0" width="81" border="0" x:str="" style="width: 61pt; border-collapse: collapse">
+<colgroup><col width="81" style="width: 61pt; mso-width-source: userset; mso-width-alt: 2962"/></colgroup>
+<tbody><tr height="17" style="height: 12.75pt"><td width="81" height="17" style="border-right: #e0dfe3; border-top: #e0dfe3; border-left: #e0dfe3; width: 61pt; border-bottom: #e0dfe3; height: 12.75pt; background-color: transparent"><font size="2">XXXXXXX</font></td></tr></tbody>
+</table>'''
+        expected = '''<table cellspacing="0" cellpadding="0" width="81" border="0">\
+<colgroup><col width="81"/></colgroup>\
+<tbody><tr height="17"><td width="81" height="17">XXXXXXX</td></tr></tbody>\
+</table>'''
+        self.assertMultiLineEqual(uilib.soup2xhtml(incoming, 'ascii'), expected)
+
+
 if __name__ == '__main__':
     unittest_main()
 
--- a/uilib.py	Fri Nov 05 14:28:22 2010 +0100
+++ b/uilib.py	Fri Oct 22 16:14:48 2010 +0200
@@ -109,12 +109,6 @@
         return u''
     return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text)
 
-# fallback implementation, nicer one defined below if lxml is available
-def soup2xhtml(data, encoding):
-    # normalize line break
-    # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-    return u'\n'.join(data.splitlines())
-
 # fallback implementation, nicer one defined below if lxml> 2.0 is available
 def safe_cut(text, length):
     """returns a string of length <length> based on <text>, removing any html
@@ -132,24 +126,29 @@
 fallback_safe_cut = safe_cut
 
 REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
+
 try:
-    from lxml import etree
-except (ImportError, AttributeError):
-    # gae environment: lxml not available
-    pass
-else:
+    from lxml import etree, html
+    from lxml.html import clean, defs
+
+    ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
+                    defs.phrase_tags | defs.font_style_tags |
+                    set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
+                    )
+
+    CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
+                            style=True, safe_attrs_only=True,
+                            add_nofollow=False,
+                            )
 
     def soup2xhtml(data, encoding):
-        """tidy (at least try) html soup and return the result
-
-        Note: the function considers a string with no surrounding tag as valid
-              if <div>`data`</div> can be parsed by an XML parser
+        """tidy html soup by allowing some element tags and return the result
         """
         # remove spurious </body> and </html> tags, then normalize line break
         # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
         data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
         # XXX lxml 1.1 support still needed ?
-        xmltree = etree.HTML('<div>%s</div>' % data)
+        xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
         # NOTE: lxml 1.1 (etch platforms) doesn't recognize
         #       the encoding=unicode parameter (lxml 2.0 does), this is
         #       why we specify an encoding and re-decode to unicode later
@@ -163,7 +162,29 @@
             snippet = snippet[5:-6]
         return snippet
 
-    if hasattr(etree.HTML('<div>test</div>'), 'iter'):
+        # lxml.Cleaner envelops text elements by internal logic (not accessible)
+        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+        # TODO drop attributes in elements
+        # TODO add policy configuration (content only, embedded content, ...)
+        # XXX this is buggy for "<p>text1</p><p>text2</p>"...
+        # XXX drop these two snippets action and follow the lxml behaviour
+        # XXX (tests need to be updated)
+        # if snippet.startswith('<div>') and snippet.endswith('</div>'):
+        #     snippet = snippet[5:-6]
+        # if snippet.startswith('<p>') and snippet.endswith('</p>'):
+        #     snippet = snippet[3:-4]
+        return snippet.decode(encoding)
+
+except (ImportError, AttributeError):
+    # gae environment: lxml not available
+    # fallback implementation
+    def soup2xhtml(data, encoding):
+        # normalize line break
+        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+        return u'\n'.join(data.splitlines())
+else:
+
+    if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
 
         def safe_cut(text, length):
             """returns an html document of length <length> based on <text>,