[uilib] soup2xhtml now uses lxml.html.Cleaner
The lxml Cleaner class makes it easier to configure which tag elements are allowed in the parsed content.
Caveats:
- element attributes are not dropped (so the html layout can break if css classes are reused)
- handling of the root tag element in snippets is buggy
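For illustration, a minimal sketch of the whitelist-based cleaning this changeset relies on; the tag whitelist below is reduced for readability, while uilib.py builds the real one from the lxml.html.defs tag sets (see the last hunk):

    from lxml.html import clean

    # reduced whitelist, for this example only
    cleaner = clean.Cleaner(allow_tags=('div', 'p', 'span', 'a', 'br'),
                            remove_unknown_tags=False,  # mandatory when allow_tags is given
                            style=True,            # drop <style> blocks and style attributes
                            safe_attrs_only=True)  # keep only lxml's safe attributes
    # scripts and on* attributes are removed by the Cleaner defaults
    print cleaner.clean_html('<div onclick="evil()">hop<script>x</script></div>')
    # prints (roughly): <div>hop</div>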
--- a/test/unittest_entity.py Fri Nov 05 14:28:22 2010 +0100
+++ b/test/unittest_entity.py Fri Oct 22 16:14:48 2010 +0200
@@ -24,6 +24,8 @@
from cubicweb.devtools.testlib import CubicWebTC
from cubicweb.mttransforms import HAS_TAL
from cubicweb.entities import fetch_config
+from cubicweb.uilib import soup2xhtml
+
class EntityTC(CubicWebTC):
@@ -412,24 +414,14 @@
self.assertEqual(e.printable_value('content'), u'hop\nhop\nhip\nmomo')
def test_printable_value_bad_html_ms(self):
- self.skipTest('fix soup2xhtml to handle this test')
req = self.request()
e = req.create_entity('Card', title=u'bad html', content=u'<div>R&D<br>',
content_format=u'text/html')
tidy = lambda x: x.replace('\n', '')
e.cw_attr_cache['content'] = u'<div x:foo="bar">ms orifice produces weird html</div>'
- self.assertEqual(tidy(e.printable_value('content')),
- u'<div>ms orifice produces weird html</div>')
- import tidy as tidymod # apt-get install python-tidy
- tidy = lambda x: str(tidymod.parseString(x.encode('utf-8'),
- **{'drop_proprietary_attributes': True,
- 'output_xhtml': True,
- 'show_body_only' : True,
- 'quote-nbsp' : False,
- 'char_encoding' : 'utf8'})).decode('utf-8').strip()
- self.assertEqual(tidy(e.printable_value('content')),
- u'<div>ms orifice produces weird html</div>')
-
+ # Caution! the current implementation of soup2xhtml strips the first div element
+ content = soup2xhtml(e.printable_value('content'), 'utf-8')
+ self.assertMultiLineEqual(content, u'<div>ms orifice produces weird html</div>')
def test_fulltextindex(self):
e = self.vreg['etypes'].etype_class('File')(self.request())
--- a/test/unittest_uilib.py Fri Nov 05 14:28:22 2010 +0100
+++ b/test/unittest_uilib.py Fri Oct 22 16:14:48 2010 +0200
@@ -20,10 +20,16 @@
__docformat__ = "restructuredtext en"
+
+import pkg_resources
from logilab.common.testlib import TestCase, unittest_main
+from unittest2 import skipIf
from cubicweb import uilib
+lxml_version = pkg_resources.get_distribution('lxml').version.split('.')
+
+
class UILIBTC(TestCase):
def test_remove_tags(self):
@@ -91,7 +97,15 @@
got = uilib.text_cut(text, 30)
self.assertEqual(got, expected)
+ def test_soup2xhtml_0(self):
+ self.assertEqual(uilib.soup2xhtml('hop\r\nhop', 'ascii'),
+ 'hop\nhop')
+
def test_soup2xhtml_1_1(self):
+ self.assertEqual(uilib.soup2xhtml('hop', 'ascii'),
+ 'hop')
+ self.assertEqual(uilib.soup2xhtml('hop<div>', 'ascii'),
+ 'hop<div/>')
self.assertEqual(uilib.soup2xhtml('hop <div>', 'ascii'),
'hop <div/>')
self.assertEqual(uilib.soup2xhtml('<div> hop', 'ascii'),
@@ -115,11 +129,14 @@
self.assertEqual(uilib.soup2xhtml('hop <body> hop', 'ascii'),
'hop hop')
- def test_soup2xhtml_2_2(self):
+ def test_soup2xhtml_2_2a(self):
self.assertEqual(uilib.soup2xhtml('hop </body>', 'ascii'),
'hop ')
self.assertEqual(uilib.soup2xhtml('</body> hop', 'ascii'),
' hop')
+
+ @skipIf(lxml_version < ['2', '2'], 'expected behaviour on recent versions of lxml only')
+ def test_soup2xhtml_2_2b(self):
self.assertEqual(uilib.soup2xhtml('hop </body> hop', 'ascii'),
'hop hop')
@@ -139,6 +156,10 @@
self.assertEqual(uilib.soup2xhtml('hop </html> hop', 'ascii'),
'hop hop')
+ def test_soup2xhtml_3_3(self):
+ self.assertEqual(uilib.soup2xhtml('<script>test</script> hop ', 'ascii'),
+ ' hop ')
+
def test_js(self):
self.assertEqual(str(uilib.js.pouet(1, "2")),
'pouet(1,"2")')
@@ -147,6 +168,23 @@
self.assertEqual(str(uilib.js.cw.pouet(1, "2").pouet(None)),
'cw.pouet(1,"2").pouet(null)')
+ def test_embedded_css(self):
+ incoming = u"""voir le ticket <style type="text/css">@font-face { font-family: "Cambria"; }p.MsoNormal, li.MsoNormal, div.MsoNormal { margin: 0cm 0cm 10pt; font-size: 12pt; font-family: "Times New Roman"; }a:link, span.MsoHyperlink { color: blue; text-decoration: underline; }a:visited, span.MsoHyperlinkFollowed { color: purple; text-decoration: underline; }div.Section1 { page: Section1; }</style></p><p class="MsoNormal">text</p>"""
+ expected = 'voir le ticket <p class="MsoNormal">text</p>'
+ self.assertMultiLineEqual(uilib.soup2xhtml(incoming, 'ascii'), expected)
+
+ def test_unknown_namespace(self):
+ incoming = '''<table cellspacing="0" cellpadding="0" width="81" border="0" x:str="" style="width: 61pt; border-collapse: collapse">
+<colgroup><col width="81" style="width: 61pt; mso-width-source: userset; mso-width-alt: 2962"/></colgroup>
+<tbody><tr height="17" style="height: 12.75pt"><td width="81" height="17" style="border-right: #e0dfe3; border-top: #e0dfe3; border-left: #e0dfe3; width: 61pt; border-bottom: #e0dfe3; height: 12.75pt; background-color: transparent"><font size="2">XXXXXXX</font></td></tr></tbody>
+</table>'''
+ expected = '''<table cellspacing="0" cellpadding="0" width="81" border="0">\
+<colgroup><col width="81"/></colgroup>\
+<tbody><tr height="17"><td width="81" height="17">XXXXXXX</td></tr></tbody>\
+</table>'''
+ self.assertMultiLineEqual(uilib.soup2xhtml(incoming, 'ascii'), expected)
+
+
if __name__ == '__main__':
unittest_main()
--- a/uilib.py Fri Nov 05 14:28:22 2010 +0100
+++ b/uilib.py Fri Oct 22 16:14:48 2010 +0200
@@ -109,12 +109,6 @@
return u''
return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text)
-# fallback implementation, nicer one defined below if lxml is available
-def soup2xhtml(data, encoding):
- # normalize line break
- # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
- return u'\n'.join(data.splitlines())
-
# fallback implementation, nicer one defined below if lxml > 2.0 is available
def safe_cut(text, length):
"""returns a string of length <length> based on <text>, removing any html
@@ -132,24 +126,29 @@
fallback_safe_cut = safe_cut
REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
+
try:
- from lxml import etree
-except (ImportError, AttributeError):
- # gae environment: lxml not available
- pass
-else:
+ from lxml import etree, html
+ from lxml.html import clean, defs
+
+ ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
+ defs.phrase_tags | defs.font_style_tags |
+ set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
+ )
+
+ CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
+ style=True, safe_attrs_only=True,
+ add_nofollow=False,
+ )
def soup2xhtml(data, encoding):
- """tidy (at least try) html soup and return the result
-
- Note: the function considers a string with no surrounding tag as valid
- if <div>`data`</div> can be parsed by an XML parser
+ """tidy html soup by allowing some element tags and return the result
"""
# remove spurious </body> and </html> tags, then normalize line break
# (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
# XXX lxml 1.1 support still needed ?
- xmltree = etree.HTML('<div>%s</div>' % data)
+ xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
# NOTE: lxml 1.1 (etch platforms) doesn't recognize
# the encoding=unicode parameter (lxml 2.0 does), this is
# why we specify an encoding and re-decode to unicode later
@@ -163,7 +162,29 @@
snippet = snippet[5:-6]
return snippet
- if hasattr(etree.HTML('<div>test</div>'), 'iter'):
+ # lxml.Cleaner wraps text elements according to internal logic we cannot override
+ # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+ # TODO drop attributes in elements
+ # TODO add policy configuration (content only, embedded content, ...)
+ # XXX this is buggy for "<p>text1</p><p>text2</p>"...
+ # XXX drop the two snippet-stripping actions below and follow the lxml behaviour
+ # XXX (tests need to be updated)
+ # if snippet.startswith('<div>') and snippet.endswith('</div>'):
+ # snippet = snippet[5:-6]
+ # if snippet.startswith('<p>') and snippet.endswith('</p>'):
+ # snippet = snippet[3:-4]
+ return snippet.decode(encoding)
+
+except (ImportError, AttributeError):
+ # gae environment: lxml not available
+ # fallback implementation
+ def soup2xhtml(data, encoding):
+ # normalize line break
+ # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+ return u'\n'.join(data.splitlines())
+else:
+
+ if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
def safe_cut(text, length):
"""returns an html document of length <length> based on <text>,