# HG changeset patch # User Julien Jehannet # Date 1287756888 -7200 # Node ID eeedb3575d2537e01a958db0126f226984edbb49 # Parent b8bd0ecced2e8cb28e2200204a727e443fbc8465 [uilib] soup2xhtml now uses lxml.html.Cleaner The lxml cleaner class lets you configure more easily the allowed tag elements in parsed content. Caveats: - attributes in elements are not dropped (and html layout can be broken if css classes are reused) - root tag element in snippet is buggy diff -r b8bd0ecced2e -r eeedb3575d25 test/unittest_entity.py --- a/test/unittest_entity.py Fri Nov 05 14:28:22 2010 +0100 +++ b/test/unittest_entity.py Fri Oct 22 16:14:48 2010 +0200 @@ -24,6 +24,8 @@ from cubicweb.devtools.testlib import CubicWebTC from cubicweb.mttransforms import HAS_TAL from cubicweb.entities import fetch_config +from cubicweb.uilib import soup2xhtml + class EntityTC(CubicWebTC): @@ -412,24 +414,14 @@ self.assertEqual(e.printable_value('content'), u'hop\nhop\nhip\nmomo') def test_printable_value_bad_html_ms(self): - self.skipTest('fix soup2xhtml to handle this test') req = self.request() e = req.create_entity('Card', title=u'bad html', content=u'<div>
R&D
', content_format=u'text/html') tidy = lambda x: x.replace('\n', '') e.cw_attr_cache['content'] = u'
ms orifice produces weird html
' - self.assertEqual(tidy(e.printable_value('content')), - u'
ms orifice produces weird html
') - import tidy as tidymod # apt-get install python-tidy - tidy = lambda x: str(tidymod.parseString(x.encode('utf-8'), - **{'drop_proprietary_attributes': True, - 'output_xhtml': True, - 'show_body_only' : True, - 'quote-nbsp' : False, - 'char_encoding' : 'utf8'})).decode('utf-8').strip() - self.assertEqual(tidy(e.printable_value('content')), - u'
ms orifice produces weird html
') - + # Caution! current implementation of soup2xhtml strips first div element + content = soup2xhtml(e.printable_value('content'), 'utf-8') + self.assertMultiLineEqual(content, u'
ms orifice produces weird html
') def test_fulltextindex(self): e = self.vreg['etypes'].etype_class('File')(self.request()) diff -r b8bd0ecced2e -r eeedb3575d25 test/unittest_uilib.py --- a/test/unittest_uilib.py Fri Nov 05 14:28:22 2010 +0100 +++ b/test/unittest_uilib.py Fri Oct 22 16:14:48 2010 +0200 @@ -20,10 +20,16 @@ __docformat__ = "restructuredtext en" + +import pkg_resources from logilab.common.testlib import TestCase, unittest_main +from unittest2 import skipIf from cubicweb import uilib +lxml_version = pkg_resources.get_distribution('lxml').version.split('.') + + class UILIBTC(TestCase): def test_remove_tags(self): @@ -91,7 +97,15 @@ got = uilib.text_cut(text, 30) self.assertEqual(got, expected) + def test_soup2xhtml_0(self): + self.assertEqual(uilib.soup2xhtml('hop\r\nhop', 'ascii'), + 'hop\nhop') + def test_soup2xhtml_1_1(self): + self.assertEqual(uilib.soup2xhtml('hop', 'ascii'), + 'hop') + self.assertEqual(uilib.soup2xhtml('hop
', 'ascii'), + 'hop
') self.assertEqual(uilib.soup2xhtml('hop
', 'ascii'), 'hop
') self.assertEqual(uilib.soup2xhtml('
hop', 'ascii'), @@ -115,11 +129,14 @@ self.assertEqual(uilib.soup2xhtml('hop hop', 'ascii'), 'hop hop') - def test_soup2xhtml_2_2(self): + def test_soup2xhtml_2_2a(self): self.assertEqual(uilib.soup2xhtml('hop ', 'ascii'), 'hop ') self.assertEqual(uilib.soup2xhtml(' hop', 'ascii'), ' hop') + + @skipIf(lxml_version < ['2', '2'], 'expected behaviour on recent version of lxml only') + def test_soup2xhtml_2_2b(self): self.assertEqual(uilib.soup2xhtml('hop hop', 'ascii'), 'hop hop') @@ -139,6 +156,10 @@ self.assertEqual(uilib.soup2xhtml('hop hop', 'ascii'), 'hop hop') + def test_soup2xhtml_3_3(self): + self.assertEqual(uilib.soup2xhtml(' hop ', 'ascii'), + ' hop ') + def test_js(self): self.assertEqual(str(uilib.js.pouet(1, "2")), 'pouet(1,"2")') @@ -147,6 +168,23 @@ self.assertEqual(str(uilib.js.cw.pouet(1, "2").pouet(None)), 'cw.pouet(1,"2").pouet(null)') + def test_embedded_css(self): + incoming = u"""voir le ticket

text

""" + expected = 'voir le ticket

text

' + self.assertMultiLineEqual(uilib.soup2xhtml(incoming, 'ascii'), expected) + + def test_unknown_namespace(self): + incoming = ''' + + +
XXXXXXX
''' + expected = '''\ +\ +\ +
XXXXXXX
''' + self.assertMultiLineEqual(uilib.soup2xhtml(incoming, 'ascii'), expected) + + if __name__ == '__main__': unittest_main() diff -r b8bd0ecced2e -r eeedb3575d25 uilib.py --- a/uilib.py Fri Nov 05 14:28:22 2010 +0100 +++ b/uilib.py Fri Oct 22 16:14:48 2010 +0200 @@ -109,12 +109,6 @@ return u'' return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text) -# fallback implementation, nicer one defined below if lxml is available -def soup2xhtml(data, encoding): - # normalize line break - # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1 - return u'\n'.join(data.splitlines()) - # fallback implementation, nicer one defined below if lxml> 2.0 is available def safe_cut(text, length): """returns a string of length based on , removing any html @@ -132,24 +126,29 @@ fallback_safe_cut = safe_cut REM_ROOT_HTML_TAGS = re.compile('', re.U) + try: - from lxml import etree -except (ImportError, AttributeError): - # gae environment: lxml not available - pass -else: + from lxml import etree, html + from lxml.html import clean, defs + + ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags | + defs.phrase_tags | defs.font_style_tags | + set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup')) + ) + + CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False, + style=True, safe_attrs_only=True, + add_nofollow=False, + ) def soup2xhtml(data, encoding): - """tidy (at least try) html soup and return the result - - Note: the function considers a string with no surrounding tag as valid - if
`data`
can be parsed by an XML parser + """tidy html soup by allowing some element tags and return the result """ # remove spurious and tags, then normalize line break # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1) data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines())) # XXX lxml 1.1 support still needed ? - xmltree = etree.HTML('
%s
' % data) + xmltree = etree.HTML(CLEANER.clean_html('
%s
' % data)) # NOTE: lxml 1.1 (etch platforms) doesn't recognize # the encoding=unicode parameter (lxml 2.0 does), this is # why we specify an encoding and re-decode to unicode later @@ -163,7 +162,29 @@ snippet = snippet[5:-6] return snippet - if hasattr(etree.HTML('
test
'), 'iter'): + # lxml.Cleaner envelops text elements by internal logic (not accessible) + # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1 + # TODO drop attributes in elements + # TODO add policy configuration (content only, embedded content, ...) + # XXX this is buggy for "

text1

text2

"... + # XXX drop these two snippets action and follow the lxml behaviour + # XXX (tests need to be updated) + # if snippet.startswith('
') and snippet.endswith('
'): + # snippet = snippet[5:-6] + # if snippet.startswith('

') and snippet.endswith('

'): + # snippet = snippet[3:-4] + return snippet.decode(encoding) + +except (ImportError, AttributeError): + # gae environment: lxml not available + # fallback implementation + def soup2xhtml(data, encoding): + # normalize line break + # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1 + return u'\n'.join(data.splitlines()) +else: + + if hasattr(etree.HTML('
test
'), 'iter'): # XXX still necessary? def safe_cut(text, length): """returns an html document of length based on ,