# HG changeset patch # User Sylvain Thénault # Date 1276248961 -7200 # Node ID 784025c15a3cba71b878f63fc89584936d451a03 # Parent b00cf7fbff31127adb653adf1649d7a80c9445ed [xhtml] fix soup2xhtml to deal with malformed div,body and html tags which may leads to malformed return value diff -r b00cf7fbff31 -r 784025c15a3c test/unittest_uilib.py --- a/test/unittest_uilib.py Fri Jun 11 09:20:38 2010 +0200 +++ b/test/unittest_uilib.py Fri Jun 11 11:36:01 2010 +0200 @@ -94,6 +94,54 @@ got = uilib.text_cut(text, 30) self.assertEquals(got, expected) + def test_soup2xhtml_1_1(self): + self.assertEquals(uilib.soup2xhtml('hop
', 'ascii'), + 'hop
') + self.assertEquals(uilib.soup2xhtml('
hop', 'ascii'), + '
hop
') + self.assertEquals(uilib.soup2xhtml('hop
hop', 'ascii'), + 'hop
hop
') + + def test_soup2xhtml_1_2(self): + self.assertEquals(uilib.soup2xhtml('hop
', 'ascii'), + 'hop ') + self.assertEquals(uilib.soup2xhtml('
hop', 'ascii'), + '
hop') + self.assertEquals(uilib.soup2xhtml('hop
hop', 'ascii'), + '
hop
hop') + + def test_soup2xhtml_2_1(self): + self.assertEquals(uilib.soup2xhtml('hop ', 'ascii'), + 'hop ') + self.assertEquals(uilib.soup2xhtml(' hop', 'ascii'), + ' hop') + self.assertEquals(uilib.soup2xhtml('hop hop', 'ascii'), + 'hop hop') + + def test_soup2xhtml_2_2(self): + self.assertEquals(uilib.soup2xhtml('hop ', 'ascii'), + 'hop ') + self.assertEquals(uilib.soup2xhtml(' hop', 'ascii'), + ' hop') + self.assertEquals(uilib.soup2xhtml('hop hop', 'ascii'), + 'hop hop') + + def test_soup2xhtml_3_1(self): + self.assertEquals(uilib.soup2xhtml('hop ', 'ascii'), + 'hop ') + self.assertEquals(uilib.soup2xhtml(' hop', 'ascii'), + ' hop') + self.assertEquals(uilib.soup2xhtml('hop hop', 'ascii'), + 'hop hop') + + def test_soup2xhtml_3_2(self): + self.assertEquals(uilib.soup2xhtml('hop ', 'ascii'), + 'hop ') + self.assertEquals(uilib.soup2xhtml(' hop', 'ascii'), + ' hop') + self.assertEquals(uilib.soup2xhtml('hop hop', 'ascii'), + 'hop hop') + if __name__ == '__main__': unittest_main() diff -r b00cf7fbff31 -r 784025c15a3c uilib.py --- a/uilib.py Fri Jun 11 09:20:38 2010 +0200 +++ b/uilib.py Fri Jun 11 11:36:01 2010 +0200 @@ -18,9 +18,10 @@ # with CubicWeb. If not, see . """user interface libraries -contains some functions designed to help implementation of cubicweb user interface +contains some functions designed to help implementation of cubicweb user +interface. +""" -""" __docformat__ = "restructuredtext en" import csv @@ -123,7 +124,7 @@ fallback_safe_cut = safe_cut - +REM_ROOT_HTML_TAGS = re.compile('', re.U) try: from lxml import etree except (ImportError, AttributeError): @@ -133,12 +134,13 @@ def soup2xhtml(data, encoding): """tidy (at least try) html soup and return the result + Note: the function considers a string with no surrounding tag as valid if
`data`
can be parsed by an XML parser """ - # normalize line break - # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1 - data = u'\n'.join(data.splitlines()) + # remove spurious and tags, then normalize line break + # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1) + data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines())) # XXX lxml 1.1 support still needed ? xmltree = etree.HTML('<div>%s</div>' % data) # NOTE: lxml 1.1 (etch platforms) doesn't recognize @@ -146,7 +148,13 @@ # why we specify an encoding and re-decode to unicode later body = etree.tostring(xmltree[0], encoding=encoding) # remove <body> and </body> and decode to unicode - return body[11:-13].decode(encoding) + snippet = body[6:-7].decode(encoding) + # take care to bad xhtml (for instance starting with
) which + # may mess with the
<div> we added below. Only remove it if it's + # still there... + if snippet.startswith('<div>') and snippet.endswith('</div>'): + snippet = snippet[5:-6] + return snippet if hasattr(etree.HTML('
test
'), 'iter'):