[xhtml] fix soup2xhtml to deal with malformed div, body and html tags which may lead to a malformed return value (stable)
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Fri, 11 Jun 2010 11:36:01 +0200
branch stable
changeset 5730 784025c15a3c
parent 5729 b00cf7fbff31
child 5731 b12afc4dc5e3
[xhtml] fix soup2xhtml to deal with malformed div, body and html tags which may lead to a malformed return value
test/unittest_uilib.py
uilib.py
--- a/test/unittest_uilib.py	Fri Jun 11 09:20:38 2010 +0200
+++ b/test/unittest_uilib.py	Fri Jun 11 11:36:01 2010 +0200
@@ -94,6 +94,54 @@
             got = uilib.text_cut(text, 30)
             self.assertEquals(got, expected)
 
+    def test_soup2xhtml_1_1(self):
+        self.assertEquals(uilib.soup2xhtml('hop <div>', 'ascii'),
+                          'hop <div/>')
+        self.assertEquals(uilib.soup2xhtml('<div> hop', 'ascii'),
+                          '<div> hop</div>')
+        self.assertEquals(uilib.soup2xhtml('hop <div> hop', 'ascii'),
+                          'hop <div> hop</div>')
+
+    def test_soup2xhtml_1_2(self):
+        self.assertEquals(uilib.soup2xhtml('hop </div>', 'ascii'),
+                          'hop ')
+        self.assertEquals(uilib.soup2xhtml('</div> hop', 'ascii'),
+                          '<div/> hop')
+        self.assertEquals(uilib.soup2xhtml('hop </div> hop', 'ascii'),
+                          '<div>hop </div> hop')
+
+    def test_soup2xhtml_2_1(self):
+        self.assertEquals(uilib.soup2xhtml('hop <body>', 'ascii'),
+                          'hop ')
+        self.assertEquals(uilib.soup2xhtml('<body> hop', 'ascii'),
+                          ' hop')
+        self.assertEquals(uilib.soup2xhtml('hop <body> hop', 'ascii'),
+                          'hop  hop')
+
+    def test_soup2xhtml_2_2(self):
+        self.assertEquals(uilib.soup2xhtml('hop </body>', 'ascii'),
+                          'hop ')
+        self.assertEquals(uilib.soup2xhtml('</body> hop', 'ascii'),
+                          ' hop')
+        self.assertEquals(uilib.soup2xhtml('hop </body> hop', 'ascii'),
+                          'hop  hop')
+
+    def test_soup2xhtml_3_1(self):
+        self.assertEquals(uilib.soup2xhtml('hop <html>', 'ascii'),
+                          'hop ')
+        self.assertEquals(uilib.soup2xhtml('<html> hop', 'ascii'),
+                          ' hop')
+        self.assertEquals(uilib.soup2xhtml('hop <html> hop', 'ascii'),
+                          'hop  hop')
+
+    def test_soup2xhtml_3_2(self):
+        self.assertEquals(uilib.soup2xhtml('hop </html>', 'ascii'),
+                          'hop ')
+        self.assertEquals(uilib.soup2xhtml('</html> hop', 'ascii'),
+                          ' hop')
+        self.assertEquals(uilib.soup2xhtml('hop </html> hop', 'ascii'),
+                          'hop  hop')
+
 if __name__ == '__main__':
     unittest_main()
 
--- a/uilib.py	Fri Jun 11 09:20:38 2010 +0200
+++ b/uilib.py	Fri Jun 11 11:36:01 2010 +0200
@@ -18,9 +18,10 @@
 # with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
 """user interface libraries
 
-contains some functions designed to help implementation of cubicweb user interface
+contains some functions designed to help implementation of cubicweb user
+interface.
+"""
 
-"""
 __docformat__ = "restructuredtext en"
 
 import csv
@@ -123,7 +124,7 @@
 
 fallback_safe_cut = safe_cut
 
-
+REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
 try:
     from lxml import etree
 except (ImportError, AttributeError):
@@ -133,12 +134,13 @@
 
     def soup2xhtml(data, encoding):
         """tidy (at least try) html soup and return the result
+
         Note: the function considers a string with no surrounding tag as valid
               if <div>`data`</div> can be parsed by an XML parser
         """
-        # normalize line break
-        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-        data = u'\n'.join(data.splitlines())
+        # remove spurious </body> and </html> tags, then normalize line break
+        # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
+        data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
         # XXX lxml 1.1 support still needed ?
         xmltree = etree.HTML('<div>%s</div>' % data)
         # NOTE: lxml 1.1 (etch platforms) doesn't recognize
@@ -146,7 +148,13 @@
         #       why we specify an encoding and re-decode to unicode later
         body = etree.tostring(xmltree[0], encoding=encoding)
         # remove <body> and </body> and decode to unicode
-        return body[11:-13].decode(encoding)
+        snippet = body[6:-7].decode(encoding)
+        # take care of bad xhtml (for instance starting with </div>) which
+        # may mess with the <div> we added above. Only remove it if it's
+        # still there...
+        if snippet.startswith('<div>') and snippet.endswith('</div>'):
+            snippet = snippet[5:-6]
+        return snippet
 
     if hasattr(etree.HTML('<div>test</div>'), 'iter'):