devtools/htmlparser.py
branchstable
changeset 9013 b4bcabf55e77
parent 8979 8f5416b1562a
child 10006 8391bf718485
--- a/devtools/htmlparser.py	Fri Jun 14 16:13:24 2013 +0200
+++ b/devtools/htmlparser.py	Fri Jun 14 16:26:25 2013 +0200
@@ -1,4 +1,4 @@
-# copyright 2003-2010 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
 #
 # This file is part of CubicWeb.
@@ -19,10 +19,12 @@
 
 import re
 import sys
+from xml import sax
+from cStringIO import StringIO
 
 from lxml import etree
 
-from logilab.common.deprecation import class_deprecated
+from logilab.common.deprecation import class_deprecated, class_renamed
 
 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
 
@@ -31,22 +33,67 @@
 
 ERR_COUNT = 0
 
-class Validator(object):
+_REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
+def _remove_script_tags(data):
+    """Remove the script (usually javascript) tags to help the lxml
+    XMLParser / HTMLParser do their job. Without that, they choke on
+    tags embedded in JS strings.
+    """
+    # Notice we may want to use lxml cleaner, but it's far too intrusive:
+    #
+    # cleaner = Cleaner(scripts=True,
+    #                   javascript=False,
+    #                   comments=False,
+    #                   style=False,
+    #                   links=False,
+    #                   meta=False,
+    #                   page_structure=False,
+    #                   processing_instructions=False,
+    #                   embedded=False,
+    #                   frames=False,
+    #                   forms=False,
+    #                   annoying_tags=False,
+    #                   remove_tags=(),
+    #                   remove_unknown_tags=False,
+    #                   safe_attrs_only=False,
+    #                   add_nofollow=False)
+    # >>> cleaner.clean_html('<body></body>')
+    # '<span></span>'
+    # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
+    # '<html><body></body></html>'
+    # >>> cleaner.clean_html('<body><div/></body>')
+    # '<div></div>'
+    # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
+    # '<html><body><div></div><br></body></html>'
+    # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
+    # '<html><body><div></div><br><span></span></body></html>'
+    #
+    # using that, we'll miss most actual validation error we want to
+    # catch. For now, use dumb regexp
+    return _REM_SCRIPT_RGX.sub('', data)
 
-    def parse_string(self, data, sysid=None):
+
+class Validator(object):
+    """ base validator API """
+    parser = None
+
+    def parse_string(self, source):
+        etree = self._parse(self.preprocess_data(source))
+        return PageInfo(source, etree)
+
+    def preprocess_data(self, data):
+        return data
+
+    def _parse(self, pdata):
         try:
-            data = self.preprocess_data(data)
-            return PageInfo(data, etree.fromstring(data, self.parser))
+            return etree.fromstring(pdata, self.parser)
         except etree.XMLSyntaxError as exc:
             def save_in(fname=''):
                 file(fname, 'w').write(data)
-            new_exc = AssertionError(u'invalid xml %s' % exc)
+            new_exc = AssertionError(u'invalid document: %s' % exc)
             new_exc.position = exc.position
             raise new_exc
 
-    def preprocess_data(self, data):
-        return data
-
 
 class DTDValidator(Validator):
     def __init__(self):
@@ -60,7 +107,7 @@
             return data
         # parse using transitional DTD
         data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
-        tree = etree.fromstring(data, self.parser)
+        tree = self._parse(data)
         namespace = tree.nsmap.get(None)
         # this is the list of authorized child tags for <blockquote> nodes
         expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
@@ -79,20 +126,64 @@
             STRICT_DOCTYPE, data)
 
 
-class SaxOnlyValidator(Validator):
+class XMLValidator(Validator):
+    """XML validator, checks that XML is well-formed and used XMLNS are defined"""
 
     def __init__(self):
         Validator.__init__(self)
         self.parser = etree.XMLParser()
 
+SaxOnlyValidator = class_renamed('SaxOnlyValidator',
+                                 XMLValidator,
+                                 '[3.17] you should use the '
+                                 'XMLValidator class instead')
 
-class XMLDemotingValidator(SaxOnlyValidator):
+
+class XMLSyntaxValidator(Validator):
+    """XML syntax validator, check XML is well-formed"""
+
+    class MySaxErrorHandler(sax.ErrorHandler):
+        """override default handler to avoid choking because of unknown entity"""
+        def fatalError(self, exception):
+            # XXX check entity in htmlentitydefs
+            if not str(exception).endswith('undefined entity'):
+                raise exception
+    _parser = sax.make_parser()
+    _parser.setContentHandler(sax.handler.ContentHandler())
+    _parser.setErrorHandler(MySaxErrorHandler())
+
+    def __init__(self):
+        super(XMLSyntaxValidator, self).__init__()
+        # XMLParser() wants xml namespaces defined
+        # XMLParser(recover=True) will accept almost anything
+        #
+        # -> use the later but preprocess will check xml well-formness using a
+        #    dumb SAX parser
+        self.parser = etree.XMLParser(recover=True)
+
+    def preprocess_data(self, data):
+        return _remove_script_tags(data)
+
+    def _parse(self, data):
+        inpsrc = sax.InputSource()
+        inpsrc.setByteStream(StringIO(data))
+        try:
+            self._parser.parse(inpsrc)
+        except sax.SAXParseException, exc:
+            new_exc = AssertionError(u'invalid document: %s' % exc)
+            new_exc.position = (exc._linenum, exc._colnum)
+            raise new_exc
+        return super(XMLSyntaxValidator, self)._parse(data)
+
+
+class XMLDemotingValidator(XMLValidator):
     """ some views produce html instead of xhtml, using demote_to_html
 
     this is typically related to the use of external dependencies
     which do not produce valid xhtml (google maps, ...)
     """
     __metaclass__ = class_deprecated
+    __deprecation_warning__ = '[3.10] this is now handled in testlib.py'
 
     def preprocess_data(self, data):
         if data.startswith('<?xml'):
@@ -106,8 +197,10 @@
 
     def __init__(self):
         Validator.__init__(self)
-        self.parser = etree.HTMLParser()
+        self.parser = etree.HTMLParser(recover=False)
 
+    def preprocess_data(self, data):
+        return _remove_script_tags(data)
 
 
 class PageInfo(object):
@@ -115,7 +208,6 @@
     def __init__(self, source, root):
         self.source = source
         self.etree = root
-        self.source = source
         self.raw_text = u''.join(root.xpath('//text()'))
         self.namespace = self.etree.nsmap
         self.default_ns = self.namespace.get(None)
@@ -234,4 +326,8 @@
                     continue
         return False
 
-VALMAP = {None: None, 'dtd': DTDValidator, 'xml': SaxOnlyValidator}
+VALMAP = {None: None,
+          'dtd': DTDValidator,
+          'xml': XMLValidator,
+          'html': HTMLValidator,
+          }