devtools/htmlparser.py
branch stable
changeset 9013 b4bcabf55e77
parent 8979 8f5416b1562a
child 10006 8391bf718485
equal deleted inserted replaced
9012:2cf127d4f5fd 9013:b4bcabf55e77
     1 # copyright 2003-2010 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
     1 # copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
     2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
     2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
     3 #
     3 #
     4 # This file is part of CubicWeb.
     4 # This file is part of CubicWeb.
     5 #
     5 #
     6 # CubicWeb is free software: you can redistribute it and/or modify it under the
     6 # CubicWeb is free software: you can redistribute it and/or modify it under the
    17 # with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
    17 # with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
    18 """defines a validating HTML parser used in web application tests"""
    18 """defines a validating HTML parser used in web application tests"""
    19 
    19 
    20 import re
    20 import re
    21 import sys
    21 import sys
       
    22 from xml import sax
       
    23 from cStringIO import StringIO
    22 
    24 
    23 from lxml import etree
    25 from lxml import etree
    24 
    26 
    25 from logilab.common.deprecation import class_deprecated
    27 from logilab.common.deprecation import class_deprecated, class_renamed
    26 
    28 
    27 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
    29 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
    28 
    30 
    29 STRICT_DOCTYPE = str(STRICT_DOCTYPE)
    31 STRICT_DOCTYPE = str(STRICT_DOCTYPE)
    30 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
    32 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
    31 
    33 
    32 ERR_COUNT = 0
    34 ERR_COUNT = 0
    33 
    35 
       
    36 _REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
       
    37 def _remove_script_tags(data):
       
    38     """Remove the script (usually javascript) tags to help the lxml
       
    39     XMLParser / HTMLParser do their job. Without that, they choke on
       
    40     tags embedded in JS strings.
       
    41     """
       
    42     # Note: we could use the lxml Cleaner here, but it is far too intrusive:
       
    43     #
       
    44     # cleaner = Cleaner(scripts=True,
       
    45     #                   javascript=False,
       
    46     #                   comments=False,
       
    47     #                   style=False,
       
    48     #                   links=False,
       
    49     #                   meta=False,
       
    50     #                   page_structure=False,
       
    51     #                   processing_instructions=False,
       
    52     #                   embedded=False,
       
    53     #                   frames=False,
       
    54     #                   forms=False,
       
    55     #                   annoying_tags=False,
       
    56     #                   remove_tags=(),
       
    57     #                   remove_unknown_tags=False,
       
    58     #                   safe_attrs_only=False,
       
    59     #                   add_nofollow=False)
       
    60     # >>> cleaner.clean_html('<body></body>')
       
    61     # '<span></span>'
       
    62     # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
       
    63     # '<html><body></body></html>'
       
    64     # >>> cleaner.clean_html('<body><div/></body>')
       
    65     # '<div></div>'
       
    66     # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
       
    67     # '<html><body><div></div><br></body></html>'
       
    68     # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
       
    69     # '<html><body><div></div><br><span></span></body></html>'
       
    70     #
       
    71     # using that, we'll miss most actual validation errors we want to
       
    72     # catch. For now, use dumb regexp
       
    73     return _REM_SCRIPT_RGX.sub('', data)
       
    74 
       
    75 
    34 class Validator(object):
    76 class Validator(object):
    35 
    77     """ base validator API """
    36     def parse_string(self, data, sysid=None):
    78     parser = None
       
    79 
       
    80     def parse_string(self, source):
       
    81         etree = self._parse(self.preprocess_data(source))
       
    82         return PageInfo(source, etree)
       
    83 
       
    84     def preprocess_data(self, data):
       
    85         return data
       
    86 
       
    87     def _parse(self, pdata):
    37         try:
    88         try:
    38             data = self.preprocess_data(data)
    89             return etree.fromstring(pdata, self.parser)
    39             return PageInfo(data, etree.fromstring(data, self.parser))
       
    40         except etree.XMLSyntaxError as exc:
    90         except etree.XMLSyntaxError as exc:
    41             def save_in(fname=''):
    91             def save_in(fname=''):
    42                 file(fname, 'w').write(data)
    92                 file(fname, 'w').write(data)
    43             new_exc = AssertionError(u'invalid xml %s' % exc)
    93             new_exc = AssertionError(u'invalid document: %s' % exc)
    44             new_exc.position = exc.position
    94             new_exc.position = exc.position
    45             raise new_exc
    95             raise new_exc
    46 
       
    47     def preprocess_data(self, data):
       
    48         return data
       
    49 
    96 
    50 
    97 
    51 class DTDValidator(Validator):
    98 class DTDValidator(Validator):
    52     def __init__(self):
    99     def __init__(self):
    53         Validator.__init__(self)
   100         Validator.__init__(self)
    58         """used to fix potential blockquote mess generated by docutils"""
   105         """used to fix potential blockquote mess generated by docutils"""
    59         if STRICT_DOCTYPE not in data:
   106         if STRICT_DOCTYPE not in data:
    60             return data
   107             return data
    61         # parse using transitional DTD
   108         # parse using transitional DTD
    62         data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
   109         data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
    63         tree = etree.fromstring(data, self.parser)
   110         tree = self._parse(data)
    64         namespace = tree.nsmap.get(None)
   111         namespace = tree.nsmap.get(None)
    65         # this is the list of authorized child tags for <blockquote> nodes
   112         # this is the list of authorized child tags for <blockquote> nodes
    66         expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
   113         expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
    67                    'fieldset table form noscript ins del script'.split()
   114                    'fieldset table form noscript ins del script'.split()
    68         if namespace:
   115         if namespace:
    77         data = etree.tostring(tree)
   124         data = etree.tostring(tree)
    78         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
   125         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
    79             STRICT_DOCTYPE, data)
   126             STRICT_DOCTYPE, data)
    80 
   127 
    81 
   128 
    82 class SaxOnlyValidator(Validator):
   129 class XMLValidator(Validator):
       
   130     """XML validator, checks that XML is well-formed and that the XML namespaces it uses are defined"""
    83 
   131 
    84     def __init__(self):
   132     def __init__(self):
    85         Validator.__init__(self)
   133         Validator.__init__(self)
    86         self.parser = etree.XMLParser()
   134         self.parser = etree.XMLParser()
    87 
   135 
    88 
   136 SaxOnlyValidator = class_renamed('SaxOnlyValidator',
    89 class XMLDemotingValidator(SaxOnlyValidator):
   137                                  XMLValidator,
       
   138                                  '[3.17] you should use the '
       
   139                                  'XMLValidator class instead')
       
   140 
       
   141 
       
   142 class XMLSyntaxValidator(Validator):
       
   143     """XML syntax validator, checks that XML is well-formed"""
       
   144 
       
   145     class MySaxErrorHandler(sax.ErrorHandler):
       
   146         """override default handler to avoid choking because of unknown entities"""
       
   147         def fatalError(self, exception):
       
   148             # XXX check entity in htmlentitydefs
       
   149             if not str(exception).endswith('undefined entity'):
       
   150                 raise exception
       
   151     _parser = sax.make_parser()
       
   152     _parser.setContentHandler(sax.handler.ContentHandler())
       
   153     _parser.setErrorHandler(MySaxErrorHandler())
       
   154 
       
   155     def __init__(self):
       
   156         super(XMLSyntaxValidator, self).__init__()
       
   157         # XMLParser() wants xml namespaces defined
       
   158         # XMLParser(recover=True) will accept almost anything
       
   159         #
       
   160         # -> use the latter, but _parse() first checks XML well-formedness using a
       
   161         #    dumb SAX parser
       
   162         self.parser = etree.XMLParser(recover=True)
       
   163 
       
   164     def preprocess_data(self, data):
       
   165         return _remove_script_tags(data)
       
   166 
       
   167     def _parse(self, data):
       
   168         inpsrc = sax.InputSource()
       
   169         inpsrc.setByteStream(StringIO(data))
       
   170         try:
       
   171             self._parser.parse(inpsrc)
       
   172         except sax.SAXParseException, exc:
       
   173             new_exc = AssertionError(u'invalid document: %s' % exc)
       
   174             new_exc.position = (exc._linenum, exc._colnum)
       
   175             raise new_exc
       
   176         return super(XMLSyntaxValidator, self)._parse(data)
       
   177 
       
   178 
       
   179 class XMLDemotingValidator(XMLValidator):
    90     """ some views produce html instead of xhtml, using demote_to_html
   180     """ some views produce html instead of xhtml, using demote_to_html
    91 
   181 
    92     this is typically related to the use of external dependencies
   182     this is typically related to the use of external dependencies
    93     which do not produce valid xhtml (google maps, ...)
   183     which do not produce valid xhtml (google maps, ...)
    94     """
   184     """
    95     __metaclass__ = class_deprecated
   185     __metaclass__ = class_deprecated
       
   186     __deprecation_warning__ = '[3.10] this is now handled in testlib.py'
    96 
   187 
    97     def preprocess_data(self, data):
   188     def preprocess_data(self, data):
    98         if data.startswith('<?xml'):
   189         if data.startswith('<?xml'):
    99             self.parser = etree.XMLParser()
   190             self.parser = etree.XMLParser()
   100         else:
   191         else:
   104 
   195 
   105 class HTMLValidator(Validator):
   196 class HTMLValidator(Validator):
   106 
   197 
   107     def __init__(self):
   198     def __init__(self):
   108         Validator.__init__(self)
   199         Validator.__init__(self)
   109         self.parser = etree.HTMLParser()
   200         self.parser = etree.HTMLParser(recover=False)
   110 
   201 
       
   202     def preprocess_data(self, data):
       
   203         return _remove_script_tags(data)
   111 
   204 
   112 
   205 
   113 class PageInfo(object):
   206 class PageInfo(object):
   114     """holds various information about the view's output"""
   207     """holds various information about the view's output"""
   115     def __init__(self, source, root):
   208     def __init__(self, source, root):
   116         self.source = source
   209         self.source = source
   117         self.etree = root
   210         self.etree = root
   118         self.source = source
       
   119         self.raw_text = u''.join(root.xpath('//text()'))
   211         self.raw_text = u''.join(root.xpath('//text()'))
   120         self.namespace = self.etree.nsmap
   212         self.namespace = self.etree.nsmap
   121         self.default_ns = self.namespace.get(None)
   213         self.default_ns = self.namespace.get(None)
   122         self.a_tags = self.find_tag('a')
   214         self.a_tags = self.find_tag('a')
   123         self.h1_tags = self.find_tag('h1')
   215         self.h1_tags = self.find_tag('h1')
   232                         return True
   324                         return True
   233                 except KeyError:
   325                 except KeyError:
   234                     continue
   326                     continue
   235         return False
   327         return False
   236 
   328 
   237 VALMAP = {None: None, 'dtd': DTDValidator, 'xml': SaxOnlyValidator}
   329 VALMAP = {None: None,
       
   330           'dtd': DTDValidator,
       
   331           'xml': XMLValidator,
       
   332           'html': HTMLValidator,
       
   333           }