devtools/htmlparser.py
changeset 8977 57e564c0118e
parent 8973 6711f78c18be
child 8979 8f5416b1562a

--- a/devtools/htmlparser.py
+++ b/devtools/htmlparser.py
@@ -17,10 +17,12 @@
 # with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
 """defines a validating HTML parser used in web application tests"""
 
 import re
 import sys
+from xml import sax
+from cStringIO import StringIO
 
 from lxml import etree
 
 from logilab.common.deprecation import class_deprecated, class_renamed
 
@@ -29,11 +31,52 @@
 STRICT_DOCTYPE = str(STRICT_DOCTYPE)
 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
 
 ERR_COUNT = 0
 
+_REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
+def _remove_script_tags(data):
+    """Remove the script (usually javascript) tags to help the lxml
+    XMLParser / HTMLParser do their job. Without that, they choke on
+    tags embedded in JS strings.
+    """
+    # Notice we may want to use lxml cleaner, but it's far too intrusive:
+    #
+    # cleaner = Cleaner(scripts=True,
+    #                   javascript=False,
+    #                   comments=False,
+    #                   style=False,
+    #                   links=False,
+    #                   meta=False,
+    #                   page_structure=False,
+    #                   processing_instructions=False,
+    #                   embedded=False,
+    #                   frames=False,
+    #                   forms=False,
+    #                   annoying_tags=False,
+    #                   remove_tags=(),
+    #                   remove_unknown_tags=False,
+    #                   safe_attrs_only=False,
+    #                   add_nofollow=False)
+    # >>> cleaner.clean_html('<body></body>')
+    # '<span></span>'
+    # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
+    # '<html><body></body></html>'
+    # >>> cleaner.clean_html('<body><div/></body>')
+    # '<div></div>'
+    # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
+    # '<html><body><div></div><br></body></html>'
+    # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
+    # '<html><body><div></div><br><span></span></body></html>'
+    #
+    # using that, we'll miss most actual validation error we want to
+    # catch. For now, use dumb regexp
+    return _REM_SCRIPT_RGX.sub('', data)
+
+
 class Validator(object):
+    """ base validator API """
     parser = None
 
     def parse_string(self, source):
         etree = self._parse(self.preprocess_data(source))
         return PageInfo(source, etree)
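
The hunk above moves the script-stripping regexp into a single module-level helper so every validator can share it. Below is a small standalone sketch, not part of the changeset, showing what the preprocessing does; the sample page string is invented for illustration.

import re

_REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)

def _remove_script_tags(data):
    # drop whole <script>...</script> blocks so that markup-like strings
    # inside javascript never reach the XML/HTML parser
    return _REM_SCRIPT_RGX.sub('', data)

page = '<div><script type="text/javascript">var s = "<p>";</script><p>ok</p></div>'
print _remove_script_tags(page)   # -> <div><p>ok</p></div>
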
@@ -82,20 +125,58 @@
         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
             STRICT_DOCTYPE, data)
 
 
 class XMLValidator(Validator):
-    """ A fully compliant XML parser """
+    """XML validator, checks that XML is well-formed and used XMLNS are defined"""
 
     def __init__(self):
         Validator.__init__(self)
         self.parser = etree.XMLParser()
 
 SaxOnlyValidator = class_renamed('SaxOnlyValidator',
                                  XMLValidator,
                                  '[3.17] you should use the '
                                  'XMLValidator class instead')
+
+
+class XMLSyntaxValidator(Validator):
+    """XML syntax validator, check XML is well-formed"""
+
+    class MySaxErrorHandler(sax.ErrorHandler):
+        """override default handler to avoid choking because of unknown entity"""
+        def fatalError(self, exception):
+            # XXX check entity in htmlentitydefs
+            if not str(exception).endswith('undefined entity'):
+                raise exception
+    _parser = sax.make_parser()
+    _parser.setContentHandler(sax.handler.ContentHandler())
+    _parser.setErrorHandler(MySaxErrorHandler())
+
+    def __init__(self):
+        super(XMLSyntaxValidator, self).__init__()
+        # XMLParser() wants xml namespaces defined
+        # XMLParser(recover=True) will accept almost anything
+        #
+        # -> use the later but preprocess will check xml well-formness using a
+        #    dumb SAX parser
+        self.parser = etree.XMLParser(recover=True)
+
+    def preprocess_data(self, data):
+        return _remove_script_tags(data)
+
+    def _parse(self, data):
+        inpsrc = sax.InputSource()
+        inpsrc.setByteStream(StringIO(data))
+        try:
+            self._parser.parse(inpsrc)
+        except sax.SAXParseException, exc:
+            new_exc = AssertionError(u'invalid document: %s' % exc)
+            new_exc.position = (exc._linenum, exc._colnum)
+            raise new_exc
+        return super(XMLSyntaxValidator, self)._parse(data)
+
 
 class XMLDemotingValidator(XMLValidator):
     """ some views produce html instead of xhtml, using demote_to_html
 
     this is typically related to the use of external dependencies
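
The new XMLSyntaxValidator validates in two passes: a plain SAX parse rejects documents that are not well-formed, then lxml's recovering XMLParser builds the tree even when XML namespaces are missing. The sketch below condenses that idea outside the Validator/PageInfo machinery; the helper name check_well_formed and the sample strings are invented for illustration.

from cStringIO import StringIO
from xml import sax
from lxml import etree

def check_well_formed(data):
    # first pass: a dumb SAX parse raises on malformed input
    # (the class above additionally ignores 'undefined entity' errors)
    parser = sax.make_parser()
    parser.setContentHandler(sax.handler.ContentHandler())
    inpsrc = sax.InputSource()
    inpsrc.setByteStream(StringIO(data))
    try:
        parser.parse(inpsrc)
    except sax.SAXParseException, exc:
        raise AssertionError(u'invalid document: %s' % exc)
    # second pass: the forgiving parser still builds a usable tree
    return etree.fromstring(data, etree.XMLParser(recover=True))

check_well_formed('<div><p>fine</p></div>')   # returns an _Element
check_well_formed('<div><p>oops</div>')       # raises AssertionError
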
@@ -110,54 +191,18 @@
         else:
             self.parser = etree.HTMLParser()
         return data
 
 
-REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
-
 class HTMLValidator(Validator):
 
     def __init__(self):
         Validator.__init__(self)
         self.parser = etree.HTMLParser(recover=False)
 
     def preprocess_data(self, data):
-        """ Here we essentially wipe the javascript tags to help the HTMLParser
-        do its job. Without that, it chokes on tags embedded in JS strings.
-        """
-        # Notice we may want to use lxml cleaner, but it's far too intrusive:
-        #
-        # cleaner = Cleaner(scripts=True,
-        #                   javascript=False,
-        #                   comments=False,
-        #                   style=False,
-        #                   links=False,
-        #                   meta=False,
-        #                   page_structure=False,
-        #                   processing_instructions=False,
-        #                   embedded=False,
-        #                   frames=False,
-        #                   forms=False,
-        #                   annoying_tags=False,
-        #                   remove_tags=(),
-        #                   remove_unknown_tags=False,
-        #                   safe_attrs_only=False,
-        #                   add_nofollow=False)
-        # >>> cleaner.clean_html('<body></body>')
-        # '<span></span>'
-        # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
-        # '<html><body></body></html>'
-        # >>> cleaner.clean_html('<body><div/></body>')
-        # '<div></div>'
-        # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
-        # '<html><body><div></div><br></body></html>'
-        # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
-        # '<html><body><div></div><br><span></span></body></html>'
-        #
-        # using that, we'll miss most actual validation error we want to
-        # catch. For now, use dumb regexp
-        return REM_SCRIPT_RGX.sub('', data)
-
+        return _remove_script_tags(data)
 
 
 class PageInfo(object):
     """holds various informations on the view's output"""
     def __init__(self, source, root):