cubicweb/devtools/htmlparser.py
changeset 11057 0b59724cb3f2
parent 10696 4ba4be5553cf
child 12505 55014a79b2a5
equal deleted inserted replaced
11052:058bb3dc685f 11057:0b59724cb3f2
       
     1 # copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
       
     2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
       
     3 #
       
     4 # This file is part of CubicWeb.
       
     5 #
       
     6 # CubicWeb is free software: you can redistribute it and/or modify it under the
       
     7 # terms of the GNU Lesser General Public License as published by the Free
       
     8 # Software Foundation, either version 2.1 of the License, or (at your option)
       
     9 # any later version.
       
    10 #
       
    11 # CubicWeb is distributed in the hope that it will be useful, but WITHOUT
       
    12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
       
    13 # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
       
    14 # details.
       
    15 #
       
    16 # You should have received a copy of the GNU Lesser General Public License along
       
    17 # with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
       
    18 """defines a validating HTML parser used in web application tests"""
       
    19 
       
    20 import re
       
    21 import sys
       
    22 from xml import sax
       
    23 from io import BytesIO
       
    24 
       
    25 from lxml import etree
       
    26 
       
    27 from logilab.common.deprecation import class_deprecated, class_renamed
       
    28 
       
    29 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
       
    30 
       
    31 STRICT_DOCTYPE = str(STRICT_DOCTYPE)
       
    32 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
       
    33 
       
    34 ERR_COUNT = 0
       
    35 
       
    36 _REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M|re.I|re.S)
       
    37 def _remove_script_tags(data):
       
    38     """Remove the script (usually javascript) tags to help the lxml
       
    39     XMLParser / HTMLParser do their job. Without that, they choke on
       
    40     tags embedded in JS strings.
       
    41     """
       
    42     # Notice we may want to use lxml cleaner, but it's far too intrusive:
       
    43     #
       
    44     # cleaner = Cleaner(scripts=True,
       
    45     #                   javascript=False,
       
    46     #                   comments=False,
       
    47     #                   style=False,
       
    48     #                   links=False,
       
    49     #                   meta=False,
       
    50     #                   page_structure=False,
       
    51     #                   processing_instructions=False,
       
    52     #                   embedded=False,
       
    53     #                   frames=False,
       
    54     #                   forms=False,
       
    55     #                   annoying_tags=False,
       
    56     #                   remove_tags=(),
       
    57     #                   remove_unknown_tags=False,
       
    58     #                   safe_attrs_only=False,
       
    59     #                   add_nofollow=False)
       
    60     # >>> cleaner.clean_html('<body></body>')
       
    61     # '<span></span>'
       
    62     # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
       
    63     # '<html><body></body></html>'
       
    64     # >>> cleaner.clean_html('<body><div/></body>')
       
    65     # '<div></div>'
       
    66     # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
       
    67     # '<html><body><div></div><br></body></html>'
       
    68     # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
       
    69     # '<html><body><div></div><br><span></span></body></html>'
       
    70     #
       
    71     # using that, we'll miss most actual validation error we want to
       
    72     # catch. For now, use dumb regexp
       
    73     return _REM_SCRIPT_RGX.sub(b'', data)
       
    74 
       
    75 
       
    76 class Validator(object):
       
    77     """ base validator API """
       
    78     parser = None
       
    79 
       
    80     def parse_string(self, source):
       
    81         etree = self._parse(self.preprocess_data(source))
       
    82         return PageInfo(source, etree)
       
    83 
       
    84     def preprocess_data(self, data):
       
    85         return data
       
    86 
       
    87     def _parse(self, pdata):
       
    88         try:
       
    89             return etree.fromstring(pdata, self.parser)
       
    90         except etree.XMLSyntaxError as exc:
       
    91             new_exc = AssertionError(u'invalid document: %s' % exc)
       
    92             new_exc.position = exc.position
       
    93             raise new_exc
       
    94 
       
    95 
       
    96 class DTDValidator(Validator):
       
    97     def __init__(self):
       
    98         Validator.__init__(self)
       
    99         # XXX understand what's happening under windows
       
   100         self.parser = etree.XMLParser(dtd_validation=sys.platform != 'win32')
       
   101 
       
   102     def preprocess_data(self, data):
       
   103         """used to fix potential blockquote mess generated by docutils"""
       
   104         if STRICT_DOCTYPE not in data:
       
   105             return data
       
   106         # parse using transitional DTD
       
   107         data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
       
   108         tree = self._parse(data)
       
   109         namespace = tree.nsmap.get(None)
       
   110         # this is the list of authorized child tags for <blockquote> nodes
       
   111         expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
       
   112                    'fieldset table form noscript ins del script'.split()
       
   113         if namespace:
       
   114             blockquotes = tree.findall('.//{%s}blockquote' % namespace)
       
   115             expected = ['{%s}%s' % (namespace, tag) for tag in expected]
       
   116         else:
       
   117             blockquotes = tree.findall('.//blockquote')
       
   118         # quick and dirty approach: remove all blockquotes
       
   119         for blockquote in blockquotes:
       
   120             parent = blockquote.getparent()
       
   121             parent.remove(blockquote)
       
   122         data = etree.tostring(tree)
       
   123         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
       
   124             STRICT_DOCTYPE, data)
       
   125 
       
   126 
       
   127 class XMLValidator(Validator):
       
   128     """XML validator, checks that XML is well-formed and used XMLNS are defined"""
       
   129 
       
   130     def __init__(self):
       
   131         Validator.__init__(self)
       
   132         self.parser = etree.XMLParser()
       
   133 
       
   134 SaxOnlyValidator = class_renamed('SaxOnlyValidator',
       
   135                                  XMLValidator,
       
   136                                  '[3.17] you should use the '
       
   137                                  'XMLValidator class instead')
       
   138 
       
   139 
       
   140 class XMLSyntaxValidator(Validator):
       
   141     """XML syntax validator, check XML is well-formed"""
       
   142 
       
   143     class MySaxErrorHandler(sax.ErrorHandler):
       
   144         """override default handler to avoid choking because of unknown entity"""
       
   145         def fatalError(self, exception):
       
   146             # XXX check entity in htmlentitydefs
       
   147             if not str(exception).endswith('undefined entity'):
       
   148                 raise exception
       
   149     _parser = sax.make_parser()
       
   150     _parser.setContentHandler(sax.handler.ContentHandler())
       
   151     _parser.setErrorHandler(MySaxErrorHandler())
       
   152 
       
   153     def __init__(self):
       
   154         super(XMLSyntaxValidator, self).__init__()
       
   155         # XMLParser() wants xml namespaces defined
       
   156         # XMLParser(recover=True) will accept almost anything
       
   157         #
       
   158         # -> use the later but preprocess will check xml well-formness using a
       
   159         #    dumb SAX parser
       
   160         self.parser = etree.XMLParser(recover=True)
       
   161 
       
   162     def preprocess_data(self, data):
       
   163         return _remove_script_tags(data)
       
   164 
       
   165     def _parse(self, data):
       
   166         inpsrc = sax.InputSource()
       
   167         inpsrc.setByteStream(BytesIO(data))
       
   168         try:
       
   169             self._parser.parse(inpsrc)
       
   170         except sax.SAXParseException as exc:
       
   171             new_exc = AssertionError(u'invalid document: %s' % exc)
       
   172             new_exc.position = (exc._linenum, exc._colnum)
       
   173             raise new_exc
       
   174         return super(XMLSyntaxValidator, self)._parse(data)
       
   175 
       
   176 
       
   177 class HTMLValidator(Validator):
       
   178 
       
   179     def __init__(self):
       
   180         Validator.__init__(self)
       
   181         self.parser = etree.HTMLParser(recover=False)
       
   182 
       
   183     def preprocess_data(self, data):
       
   184         return _remove_script_tags(data)
       
   185 
       
   186 
       
   187 class PageInfo(object):
       
   188     """holds various informations on the view's output"""
       
   189     def __init__(self, source, root):
       
   190         self.source = source
       
   191         self.etree = root
       
   192         self.raw_text = u''.join(root.xpath('//text()'))
       
   193         self.namespace = self.etree.nsmap
       
   194         self.default_ns = self.namespace.get(None)
       
   195         self.a_tags = self.find_tag('a')
       
   196         self.h1_tags = self.find_tag('h1')
       
   197         self.h2_tags = self.find_tag('h2')
       
   198         self.h3_tags = self.find_tag('h3')
       
   199         self.h4_tags = self.find_tag('h4')
       
   200         self.input_tags = self.find_tag('input')
       
   201         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
       
   202 
       
   203     def _iterstr(self, tag):
       
   204         if self.default_ns is None:
       
   205             return ".//%s" % tag
       
   206         else:
       
   207             return ".//{%s}%s" % (self.default_ns, tag)
       
   208 
       
   209     def matching_nodes(self, tag, **attrs):
       
   210         for elt in self.etree.iterfind(self._iterstr(tag)):
       
   211             eltattrs  = elt.attrib
       
   212             for attr, value in attrs.items():
       
   213                 try:
       
   214                     if eltattrs[attr] != value:
       
   215                         break
       
   216                 except KeyError:
       
   217                     break
       
   218             else: # all attributes match
       
   219                 yield elt
       
   220 
       
   221     def has_tag(self, tag, nboccurs=1, **attrs):
       
   222         """returns True if tag with given attributes appears in the page
       
   223         `nbtimes` (any if None)
       
   224         """
       
   225         for elt in self.matching_nodes(tag, **attrs):
       
   226             if nboccurs is None: # no need to check number of occurences
       
   227                 return True
       
   228             if not nboccurs: # too much occurences
       
   229                 return False
       
   230             nboccurs -= 1
       
   231         if nboccurs == 0: # correct number of occurences
       
   232             return True
       
   233         return False # no matching tag/attrs
       
   234 
       
   235     def find_tag(self, tag, gettext=True):
       
   236         """return a list which contains text of all "tag" elements """
       
   237         iterstr = self._iterstr(tag)
       
   238         if not gettext or tag in ('a', 'input'):
       
   239             return [(elt.text, elt.attrib)
       
   240                     for elt in self.etree.iterfind(iterstr)]
       
   241         return [u''.join(elt.xpath('.//text()'))
       
   242                 for elt in self.etree.iterfind(iterstr)]
       
   243 
       
   244     def appears(self, text):
       
   245         """returns True if <text> appears in the page"""
       
   246         return text in self.raw_text
       
   247 
       
   248     def __contains__(self, text):
       
   249         return text in self.source
       
   250 
       
   251     def has_title(self, text, level=None):
       
   252         """returns True if <h?>text</h?>
       
   253 
       
   254         :param level: the title's level (1 for h1, 2 for h2, etc.)
       
   255         """
       
   256         if level is None:
       
   257             for hlist in self.title_tags:
       
   258                 if text in hlist:
       
   259                     return True
       
   260             return False
       
   261         else:
       
   262             hlist = self.title_tags[level - 1]
       
   263             return text in hlist
       
   264 
       
   265     def has_title_regexp(self, pattern, level=None):
       
   266         """returns True if <h?>pattern</h?>"""
       
   267         sre = re.compile(pattern)
       
   268         if level is None:
       
   269             for hlist in self.title_tags:
       
   270                 for title in hlist:
       
   271                     if sre.match(title):
       
   272                         return True
       
   273             return False
       
   274         else:
       
   275             hlist = self.title_tags[level - 1]
       
   276             for title in hlist:
       
   277                 if sre.match(title):
       
   278                     return True
       
   279             return False
       
   280 
       
   281     def has_link(self, text, url=None):
       
   282         """returns True if <a href=url>text</a> was found in the page"""
       
   283         for link_text, attrs in self.a_tags:
       
   284             if text == link_text:
       
   285                 if url is None:
       
   286                     return True
       
   287                 try:
       
   288                     href = attrs['href']
       
   289                     if href == url:
       
   290                         return True
       
   291                 except KeyError:
       
   292                     continue
       
   293         return False
       
   294 
       
   295     def has_link_regexp(self, pattern, url=None):
       
   296         """returns True if <a href=url>pattern</a> was found in the page"""
       
   297         sre = re.compile(pattern)
       
   298         for link_text, attrs in self.a_tags:
       
   299             if sre.match(link_text):
       
   300                 if url is None:
       
   301                     return True
       
   302                 try:
       
   303                     href = attrs['href']
       
   304                     if href == url:
       
   305                         return True
       
   306                 except KeyError:
       
   307                     continue
       
   308         return False
       
   309 
       
   310 VALMAP = {None: None,
       
   311           'dtd': DTDValidator,
       
   312           'xml': XMLValidator,
       
   313           'html': HTMLValidator,
       
   314           }