devtools/htmlparser.py
changeset 11057 0b59724cb3f2
parent 11052 058bb3dc685f
child 11058 23eb30449fe5
--- a/devtools/htmlparser.py	Mon Jan 04 18:40:30 2016 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,314 +0,0 @@
-# copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
-#
-# This file is part of CubicWeb.
-#
-# CubicWeb is free software: you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
-"""defines a validating HTML parser used in web application tests"""
-
-import re
-import sys
-from xml import sax
-from io import BytesIO
-
-from lxml import etree
-
-from logilab.common.deprecation import class_deprecated, class_renamed
-
-from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
-
-STRICT_DOCTYPE = str(STRICT_DOCTYPE)
-TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
-
-ERR_COUNT = 0
-
-_REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M|re.I|re.S)
-def _remove_script_tags(data):
-    """Remove the script (usually javascript) tags to help the lxml
-    XMLParser / HTMLParser do their job. Without that, they choke on
-    tags embedded in JS strings.
-    """
-    # Notice we may want to use lxml cleaner, but it's far too intrusive:
-    #
-    # cleaner = Cleaner(scripts=True,
-    #                   javascript=False,
-    #                   comments=False,
-    #                   style=False,
-    #                   links=False,
-    #                   meta=False,
-    #                   page_structure=False,
-    #                   processing_instructions=False,
-    #                   embedded=False,
-    #                   frames=False,
-    #                   forms=False,
-    #                   annoying_tags=False,
-    #                   remove_tags=(),
-    #                   remove_unknown_tags=False,
-    #                   safe_attrs_only=False,
-    #                   add_nofollow=False)
-    # >>> cleaner.clean_html('<body></body>')
-    # '<span></span>'
-    # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
-    # '<html><body></body></html>'
-    # >>> cleaner.clean_html('<body><div/></body>')
-    # '<div></div>'
-    # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
-    # '<html><body><div></div><br></body></html>'
-    # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
-    # '<html><body><div></div><br><span></span></body></html>'
-    #
-    # using that, we'll miss most actual validation error we want to
-    # catch. For now, use dumb regexp
-    return _REM_SCRIPT_RGX.sub(b'', data)
-
-
-class Validator(object):
-    """ base validator API """
-    parser = None
-
-    def parse_string(self, source):
-        etree = self._parse(self.preprocess_data(source))
-        return PageInfo(source, etree)
-
-    def preprocess_data(self, data):
-        return data
-
-    def _parse(self, pdata):
-        try:
-            return etree.fromstring(pdata, self.parser)
-        except etree.XMLSyntaxError as exc:
-            new_exc = AssertionError(u'invalid document: %s' % exc)
-            new_exc.position = exc.position
-            raise new_exc
-
-
-class DTDValidator(Validator):
-    def __init__(self):
-        Validator.__init__(self)
-        # XXX understand what's happening under windows
-        self.parser = etree.XMLParser(dtd_validation=sys.platform != 'win32')
-
-    def preprocess_data(self, data):
-        """used to fix potential blockquote mess generated by docutils"""
-        if STRICT_DOCTYPE not in data:
-            return data
-        # parse using transitional DTD
-        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
-        tree = self._parse(data)
-        namespace = tree.nsmap.get(None)
-        # this is the list of authorized child tags for <blockquote> nodes
-        expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
-                   'fieldset table form noscript ins del script'.split()
-        if namespace:
-            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
-            expected = ['{%s}%s' % (namespace, tag) for tag in expected]
-        else:
-            blockquotes = tree.findall('.//blockquote')
-        # quick and dirty approach: remove all blockquotes
-        for blockquote in blockquotes:
-            parent = blockquote.getparent()
-            parent.remove(blockquote)
-        data = etree.tostring(tree)
-        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
-            STRICT_DOCTYPE, data)
-
-
-class XMLValidator(Validator):
-    """XML validator, checks that XML is well-formed and used XMLNS are defined"""
-
-    def __init__(self):
-        Validator.__init__(self)
-        self.parser = etree.XMLParser()
-
-SaxOnlyValidator = class_renamed('SaxOnlyValidator',
-                                 XMLValidator,
-                                 '[3.17] you should use the '
-                                 'XMLValidator class instead')
-
-
-class XMLSyntaxValidator(Validator):
-    """XML syntax validator, check XML is well-formed"""
-
-    class MySaxErrorHandler(sax.ErrorHandler):
-        """override default handler to avoid choking because of unknown entity"""
-        def fatalError(self, exception):
-            # XXX check entity in htmlentitydefs
-            if not str(exception).endswith('undefined entity'):
-                raise exception
-    _parser = sax.make_parser()
-    _parser.setContentHandler(sax.handler.ContentHandler())
-    _parser.setErrorHandler(MySaxErrorHandler())
-
-    def __init__(self):
-        super(XMLSyntaxValidator, self).__init__()
-        # XMLParser() wants xml namespaces defined
-        # XMLParser(recover=True) will accept almost anything
-        #
-        # -> use the later but preprocess will check xml well-formness using a
-        #    dumb SAX parser
-        self.parser = etree.XMLParser(recover=True)
-
-    def preprocess_data(self, data):
-        return _remove_script_tags(data)
-
-    def _parse(self, data):
-        inpsrc = sax.InputSource()
-        inpsrc.setByteStream(BytesIO(data))
-        try:
-            self._parser.parse(inpsrc)
-        except sax.SAXParseException as exc:
-            new_exc = AssertionError(u'invalid document: %s' % exc)
-            new_exc.position = (exc._linenum, exc._colnum)
-            raise new_exc
-        return super(XMLSyntaxValidator, self)._parse(data)
-
-
-class HTMLValidator(Validator):
-
-    def __init__(self):
-        Validator.__init__(self)
-        self.parser = etree.HTMLParser(recover=False)
-
-    def preprocess_data(self, data):
-        return _remove_script_tags(data)
-
-
-class PageInfo(object):
-    """holds various informations on the view's output"""
-    def __init__(self, source, root):
-        self.source = source
-        self.etree = root
-        self.raw_text = u''.join(root.xpath('//text()'))
-        self.namespace = self.etree.nsmap
-        self.default_ns = self.namespace.get(None)
-        self.a_tags = self.find_tag('a')
-        self.h1_tags = self.find_tag('h1')
-        self.h2_tags = self.find_tag('h2')
-        self.h3_tags = self.find_tag('h3')
-        self.h4_tags = self.find_tag('h4')
-        self.input_tags = self.find_tag('input')
-        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
-
-    def _iterstr(self, tag):
-        if self.default_ns is None:
-            return ".//%s" % tag
-        else:
-            return ".//{%s}%s" % (self.default_ns, tag)
-
-    def matching_nodes(self, tag, **attrs):
-        for elt in self.etree.iterfind(self._iterstr(tag)):
-            eltattrs  = elt.attrib
-            for attr, value in attrs.items():
-                try:
-                    if eltattrs[attr] != value:
-                        break
-                except KeyError:
-                    break
-            else: # all attributes match
-                yield elt
-
-    def has_tag(self, tag, nboccurs=1, **attrs):
-        """returns True if tag with given attributes appears in the page
-        `nbtimes` (any if None)
-        """
-        for elt in self.matching_nodes(tag, **attrs):
-            if nboccurs is None: # no need to check number of occurences
-                return True
-            if not nboccurs: # too much occurences
-                return False
-            nboccurs -= 1
-        if nboccurs == 0: # correct number of occurences
-            return True
-        return False # no matching tag/attrs
-
-    def find_tag(self, tag, gettext=True):
-        """return a list which contains text of all "tag" elements """
-        iterstr = self._iterstr(tag)
-        if not gettext or tag in ('a', 'input'):
-            return [(elt.text, elt.attrib)
-                    for elt in self.etree.iterfind(iterstr)]
-        return [u''.join(elt.xpath('.//text()'))
-                for elt in self.etree.iterfind(iterstr)]
-
-    def appears(self, text):
-        """returns True if <text> appears in the page"""
-        return text in self.raw_text
-
-    def __contains__(self, text):
-        return text in self.source
-
-    def has_title(self, text, level=None):
-        """returns True if <h?>text</h?>
-
-        :param level: the title's level (1 for h1, 2 for h2, etc.)
-        """
-        if level is None:
-            for hlist in self.title_tags:
-                if text in hlist:
-                    return True
-            return False
-        else:
-            hlist = self.title_tags[level - 1]
-            return text in hlist
-
-    def has_title_regexp(self, pattern, level=None):
-        """returns True if <h?>pattern</h?>"""
-        sre = re.compile(pattern)
-        if level is None:
-            for hlist in self.title_tags:
-                for title in hlist:
-                    if sre.match(title):
-                        return True
-            return False
-        else:
-            hlist = self.title_tags[level - 1]
-            for title in hlist:
-                if sre.match(title):
-                    return True
-            return False
-
-    def has_link(self, text, url=None):
-        """returns True if <a href=url>text</a> was found in the page"""
-        for link_text, attrs in self.a_tags:
-            if text == link_text:
-                if url is None:
-                    return True
-                try:
-                    href = attrs['href']
-                    if href == url:
-                        return True
-                except KeyError:
-                    continue
-        return False
-
-    def has_link_regexp(self, pattern, url=None):
-        """returns True if <a href=url>pattern</a> was found in the page"""
-        sre = re.compile(pattern)
-        for link_text, attrs in self.a_tags:
-            if sre.match(link_text):
-                if url is None:
-                    return True
-                try:
-                    href = attrs['href']
-                    if href == url:
-                        return True
-                except KeyError:
-                    continue
-        return False
-
-VALMAP = {None: None,
-          'dtd': DTDValidator,
-          'xml': XMLValidator,
-          'html': HTMLValidator,
-          }