devtools/htmlparser.py
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Mon, 06 May 2013 16:05:51 +0200
changeset 8972 771337c3a754
parent 8951 5cfc0387d63f
child 8973 6711f78c18be
permissions -rw-r--r--
[testlib] update htmlparsers.VALMAP: stop using SaxOnlyValidator and add an entry for html

# copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of CubicWeb.
#
# CubicWeb is free software: you can redistribute it and/or modify it under the
# terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
"""defines a validating HTML parser used in web application tests"""

import re
import sys

from lxml import etree

from logilab.common.deprecation import class_deprecated, class_renamed

from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE

STRICT_DOCTYPE = str(STRICT_DOCTYPE)
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)

ERR_COUNT = 0

class Validator(object):

    def parse_string(self, data, sysid=None):
        try:
            return PageInfo(self, data)
        except etree.XMLSyntaxError as exc:
            def save_in(fname=''):
                file(fname, 'w').write(data)
            new_exc = AssertionError(u'invalid document: %s' % exc)
            new_exc.position = exc.position
            raise new_exc

    def preprocess_data(self, data):
        return data


class DTDValidator(Validator):
    def __init__(self):
        Validator.__init__(self)
        # XXX understand what's happening under windows
        self.parser = etree.XMLParser(dtd_validation=sys.platform != 'win32')

    def preprocess_data(self, data):
        """used to fix potential blockquote mess generated by docutils"""
        if STRICT_DOCTYPE not in data:
            return data
        # parse using transitional DTD
        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
        tree = etree.fromstring(data, self.parser)
        namespace = tree.nsmap.get(None)
        # this is the list of authorized child tags for <blockquote> nodes
        expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
                   'fieldset table form noscript ins del script'.split()
        if namespace:
            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
            expected = ['{%s}%s' % (namespace, tag) for tag in expected]
        else:
            blockquotes = tree.findall('.//blockquote')
        # quick and dirty approach: remove all blockquotes
        for blockquote in blockquotes:
            parent = blockquote.getparent()
            parent.remove(blockquote)
        data = etree.tostring(tree)
        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
            STRICT_DOCTYPE, data)


class XMLValidator(Validator):
    """ A fully compliant XML parser """

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser()

SaxOnlyValidator = class_renamed('SaxOnlyValidator',
                                 XMLValidator,
                                 '[3.17] you should use the '
                                 'XMLValidator class instead')

class XMLDemotingValidator(XMLValidator):
    """ some views produce html instead of xhtml, using demote_to_html

    this is typically related to the use of external dependencies
    which do not produce valid xhtml (google maps, ...)
    """
    __metaclass__ = class_deprecated
    __deprecation_warning__ = '[3.10] this is now handled in testlib.py'

    def preprocess_data(self, data):
        if data.startswith('<?xml'):
            self.parser = etree.XMLParser()
        else:
            self.parser = etree.HTMLParser()
        return data


REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)

class HTMLValidator(Validator):

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.HTMLParser(recover=False)

    def preprocess_data(self, data):
        """ Here we essentially wipe the javascript tags to help the HTMLParser
        do its job. Without that, it chokes on tags embedded in JS strings.
        """
        # Notice we may want to use lxml cleaner, but it's far too intrusive:
        #
        # cleaner = Cleaner(scripts=True,
        #                   javascript=False,
        #                   comments=False,
        #                   style=False,
        #                   links=False,
        #                   meta=False,
        #                   page_structure=False,
        #                   processing_instructions=False,
        #                   embedded=False,
        #                   frames=False,
        #                   forms=False,
        #                   annoying_tags=False,
        #                   remove_tags=(),
        #                   remove_unknown_tags=False,
        #                   safe_attrs_only=False,
        #                   add_nofollow=False)
        # >>> cleaner.clean_html('<body></body>')
        # '<span></span>'
        # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
        # '<html><body></body></html>'
        # >>> cleaner.clean_html('<body><div/></body>')
        # '<div></div>'
        # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
        # '<html><body><div></div><br></body></html>'
        # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
        # '<html><body><div></div><br><span></span></body></html>'
        #
        # using that, we'll miss most actual validation error we want to
        # catch. For now, use dumb regexp
        return REM_SCRIPT_RGX.sub('', data)


class PageInfo(object):
    """holds various informations on the view's output"""
    def __init__(self, validator, source):
        self.source = source
        root = etree.fromstring(validator.preprocess_data(source), validator.parser)
        self.etree = root
        self.source = source
        self.raw_text = u''.join(root.xpath('//text()'))
        self.namespace = self.etree.nsmap
        self.default_ns = self.namespace.get(None)
        self.a_tags = self.find_tag('a')
        self.h1_tags = self.find_tag('h1')
        self.h2_tags = self.find_tag('h2')
        self.h3_tags = self.find_tag('h3')
        self.h4_tags = self.find_tag('h4')
        self.input_tags = self.find_tag('input')
        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]

    def _iterstr(self, tag):
        if self.default_ns is None:
            return ".//%s" % tag
        else:
            return ".//{%s}%s" % (self.default_ns, tag)

    def matching_nodes(self, tag, **attrs):
        for elt in self.etree.iterfind(self._iterstr(tag)):
            eltattrs  = elt.attrib
            for attr, value in attrs.iteritems():
                try:
                    if eltattrs[attr] != value:
                        break
                except KeyError:
                    break
            else: # all attributes match
                yield elt

    def has_tag(self, tag, nboccurs=1, **attrs):
        """returns True if tag with given attributes appears in the page
        `nbtimes` (any if None)
        """
        for elt in self.matching_nodes(tag, **attrs):
            if nboccurs is None: # no need to check number of occurences
                return True
            if not nboccurs: # too much occurences
                return False
            nboccurs -= 1
        if nboccurs == 0: # correct number of occurences
            return True
        return False # no matching tag/attrs

    def find_tag(self, tag, gettext=True):
        """return a list which contains text of all "tag" elements """
        iterstr = self._iterstr(tag)
        if not gettext or tag in ('a', 'input'):
            return [(elt.text, elt.attrib)
                    for elt in self.etree.iterfind(iterstr)]
        return [u''.join(elt.xpath('.//text()'))
                for elt in self.etree.iterfind(iterstr)]

    def appears(self, text):
        """returns True if <text> appears in the page"""
        return text in self.raw_text

    def __contains__(self, text):
        return text in self.source

    def has_title(self, text, level=None):
        """returns True if <h?>text</h?>

        :param level: the title's level (1 for h1, 2 for h2, etc.)
        """
        if level is None:
            for hlist in self.title_tags:
                if text in hlist:
                    return True
            return False
        else:
            hlist = self.title_tags[level - 1]
            return text in hlist

    def has_title_regexp(self, pattern, level=None):
        """returns True if <h?>pattern</h?>"""
        sre = re.compile(pattern)
        if level is None:
            for hlist in self.title_tags:
                for title in hlist:
                    if sre.match(title):
                        return True
            return False
        else:
            hlist = self.title_tags[level - 1]
            for title in hlist:
                if sre.match(title):
                    return True
            return False

    def has_link(self, text, url=None):
        """returns True if <a href=url>text</a> was found in the page"""
        for link_text, attrs in self.a_tags:
            if text == link_text:
                if url is None:
                    return True
                try:
                    href = attrs['href']
                    if href == url:
                        return True
                except KeyError:
                    continue
        return False

    def has_link_regexp(self, pattern, url=None):
        """returns True if <a href=url>pattern</a> was found in the page"""
        sre = re.compile(pattern)
        for link_text, attrs in self.a_tags:
            if sre.match(link_text):
                if url is None:
                    return True
                try:
                    href = attrs['href']
                    if href == url:
                        return True
                except KeyError:
                    continue
        return False

VALMAP = {None: None,
          'dtd': DTDValidator,
          'xml': XMLValidator,
          'html': HTMLValidator,
          }