"""defines a validating HTML parser used in web application tests

:organization: Logilab
:copyright: 2001-2010 LOGILAB S.A. (Paris, FRANCE), license is LGPL v2.
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
:license: GNU Lesser General Public License, v2.1 - http://www.gnu.org/licenses
"""

import re
import sys

from lxml import etree

from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
STRICT_DOCTYPE = str(STRICT_DOCTYPE)
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)

ERR_COUNT = 0

class Validator(object):

    def parse_string(self, data, sysid=None):
        try:
            data = self.preprocess_data(data)
            return PageInfo(data, etree.fromstring(data, self.parser))
        except etree.XMLSyntaxError, exc:
            def save_in(fname=''):
                # presumably a debugging helper: meant to be called from a
                # post-mortem debugger to dump the data that failed to parse
                file(fname, 'w').write(data)
            new_exc = AssertionError(u'invalid xml %s' % exc)
            new_exc.position = exc.position
            raise new_exc

    def preprocess_data(self, data):
        return data


class DTDValidator(Validator):
    def __init__(self):
        Validator.__init__(self)
        # XXX DTD validation is disabled under windows until we understand
        # what's happening with it on that platform
        validate = sys.platform != 'win32'
        self.parser = etree.XMLParser(dtd_validation=validate)

    def preprocess_data(self, data):
        """used to fix potential blockquote mess generated by docutils"""
        if STRICT_DOCTYPE not in data:
            return data
        # parse using transitional DTD
        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
        tree = etree.fromstring(data, self.parser)
        namespace = tree.nsmap.get(None)
        # this is the list of authorized child tags for <blockquote> nodes
        expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
                   'fieldset table form noscript ins del script'.split()
        if namespace:
            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
            expected = ['{%s}%s' % (namespace, tag) for tag in expected]
        else:
            blockquotes = tree.findall('.//blockquote')
        # quick and dirty approach: remove all blockquotes
        for blockquote in blockquotes:
            parent = blockquote.getparent()
            parent.remove(blockquote)
        data = etree.tostring(tree)
        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
            STRICT_DOCTYPE, data)


class SaxOnlyValidator(Validator):

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser()

class HTMLValidator(Validator):

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.HTMLParser()



class PageInfo(object):
    """holds various informations on the view's output"""
    def __init__(self, source, root):
        self.source = source
        self.etree = root
        self.raw_text = u''.join(root.xpath('//text()'))
        self.namespace = self.etree.nsmap
        self.default_ns = self.namespace.get(None)
        self.a_tags = self.find_tag('a')
        self.h1_tags = self.find_tag('h1')
        self.h2_tags = self.find_tag('h2')
        self.h3_tags = self.find_tag('h3')
        self.h4_tags = self.find_tag('h4')
        self.input_tags = self.find_tag('input')
        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
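        # Illustrative values (hypothetical page, not from the original code):
        # for a page containing '<h1>welcome</h1><a href="/">home</a>',
        # self.h1_tags would be [u'welcome'] and self.a_tags would be
        # [(u'home', {'href': '/'})]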

    def find_tag(self, tag, gettext=True):
        """return a list which contains text of all "tag" elements """
        if self.default_ns is None:
            iterstr = ".//%s" % tag
        else:
            iterstr = ".//{%s}%s" % (self.default_ns, tag)
        if not gettext or tag in ('a', 'input'):
            return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
        return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]

    def appears(self, text):
        """returns True if <text> appears in the page"""
        return text in self.raw_text

    def __contains__(self, text):
        return text in self.source

    def has_title(self, text, level=None):
        """returns True if <h?>text</h?>

        :param level: the title's level (1 for h1, 2 for h2, etc.)
        """
        if level is None:
            for hlist in self.title_tags:
                if text in hlist:
                    return True
            return False
        else:
            hlist = self.title_tags[level - 1]
            return text in hlist

    def has_title_regexp(self, pattern, level=None):
        """returns True if <h?>pattern</h?>"""
        sre = re.compile(pattern)
        if level is None:
            for hlist in self.title_tags:
                for title in hlist:
                    if sre.match(title):
                        return True
            return False
        else:
            hlist = self.title_tags[level - 1]
            for title in hlist:
                if sre.match(title):
                    return True
            return False

    def has_link(self, text, url=None):
        """returns True if <a href=url>text</a> was found in the page"""
        for link_text, attrs in self.a_tags:
            if text == link_text:
                if url is None:
                    return True
                try:
                    href = attrs['href']
                    if href == url:
                        return True
                except KeyError:
                    continue
        return False

    def has_link_regexp(self, pattern, url=None):
        """returns True if <a href=url>pattern</a> was found in the page"""
        sre = re.compile(pattern)
        for link_text, attrs in self.a_tags:
            if sre.match(link_text):
                if url is None:
                    return True
                try:
                    href = attrs['href']
                    if href == url:
                        return True
                except KeyError:
                    continue
        return False

VALMAP = {None: None, 'dtd': DTDValidator, 'xml': SaxOnlyValidator}
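
# A minimal sketch of how VALMAP is presumably used by the test framework
# (hypothetical driver code, not part of the original module): a key selects
# a validator class, None meaning no validation at all.
#
#   validator_class = VALMAP['dtd']
#   if validator_class is not None:
#       page = validator_class().parse_string(data)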