devtools/htmlparser.py
author Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
Wed, 07 Oct 2009 19:59:46 +0200
branchstable
changeset 3608 5a46e68c3d3c
parent 3325 44caeccd2db9
child 3369 7b88d12b4ee2
child 4212 ab6573088b4a
permissions -rw-r--r--
[editcontroller] backout (sort of) removal of entity.complete() in validate_form entity is passed to js callbacks when there's one. In that case, we want as much information as we can. The removal was there to avoid complete to fail when all constraints (required relations / attributes) aren't satisfied. This fix only does the complete() after the commit is done, any ValidationError should have been raised at this point.

"""defines a validating HTML parser used in web application tests

:organization: Logilab
:copyright: 2001-2009 LOGILAB S.A. (Paris, FRANCE), license is LGPL v2.
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
:license: GNU Lesser General Public License, v2.1 - http://www.gnu.org/licenses
"""

import re
import sys

from lxml import etree

from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
STRICT_DOCTYPE = str(STRICT_DOCTYPE)
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)

ERR_COUNT = 0

class Validator(object):

    def parse_string(self, data, sysid=None):
        try:
            data = self.preprocess_data(data)
            return PageInfo(data, etree.fromstring(data, self.parser))
        except etree.XMLSyntaxError, exc:
            def save_in(fname=''):
                file(fname, 'w').write(data)
            new_exc = AssertionError(u'invalid xml %s' % exc)
            new_exc.position = exc.position
            raise new_exc

    def preprocess_data(self, data):
        return data


class DTDValidator(Validator):
    def __init__(self):
        Validator.__init__(self)
        # XXX understand what's happening under windows
        validate = True
        if sys.platform == 'win32':
            validate = False
        self.parser = etree.XMLParser(dtd_validation=validate)

    def preprocess_data(self, data):
        """used to fix potential blockquote mess generated by docutils"""
        if STRICT_DOCTYPE not in data:
            return data
        # parse using transitional DTD
        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
        tree = etree.fromstring(data, self.parser)
        namespace = tree.nsmap.get(None)
        # this is the list of authorized child tags for <blockquote> nodes
        expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
                   'fieldset table form noscript ins del script'.split()
        if namespace:
            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
            expected = ['{%s}%s' % (namespace, tag) for tag in expected]
        else:
            blockquotes = tree.findall('.//blockquote')
        # quick and dirty approach: remove all blockquotes
        for blockquote in blockquotes:
            parent = blockquote.getparent()
            parent.remove(blockquote)
        data = etree.tostring(tree)
        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
            STRICT_DOCTYPE, data)


class SaxOnlyValidator(Validator):

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser()

class HTMLValidator(Validator):

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.HTMLParser()



class PageInfo(object):
    """holds various informations on the view's output"""
    def __init__(self, source, root):
        self.source = source
        self.etree = root
        self.source = source
        self.raw_text = u''.join(root.xpath('//text()'))
        self.namespace = self.etree.nsmap
        self.default_ns = self.namespace.get(None)
        self.a_tags = self.find_tag('a')
        self.h1_tags = self.find_tag('h1')
        self.h2_tags = self.find_tag('h2')
        self.h3_tags = self.find_tag('h3')
        self.h4_tags = self.find_tag('h4')
        self.input_tags = self.find_tag('input')
        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]

    def find_tag(self, tag, gettext=True):
        """return a list which contains text of all "tag" elements """
        if self.default_ns is None:
            iterstr = ".//%s" % tag
        else:
            iterstr = ".//{%s}%s" % (self.default_ns, tag)
        if not gettext or tag in ('a', 'input'):
            return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
        return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]

    def appears(self, text):
        """returns True if <text> appears in the page"""
        return text in self.raw_text

    def __contains__(self, text):
        return text in self.source

    def has_title(self, text, level=None):
        """returns True if <h?>text</h?>

        :param level: the title's level (1 for h1, 2 for h2, etc.)
        """
        if level is None:
            for hlist in self.title_tags:
                if text in hlist:
                    return True
            return False
        else:
            hlist = self.title_tags[level - 1]
            return text in hlist

    def has_title_regexp(self, pattern, level=None):
        """returns True if <h?>pattern</h?>"""
        sre = re.compile(pattern)
        if level is None:
            for hlist in self.title_tags:
                for title in hlist:
                    if sre.match(title):
                        return True
            return False
        else:
            hlist = self.title_tags[level - 1]
            for title in hlist:
                if sre.match(title):
                    return True
            return False

    def has_link(self, text, url=None):
        """returns True if <a href=url>text</a> was found in the page"""
        for link_text, attrs in self.a_tags:
            if text == link_text:
                if url is None:
                    return True
                try:
                    href = attrs['href']
                    if href == url:
                        return True
                except KeyError:
                    continue
        return False

    def has_link_regexp(self, pattern, url=None):
        """returns True if <a href=url>pattern</a> was found in the page"""
        sre = re.compile(pattern)
        for link_text, attrs in self.a_tags:
            if sre.match(link_text):
                if url is None:
                    return True
                try:
                    href = attrs['href']
                    if href == url:
                        return True
                except KeyError:
                    continue
        return False