devtools/htmlparser.py
author sylvain.thenault@logilab.fr
Thu, 02 Apr 2009 17:12:25 +0200
branch25480a710422
changeset 1217 b5beb1946019
parent 0 b97547f5f1fa
child 781 323656dd85a9
permissions -rw-r--r--
oops, missing a not here
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
     1
"""defines a validating HTML parser used in web application tests"""
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
     2
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
     3
import re
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
     4
from StringIO import StringIO
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
     5
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
     6
from lxml import etree
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
     7
from lxml.builder import E
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
     8
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
     9
from cubicweb.common.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE, CW_XHTML_EXTENSIONS
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    10
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    11
STRICT_DOCTYPE = str(STRICT_DOCTYPE % CW_XHTML_EXTENSIONS).strip()
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    12
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE % CW_XHTML_EXTENSIONS).strip()
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    13
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    14
ERR_COUNT = 0
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    15
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    16
class Validator(object):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    17
    
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    18
    def parse_string(self, data, sysid=None):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    19
        try:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    20
            data = self.preprocess_data(data)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    21
            return PageInfo(data, etree.fromstring(data, self.parser))
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    22
        except etree.XMLSyntaxError, exc:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    23
            def save_in(fname=''):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    24
                file(fname, 'w').write(data)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    25
            new_exc = AssertionError(u'invalid xml %s' % exc)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    26
            new_exc.position = exc.position
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    27
            raise new_exc
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    28
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    29
    def preprocess_data(self, data):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    30
        return data
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    31
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    32
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    33
class DTDValidator(Validator):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    34
    def __init__(self):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    35
        Validator.__init__(self)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    36
        self.parser = etree.XMLParser(dtd_validation=True)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    37
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    38
    def preprocess_data(self, data):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    39
        """used to fix potential blockquote mess generated by docutils"""
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    40
        if STRICT_DOCTYPE not in data:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    41
            return data
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    42
        # parse using transitional DTD
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    43
        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    44
        tree = etree.fromstring(data, self.parser)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    45
        namespace = tree.nsmap.get(None)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    46
        # this is the list of authorized child tags for <blockquote> nodes
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    47
        expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    48
                   'fieldset table form noscript ins del script'.split()
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    49
        if namespace:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    50
            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    51
            expected = ['{%s}%s' % (namespace, tag) for tag in expected]
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    52
        else:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    53
            blockquotes = tree.findall('.//blockquote')
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    54
        # quick and dirty approach: remove all blockquotes
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    55
        for blockquote in blockquotes:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    56
            parent = blockquote.getparent()
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    57
            parent.remove(blockquote)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    58
##         # for each blockquote, wrap unauthorized child in a div
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    59
##         for blockquote in blockquotes:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    60
##             if len(blockquote):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    61
##                 needs_wrap = [(index, child) for index, child in enumerate(blockquote)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    62
##                               if child.tag not in expected]
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    63
##                 for index, child in needs_wrap:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    64
##                     # the child is automatically popped from blockquote when
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    65
##                     # its parent is changed
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    66
##                     div = E.div(child)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    67
##                     blockquote.insert(index, div)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    68
##             elif blockquote.text:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    69
##                 div = E.div(blockquote.text)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    70
##                 blockquote.text = None
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    71
##                 blockquote.append(div)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    72
        data = etree.tostring(tree)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    73
        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (STRICT_DOCTYPE, data)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    74
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    75
   
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    76
class SaxOnlyValidator(Validator):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    77
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    78
    def __init__(self):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    79
        Validator.__init__(self)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    80
        self.parser = etree.XMLParser()
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    81
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    82
class HTMLValidator(Validator):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    83
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    84
    def __init__(self):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    85
        Validator.__init__(self)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    86
        self.parser = etree.HTMLParser()
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    87
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    88
    
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    89
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    90
class PageInfo(object):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    91
    """holds various informations on the view's output"""
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    92
    def __init__(self, source, root):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    93
        self.source = source
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    94
        self.etree = root
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    95
        self.source = source
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    96
        self.raw_text = u''.join(root.xpath('//text()'))
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    97
        self.namespace = self.etree.nsmap
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    98
        self.default_ns = self.namespace.get(None)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    99
        self.a_tags = self.find_tag('a')
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   100
        self.h1_tags = self.find_tag('h1')
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   101
        self.h2_tags = self.find_tag('h2')
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   102
        self.h3_tags = self.find_tag('h3')
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   103
        self.h4_tags = self.find_tag('h4')
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   104
        self.input_tags = self.find_tag('input')
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   105
        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   106
        
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   107
    def find_tag(self, tag):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   108
        """return a list which contains text of all "tag" elements """
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   109
        if self.default_ns is None:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   110
            iterstr = ".//%s" % tag
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   111
        else:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   112
            iterstr = ".//{%s}%s" % (self.default_ns, tag)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   113
        if tag in ('a', 'input'):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   114
            return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   115
        return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   116
         
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   117
    def appears(self, text):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   118
        """returns True if <text> appears in the page"""
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   119
        return text in self.raw_text
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   120
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   121
    def __contains__(self, text):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   122
        return text in self.source
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   123
    
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   124
    def has_title(self, text, level=None):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   125
        """returns True if <h?>text</h?>
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   126
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   127
        :param level: the title's level (1 for h1, 2 for h2, etc.)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   128
        """
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   129
        if level is None:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   130
            for hlist in self.title_tags:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   131
                if text in hlist:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   132
                    return True
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   133
            return False
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   134
        else:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   135
            hlist = self.title_tags[level - 1]
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   136
            return text in hlist
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   137
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   138
    def has_title_regexp(self, pattern, level=None):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   139
        """returns True if <h?>pattern</h?>"""
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   140
        sre = re.compile(pattern)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   141
        if level is None:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   142
            for hlist in self.title_tags:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   143
                for title in hlist:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   144
                    if sre.match(title):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   145
                        return True
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   146
            return False
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   147
        else:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   148
            hlist = self.title_tags[level - 1]
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   149
            for title in hlist:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   150
                if sre.match(title):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   151
                    return True
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   152
            return False
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   153
    
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   154
    def has_link(self, text, url=None):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   155
        """returns True if <a href=url>text</a> was found in the page"""
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   156
        for link_text, attrs in self.a_tags:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   157
            if text == link_text:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   158
                if url is None:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   159
                    return True
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   160
                try:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   161
                    href = attrs['href']
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   162
                    if href == url:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   163
                        return True
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   164
                except KeyError:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   165
                    continue
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   166
        return False
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   167
    
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   168
    def has_link_regexp(self, pattern, url=None):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   169
        """returns True if <a href=url>pattern</a> was found in the page"""
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   170
        sre = re.compile(pattern)
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   171
        for link_text, attrs in self.a_tags:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   172
            if sre.match(link_text):
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   173
                if url is None:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   174
                    return True
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   175
                try:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   176
                    href = attrs['href']
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   177
                    if href == url:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   178
                        return True
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   179
                except KeyError:
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   180
                    continue
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   181
        return False