devtools/htmlparser.py
author Dimitri Papadopoulos <dimitri.papadopoulos@cea.fr>
Mon, 19 Oct 2015 23:39:30 +0200
changeset 10948 3ffacbdf7e9c
parent 10696 4ba4be5553cf
permissions -rw-r--r--
[skel] remove (what looks like) a spurious copy/paste
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
8937
8a1809c9a043 [htmlparser] add missing deprecation message
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8695
diff changeset
     1
# copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
5421
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
     2
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
     3
#
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
     4
# This file is part of CubicWeb.
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
     5
#
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
     6
# CubicWeb is free software: you can redistribute it and/or modify it under the
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
     7
# terms of the GNU Lesser General Public License as published by the Free
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
     8
# Software Foundation, either version 2.1 of the License, or (at your option)
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
     9
# any later version.
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
    10
#
5424
8ecbcbff9777 replace logilab-common by CubicWeb in disclaimer
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5421
diff changeset
    11
# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
5421
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
    12
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
    13
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
    14
# details.
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
    15
#
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
    16
# You should have received a copy of the GNU Lesser General Public License along
8167de96c523 proper licensing information (LGPL-2.1). Hope I get it right this time.
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5276
diff changeset
    17
# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
6771
da71f1ad1721 minor code cleanup
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5424
diff changeset
    18
"""defines a validating HTML parser used in web application tests"""
0
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    19
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    20
import re
3325
44caeccd2db9 fix sys import
Julien Jehannet <julien.jehannet@logilab.fr>
parents: 3151
diff changeset
    21
import sys
8977
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    22
from xml import sax
10618
3274a1648c7e [py3k] io.BytesIO
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10588
diff changeset
    23
from io import BytesIO
0
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    24
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    25
from lxml import etree
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    26
8938
198fdadafed6 [htmlparser] rename SaxOnlyValidator to XMLValidator
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8937
diff changeset
    27
from logilab.common.deprecation import class_deprecated, class_renamed
6772
68bb0943d192 [test, html validation] make validator selection somewhat smarter (at least it works properly when content is demoted from xhtml to html, making the XMLDemotingValidator class useless
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 6771
diff changeset
    28
1421
77ee26df178f doc type handling refactoring: do the ext substitution at the module level
sylvain.thenault@logilab.fr
parents: 1132
diff changeset
    29
from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
6771
da71f1ad1721 minor code cleanup
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 5424
diff changeset
    30
1485
4d532f3c012e nicer fix
sylvain.thenault@logilab.fr
parents: 1480
diff changeset
    31
# Coerce both doctype declarations imported from cubicweb.view with `str()`
# so the validators below work on native string objects.
STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE = (str(STRICT_DOCTYPE),
                                        str(TRANSITIONAL_DOCTYPE))

# module-level error counter, initialised to zero
ERR_COUNT = 0
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    35
10696
4ba4be5553cf [py3k] unicode vs str vs bytes vs the world
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10662
diff changeset
    36
_REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M|re.I|re.S)
8977
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    37
def _remove_script_tags(data):
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    38
    """Remove the script (usually javascript) tags to help the lxml
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    39
    XMLParser / HTMLParser do their job. Without that, they choke on
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    40
    tags embedded in JS strings.
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    41
    """
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    42
    # Notice we may want to use lxml cleaner, but it's far too intrusive:
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    43
    #
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    44
    # cleaner = Cleaner(scripts=True,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    45
    #                   javascript=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    46
    #                   comments=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    47
    #                   style=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    48
    #                   links=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    49
    #                   meta=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    50
    #                   page_structure=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    51
    #                   processing_instructions=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    52
    #                   embedded=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    53
    #                   frames=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    54
    #                   forms=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    55
    #                   annoying_tags=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    56
    #                   remove_tags=(),
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    57
    #                   remove_unknown_tags=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    58
    #                   safe_attrs_only=False,
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    59
    #                   add_nofollow=False)
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    60
    # >>> cleaner.clean_html('<body></body>')
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    61
    # '<span></span>'
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    62
    # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    63
    # '<html><body></body></html>'
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    64
    # >>> cleaner.clean_html('<body><div/></body>')
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    65
    # '<div></div>'
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    66
    # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    67
    # '<html><body><div></div><br></body></html>'
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    68
    # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    69
    # '<html><body><div></div><br><span></span></body></html>'
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    70
    #
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    71
    # using that, we'll miss most actual validation error we want to
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    72
    # catch. For now, use dumb regexp
10696
4ba4be5553cf [py3k] unicode vs str vs bytes vs the world
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10662
diff changeset
    73
    return _REM_SCRIPT_RGX.sub(b'', data)
8977
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    74
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
    75
0
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    76
class Validator(object):
    """Base validator API.

    Concrete validators assign a parser to ``parser`` and may override
    ``preprocess_data`` to alter the document before it is parsed.
    """
    # parser given to ``etree.fromstring``; ``None`` in the base class
    parser = None

    def parse_string(self, source):
        """Preprocess then parse `source`.

        Return a `PageInfo` built from the original (unprocessed) source
        and the parsed tree.
        """
        cleaned = self.preprocess_data(source)
        return PageInfo(source, self._parse(cleaned))

    def preprocess_data(self, data):
        """Hook called on the raw document before parsing; the base
        implementation returns `data` unchanged.
        """
        return data

    def _parse(self, pdata):
        """Parse `pdata` using ``self.parser`` and return the result.

        An lxml syntax error is turned into an `AssertionError` carrying
        the error location in its ``position`` attribute.
        """
        try:
            tree = etree.fromstring(pdata, self.parser)
        except etree.XMLSyntaxError as exc:
            failure = AssertionError(u'invalid document: %s' % exc)
            failure.position = exc.position
            raise failure
        return tree
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    94
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    95
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
    96
class DTDValidator(Validator):
    """Validator based on an lxml XML parser with DTD validation enabled
    (except on win32).
    """

    def __init__(self):
        Validator.__init__(self)
        # XXX understand what's happening under windows
        # (DTD validation is switched off when running on win32)
        self.parser = etree.XMLParser(dtd_validation=sys.platform != 'win32')

    def preprocess_data(self, data):
        """used to fix potential blockquote mess generated by docutils

        A document that does not contain the strict doctype is returned
        untouched. Otherwise it is parsed against the transitional doctype,
        every <blockquote> element is removed, and the tree is serialized
        again behind an XML declaration and the strict doctype.

        :raises AssertionError: when the document can't be parsed (raised
          by `_parse`)
        """
        if STRICT_DOCTYPE not in data:
            return data
        # parse using transitional DTD
        tree = self._parse(data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE))
        namespace = tree.nsmap.get(None)
        if namespace:
            path = './/{%s}blockquote' % namespace
        else:
            path = './/blockquote'
        # quick and dirty approach: remove all blockquotes, without checking
        # their children against the tags authorized in <blockquote> nodes
        # (p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address
        # fieldset table form noscript ins del script)
        for blockquote in tree.findall(path):
            blockquote.getparent().remove(blockquote)
        data = etree.tostring(tree)
        if not isinstance(data, str):
            # python 3: tostring() gave bytes, which '%s' would render as
            # "b'...'" in the document
            data = data.decode('utf-8')
        # NOTE(review): the result is a text string holding an encoding
        # declaration; confirm the parser accepts it under python 3
        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
            STRICT_DOCTYPE, data)
0
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   125
1485
4d532f3c012e nicer fix
sylvain.thenault@logilab.fr
parents: 1480
diff changeset
   126
8938
198fdadafed6 [htmlparser] rename SaxOnlyValidator to XMLValidator
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8937
diff changeset
   127
class XMLValidator(Validator):
    """XML validator: checks the document is well-formed XML and that the
    XML namespaces it uses are defined.
    """

    def __init__(self):
        super(XMLValidator, self).__init__()
        self.parser = etree.XMLParser()


# deprecated name kept for backward compatibility
SaxOnlyValidator = class_renamed(
    'SaxOnlyValidator', XMLValidator,
    '[3.17] you should use the XMLValidator class instead')
198fdadafed6 [htmlparser] rename SaxOnlyValidator to XMLValidator
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8937
diff changeset
   138
8977
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
   139
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
   140
class XMLSyntaxValidator(Validator):
    """XML syntax validator, check XML is well-formed"""

    class MySaxErrorHandler(sax.ErrorHandler):
        """override default handler to avoid choking because of unknown entity"""
        def fatalError(self, exception):
            # XXX check entity in htmlentitydefs
            # errors whose message ends with 'undefined entity' are
            # deliberately ignored, anything else is propagated
            if not str(exception).endswith('undefined entity'):
                raise exception

    # SAX parser created once at class definition time and shared by every
    # instance; it is only used to check well-formedness (no-op content
    # handler)
    _parser = sax.make_parser()
    _parser.setContentHandler(sax.handler.ContentHandler())
    _parser.setErrorHandler(MySaxErrorHandler())

    def __init__(self):
        super(XMLSyntaxValidator, self).__init__()
        # XMLParser() wants xml namespaces defined
        # XMLParser(recover=True) will accept almost anything
        #
        # -> use the latter, and let `_parse` check xml well-formedness
        #    using a dumb SAX parser first
        self.parser = etree.XMLParser(recover=True)

    def preprocess_data(self, data):
        """Return `data` with its <script> elements stripped."""
        return _remove_script_tags(data)

    def _parse(self, data):
        """Check `data` with the SAX parser, then parse it with the
        recovering lxml parser and return the result.

        :param data: the document, fed to the SAX parser as a byte stream
        :raises AssertionError: on SAX parse error; the exception's
          ``position`` attribute holds the (line, column) of the error
        """
        inpsrc = sax.InputSource()
        inpsrc.setByteStream(BytesIO(data))
        try:
            self._parser.parse(inpsrc)
        except sax.SAXParseException as exc:
            new_exc = AssertionError(u'invalid document: %s' % exc)
            # use the public locator accessors rather than private attributes
            new_exc.position = (exc.getLineNumber(), exc.getColumnNumber())
            raise new_exc
        return super(XMLSyntaxValidator, self)._parse(data)
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
   175
57e564c0118e [testlib] introduce a validator that check xml-well formness
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8973
diff changeset
   176
0
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   177
class HTMLValidator(Validator):
    """Validator whose parser is a non-recovering ``etree.HTMLParser``."""

    def __init__(self):
        Validator.__init__(self)
        # recover=False: presumably makes the parser report broken markup
        # instead of silently repairing it -- confirm against lxml docs.
        strict_parser = etree.HTMLParser(recover=False)
        self.parser = strict_parser

    def preprocess_data(self, data):
        """Return `data` after it went through `_remove_script_tags`.

        NOTE(review): judging by its name the helper strips <script>
        elements from the page source; its definition is outside this block.
        """
        cleaned = _remove_script_tags(data)
        return cleaned
1485
4d532f3c012e nicer fix
sylvain.thenault@logilab.fr
parents: 1480
diff changeset
   185
0
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   186
b97547f5f1fa Showtime !
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff changeset
   187
class PageInfo(object):
    """Hold the parsed output of a view and offer helpers to query it.

    The constructor pre-computes the text of the whole document and the
    content of a few commonly inspected tags (``a``, ``h1``-``h4``,
    ``input``) so that tests can make quick assertions on a rendered page.
    """

    def __init__(self, source, root):
        """Build the page information.

        :param source: the raw page source (used by ``in`` checks)
        :param root: the root element of the parsed document; it must
          provide ``xpath``, ``nsmap`` and ``iterfind``
        """
        self.source = source
        self.etree = root
        # concatenation of every text node of the document
        self.raw_text = u''.join(root.xpath('//text()'))
        self.namespace = self.etree.nsmap
        # namespace bound to the empty prefix, None if there is none
        self.default_ns = self.namespace.get(None)
        # 'a' and 'input' entries are (text, attributes) couples, see find_tag
        self.a_tags = self.find_tag('a')
        self.h1_tags = self.find_tag('h1')
        self.h2_tags = self.find_tag('h2')
        self.h3_tags = self.find_tag('h3')
        self.h4_tags = self.find_tag('h4')
        self.input_tags = self.find_tag('input')
        # indexed by (title level - 1)
        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]

    def _iterstr(self, tag):
        """Return the ``iterfind`` path matching every `tag` descendant,
        qualified with the default namespace when the document has one.
        """
        if self.default_ns is None:
            return ".//%s" % tag
        else:
            return ".//{%s}%s" % (self.default_ns, tag)

    def matching_nodes(self, tag, **attrs):
        """Yield each `tag` element whose attributes include every
        name / value pair given in `attrs`.

        An element lacking one of the requested attributes does not match.
        Without `attrs`, every `tag` element is yielded.
        """
        for elt in self.etree.iterfind(self._iterstr(tag)):
            eltattrs = elt.attrib
            if all(attr in eltattrs and eltattrs[attr] == value
                   for attr, value in attrs.items()):
                yield elt

    def has_tag(self, tag, nboccurs=1, **attrs):
        """Return True if `tag` with the given attributes appears in the
        page exactly `nboccurs` times.

        :param tag: name of the searched tag
        :param nboccurs: expected number of matching elements; None means
          "at least one, whatever the count"
        :param attrs: attribute name / value pairs the elements must have
        """
        count = 0
        for _ in self.matching_nodes(tag, **attrs):
            if nboccurs is None:  # no need to check number of occurrences
                return True
            count += 1
            if count > nboccurs:  # too many occurrences, stop iterating
                return False
        # also False when nboccurs is None and nothing matched
        return count == nboccurs

    def find_tag(self, tag, gettext=True):
        """Return a list describing every `tag` element of the page.

        For the ``a`` and ``input`` tags, or when `gettext` is false, items
        are ``(elt.text, elt.attrib)`` couples, where ``elt.text`` may be
        None. Otherwise each item is the concatenation of all the text nodes
        found below the element.
        """
        iterstr = self._iterstr(tag)
        if not gettext or tag in ('a', 'input'):
            return [(elt.text, elt.attrib)
                    for elt in self.etree.iterfind(iterstr)]
        return [u''.join(elt.xpath('.//text()'))
                for elt in self.etree.iterfind(iterstr)]

    def appears(self, text):
        """Return True if `text` appears in the text content of the page."""
        return text in self.raw_text

    def __contains__(self, text):
        """Return True if `text` appears in the raw page source."""
        return text in self.source

    def has_title(self, text, level=None):
        """Return True if the page contains <h?>text</h?>.

        :param level: the title's level (1 for h1, 2 for h2, etc. up to 4);
          every level is searched when None
        """
        if level is None:
            return any(text in hlist for hlist in self.title_tags)
        return text in self.title_tags[level - 1]

    def has_title_regexp(self, pattern, level=None):
        """Return True if the text of some <h?> title matches `pattern`.

        The pattern is applied with ``re.match``, i.e. anchored at the
        beginning of the title.

        :param level: the title's level (1 for h1, 2 for h2, etc. up to 4);
          every level is searched when None
        """
        sre = re.compile(pattern)
        if level is None:
            hlists = self.title_tags
        else:
            hlists = [self.title_tags[level - 1]]
        return any(sre.match(title) for hlist in hlists for title in hlist)

    def _has_link_matching(self, text_matches, url):
        """Return True if some link satisfies `text_matches` on its text and,
        when `url` is not None, has an ``href`` attribute equal to `url`.
        """
        for link_text, attrs in self.a_tags:
            if not text_matches(link_text):
                continue
            # links without any href attribute never match an explicit url
            if url is None or attrs.get('href') == url:
                return True
        return False

    def has_link(self, text, url=None):
        """Return True if <a href=url>text</a> was found in the page.

        The ``href`` attribute is not checked when `url` is None.
        """
        return self._has_link_matching(
            lambda link_text: text == link_text, url)

    def has_link_regexp(self, pattern, url=None):
        """Return True if <a href=url>pattern</a> was found in the page.

        The pattern is applied with ``re.match`` on the link's text. Links
        without leading text (e.g. wrapping only an image) are skipped. The
        ``href`` attribute is not checked when `url` is None.
        """
        sre = re.compile(pattern)
        # link text is None when the anchor has no leading text node:
        # skip such links instead of feeding None to the regexp
        return self._has_link_matching(
            lambda link_text: link_text is not None and sre.match(link_text),
            url)
2773
b2530e3e0afb [testlib] #345052 and #344207: major test lib refactoring/cleanup + update usage
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 1977
diff changeset
   309
8972
771337c3a754 [testlib] update htmlparsers.VALMAP: stop using SaxOnlyValidator and add an entry for html
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8951
diff changeset
   310
# Validator class to use for each validator key.  The None key maps to no
# validator at all (presumably meaning "do not validate" -- confirm at the
# call sites).
VALMAP = {
    None: None,
    'dtd': DTDValidator,
    'xml': XMLValidator,
    'html': HTMLValidator,
}