1977
+ − 1
"""defines a validating HTML parser used in web application tests
+ − 2
+ − 3
:organization: Logilab
+ − 4
:copyright: 2001-2009 LOGILAB S.A. (Paris, FRANCE), license is LGPL v2.
+ − 5
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
+ − 6
:license: GNU Lesser General Public License, v2.1 - http://www.gnu.org/licenses
+ − 7
"""
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 8
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 9
import re
3325
+ − 10
import sys
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 11
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 12
from lxml import etree
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 13
1421
77ee26df178f
doc type handling refactoring: do the ext substitution at the module level
sylvain.thenault@logilab.fr
diff
changeset
+ − 14
from cubicweb.view import STRICT_DOCTYPE , TRANSITIONAL_DOCTYPE
1485
+ − 15
STRICT_DOCTYPE = str ( STRICT_DOCTYPE )
+ − 16
TRANSITIONAL_DOCTYPE = str ( TRANSITIONAL_DOCTYPE )
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 17
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 18
ERR_COUNT = 0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 19
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 20
class Validator ( object ):
1485
+ − 21
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 22
def parse_string ( self , data , sysid = None ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 23
try :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 24
data = self . preprocess_data ( data )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 25
return PageInfo ( data , etree . fromstring ( data , self . parser ))
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 26
except etree . XMLSyntaxError , exc :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 27
def save_in ( fname = '' ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 28
file ( fname , 'w' ) . write ( data )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 29
new_exc = AssertionError ( u 'invalid xml %s ' % exc )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 30
new_exc . position = exc . position
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 31
raise new_exc
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 32
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 33
def preprocess_data ( self , data ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 34
return data
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 35
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 36
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 37
class DTDValidator ( Validator ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 38
def __init__ ( self ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 39
Validator . __init__ ( self )
3151
+ − 40
# XXX understand what's happening under windows
+ − 41
validate = True
+ − 42
if sys . platform == 'win32' :
+ − 43
validate = False
+ − 44
self . parser = etree . XMLParser ( dtd_validation = validate )
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 45
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 46
def preprocess_data ( self , data ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 47
"""used to fix potential blockquote mess generated by docutils"""
1485
+ − 48
if STRICT_DOCTYPE not in data :
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 49
return data
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 50
# parse using transitional DTD
1485
+ − 51
data = data . replace ( STRICT_DOCTYPE , TRANSITIONAL_DOCTYPE )
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 52
tree = etree . fromstring ( data , self . parser )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 53
namespace = tree . nsmap . get ( None )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 54
# this is the list of authorized child tags for <blockquote> nodes
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 55
expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 56
'fieldset table form noscript ins del script' . split ()
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 57
if namespace :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 58
blockquotes = tree . findall ( './/{ %s }blockquote' % namespace )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 59
expected = [ '{ %s } %s ' % ( namespace , tag ) for tag in expected ]
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 60
else :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 61
blockquotes = tree . findall ( './/blockquote' )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 62
# quick and dirty approach: remove all blockquotes
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 63
for blockquote in blockquotes :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 64
parent = blockquote . getparent ()
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 65
parent . remove ( blockquote )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 66
data = etree . tostring ( tree )
1485
+ − 67
return '<?xml version="1.0" encoding="UTF-8"?> %s \n %s ' % (
+ − 68
STRICT_DOCTYPE , data )
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 69
1485
+ − 70
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 71
class SaxOnlyValidator ( Validator ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 72
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 73
def __init__ ( self ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 74
Validator . __init__ ( self )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 75
self . parser = etree . XMLParser ()
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 76
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 77
class HTMLValidator ( Validator ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 78
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 79
def __init__ ( self ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 80
Validator . __init__ ( self )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 81
self . parser = etree . HTMLParser ()
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 82
1485
+ − 83
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 84
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 85
class PageInfo ( object ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 86
"""holds various informations on the view's output"""
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 87
def __init__ ( self , source , root ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 88
self . source = source
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 89
self . etree = root
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 90
self . source = source
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 91
self . raw_text = u '' . join ( root . xpath ( '//text()' ))
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 92
self . namespace = self . etree . nsmap
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 93
self . default_ns = self . namespace . get ( None )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 94
self . a_tags = self . find_tag ( 'a' )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 95
self . h1_tags = self . find_tag ( 'h1' )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 96
self . h2_tags = self . find_tag ( 'h2' )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 97
self . h3_tags = self . find_tag ( 'h3' )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 98
self . h4_tags = self . find_tag ( 'h4' )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 99
self . input_tags = self . find_tag ( 'input' )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 100
self . title_tags = [ self . h1_tags , self . h2_tags , self . h3_tags , self . h4_tags ]
1485
+ − 101
1945
2b59d9ae17ae
new argument telling if we want text or (text / attrs), keeping bw compat
Sylvain Thénault <sylvain.thenault@logilab.fr>
diff
changeset
+ − 102
def find_tag ( self , tag , gettext = True ):
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 103
"""return a list which contains text of all "tag" elements """
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 104
if self . default_ns is None :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 105
iterstr = ".// %s " % tag
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 106
else :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 107
iterstr = ".//{ %s } %s " % ( self . default_ns , tag )
1945
2b59d9ae17ae
new argument telling if we want text or (text / attrs), keeping bw compat
Sylvain Thénault <sylvain.thenault@logilab.fr>
diff
changeset
+ − 108
if not gettext or tag in ( 'a' , 'input' ):
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 109
return [( elt . text , elt . attrib ) for elt in self . etree . iterfind ( iterstr )]
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 110
return [ u '' . join ( elt . xpath ( './/text()' )) for elt in self . etree . iterfind ( iterstr )]
1485
+ − 111
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 112
def appears ( self , text ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 113
"""returns True if <text> appears in the page"""
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 114
return text in self . raw_text
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 115
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 116
def __contains__ ( self , text ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 117
return text in self . source
1485
+ − 118
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 119
def has_title ( self , text , level = None ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 120
"""returns True if <h?>text</h?>
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 121
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 122
:param level: the title's level (1 for h1, 2 for h2, etc.)
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 123
"""
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 124
if level is None :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 125
for hlist in self . title_tags :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 126
if text in hlist :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 127
return True
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 128
return False
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 129
else :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 130
hlist = self . title_tags [ level - 1 ]
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 131
return text in hlist
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 132
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 133
def has_title_regexp ( self , pattern , level = None ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 134
"""returns True if <h?>pattern</h?>"""
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 135
sre = re . compile ( pattern )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 136
if level is None :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 137
for hlist in self . title_tags :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 138
for title in hlist :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 139
if sre . match ( title ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 140
return True
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 141
return False
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 142
else :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 143
hlist = self . title_tags [ level - 1 ]
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 144
for title in hlist :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 145
if sre . match ( title ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 146
return True
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 147
return False
1485
+ − 148
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 149
def has_link ( self , text , url = None ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 150
"""returns True if <a href=url>text</a> was found in the page"""
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 151
for link_text , attrs in self . a_tags :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 152
if text == link_text :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 153
if url is None :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 154
return True
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 155
try :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 156
href = attrs [ 'href' ]
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 157
if href == url :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 158
return True
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 159
except KeyError :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 160
continue
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 161
return False
1485
+ − 162
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 163
def has_link_regexp ( self , pattern , url = None ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 164
"""returns True if <a href=url>pattern</a> was found in the page"""
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 165
sre = re . compile ( pattern )
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 166
for link_text , attrs in self . a_tags :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 167
if sre . match ( link_text ):
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 168
if url is None :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 169
return True
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 170
try :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 171
href = attrs [ 'href' ]
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 172
if href == url :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 173
return True
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 174
except KeyError :
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 175
continue
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
+ − 176
return False
2773
b2530e3e0afb
[testlib] #345052 and #344207: major test lib refactoring/cleanup + update usage
Sylvain Thénault <sylvain.thenault@logilab.fr>
diff
changeset
+ − 177
b2530e3e0afb
[testlib] #345052 and #344207: major test lib refactoring/cleanup + update usage
Sylvain Thénault <sylvain.thenault@logilab.fr>
diff
changeset
+ − 178
VALMAP = { None : None , 'dtd' : DTDValidator , 'xml' : SaxOnlyValidator }