1977
|
1 |
"""defines a validating HTML parser used in web application tests |
|
2 |
|
|
3 |
:organization: Logilab |
|
4 |
:copyright: 2001-2009 LOGILAB S.A. (Paris, FRANCE), license is LGPL v2. |
|
5 |
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr |
|
6 |
:license: GNU Lesser General Public License, v2.1 - http://www.gnu.org/licenses |
|
7 |
""" |
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
8 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
9 |
import re |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
10 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
11 |
from lxml import etree |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
12 |
|
1421
77ee26df178f
doc type handling refactoring: do the ext substitution at the module level
sylvain.thenault@logilab.fr
diff
changeset
|
13 |
from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE |
1485
|
14 |
STRICT_DOCTYPE = str(STRICT_DOCTYPE) |
|
15 |
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE) |
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
16 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
17 |
ERR_COUNT = 0 |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
18 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
19 |
class Validator(object): |
1485
|
20 |
|
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
21 |
def parse_string(self, data, sysid=None): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
22 |
try: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
23 |
data = self.preprocess_data(data) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
24 |
return PageInfo(data, etree.fromstring(data, self.parser)) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
25 |
except etree.XMLSyntaxError, exc: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
26 |
def save_in(fname=''): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
27 |
file(fname, 'w').write(data) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
28 |
new_exc = AssertionError(u'invalid xml %s' % exc) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
29 |
new_exc.position = exc.position |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
30 |
raise new_exc |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
31 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
32 |
def preprocess_data(self, data): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
33 |
return data |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
34 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
35 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
36 |
class DTDValidator(Validator): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
37 |
def __init__(self): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
38 |
Validator.__init__(self) |
3151
|
39 |
# XXX understand what's happening under windows |
|
40 |
validate = True |
|
41 |
if sys.platform == 'win32': |
|
42 |
validate = False |
|
43 |
self.parser = etree.XMLParser(dtd_validation=validate) |
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
44 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
45 |
def preprocess_data(self, data): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
46 |
"""used to fix potential blockquote mess generated by docutils""" |
1485
|
47 |
if STRICT_DOCTYPE not in data: |
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
48 |
return data |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
49 |
# parse using transitional DTD |
1485
|
50 |
data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE) |
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
51 |
tree = etree.fromstring(data, self.parser) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
52 |
namespace = tree.nsmap.get(None) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
53 |
# this is the list of authorized child tags for <blockquote> nodes |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
54 |
expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \ |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
55 |
'fieldset table form noscript ins del script'.split() |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
56 |
if namespace: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
57 |
blockquotes = tree.findall('.//{%s}blockquote' % namespace) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
58 |
expected = ['{%s}%s' % (namespace, tag) for tag in expected] |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
59 |
else: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
60 |
blockquotes = tree.findall('.//blockquote') |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
61 |
# quick and dirty approach: remove all blockquotes |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
62 |
for blockquote in blockquotes: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
63 |
parent = blockquote.getparent() |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
64 |
parent.remove(blockquote) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
65 |
data = etree.tostring(tree) |
1485
|
66 |
return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % ( |
|
67 |
STRICT_DOCTYPE, data) |
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
68 |
|
1485
|
69 |
|
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
70 |
class SaxOnlyValidator(Validator): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
71 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
72 |
def __init__(self): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
73 |
Validator.__init__(self) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
74 |
self.parser = etree.XMLParser() |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
75 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
76 |
class HTMLValidator(Validator): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
77 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
78 |
def __init__(self): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
79 |
Validator.__init__(self) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
80 |
self.parser = etree.HTMLParser() |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
81 |
|
1485
|
82 |
|
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
83 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
84 |
class PageInfo(object): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
85 |
"""holds various informations on the view's output""" |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
86 |
def __init__(self, source, root): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
87 |
self.source = source |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
88 |
self.etree = root |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
89 |
self.source = source |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
90 |
self.raw_text = u''.join(root.xpath('//text()')) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
91 |
self.namespace = self.etree.nsmap |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
92 |
self.default_ns = self.namespace.get(None) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
93 |
self.a_tags = self.find_tag('a') |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
94 |
self.h1_tags = self.find_tag('h1') |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
95 |
self.h2_tags = self.find_tag('h2') |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
96 |
self.h3_tags = self.find_tag('h3') |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
97 |
self.h4_tags = self.find_tag('h4') |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
98 |
self.input_tags = self.find_tag('input') |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
99 |
self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags] |
1485
|
100 |
|
1945
2b59d9ae17ae
new argument telling if we want text or (text / attrs), keeping bw compat
Sylvain Thénault <sylvain.thenault@logilab.fr>
diff
changeset
|
101 |
def find_tag(self, tag, gettext=True): |
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
102 |
"""return a list which contains text of all "tag" elements """ |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
103 |
if self.default_ns is None: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
104 |
iterstr = ".//%s" % tag |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
105 |
else: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
106 |
iterstr = ".//{%s}%s" % (self.default_ns, tag) |
1945
2b59d9ae17ae
new argument telling if we want text or (text / attrs), keeping bw compat
Sylvain Thénault <sylvain.thenault@logilab.fr>
diff
changeset
|
107 |
if not gettext or tag in ('a', 'input'): |
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
108 |
return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)] |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
109 |
return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)] |
1485
|
110 |
|
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
111 |
def appears(self, text): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
112 |
"""returns True if <text> appears in the page""" |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
113 |
return text in self.raw_text |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
114 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
115 |
def __contains__(self, text): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
116 |
return text in self.source |
1485
|
117 |
|
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
118 |
def has_title(self, text, level=None): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
119 |
"""returns True if <h?>text</h?> |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
120 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
121 |
:param level: the title's level (1 for h1, 2 for h2, etc.) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
122 |
""" |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
123 |
if level is None: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
124 |
for hlist in self.title_tags: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
125 |
if text in hlist: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
126 |
return True |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
127 |
return False |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
128 |
else: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
129 |
hlist = self.title_tags[level - 1] |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
130 |
return text in hlist |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
131 |
|
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
132 |
def has_title_regexp(self, pattern, level=None): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
133 |
"""returns True if <h?>pattern</h?>""" |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
134 |
sre = re.compile(pattern) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
135 |
if level is None: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
136 |
for hlist in self.title_tags: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
137 |
for title in hlist: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
138 |
if sre.match(title): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
139 |
return True |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
140 |
return False |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
141 |
else: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
142 |
hlist = self.title_tags[level - 1] |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
143 |
for title in hlist: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
144 |
if sre.match(title): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
145 |
return True |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
146 |
return False |
1485
|
147 |
|
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
148 |
def has_link(self, text, url=None): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
149 |
"""returns True if <a href=url>text</a> was found in the page""" |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
150 |
for link_text, attrs in self.a_tags: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
151 |
if text == link_text: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
152 |
if url is None: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
153 |
return True |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
154 |
try: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
155 |
href = attrs['href'] |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
156 |
if href == url: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
157 |
return True |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
158 |
except KeyError: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
159 |
continue |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
160 |
return False |
1485
|
161 |
|
0
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
162 |
def has_link_regexp(self, pattern, url=None): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
163 |
"""returns True if <a href=url>pattern</a> was found in the page""" |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
164 |
sre = re.compile(pattern) |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
165 |
for link_text, attrs in self.a_tags: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
166 |
if sre.match(link_text): |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
167 |
if url is None: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
168 |
return True |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
169 |
try: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
170 |
href = attrs['href'] |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
171 |
if href == url: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
172 |
return True |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
173 |
except KeyError: |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
174 |
continue |
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents:
diff
changeset
|
175 |
return False |