1 """defines a validating HTML parser used in web application tests""" |
1 """defines a validating HTML parser used in web application tests""" |
2 |
2 |
3 import re |
3 import re |
4 from StringIO import StringIO |
|
5 |
4 |
6 from lxml import etree |
5 from lxml import etree |
7 from lxml.builder import E |
|
8 |
6 |
9 from cubicweb.common.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE, CW_XHTML_EXTENSIONS |
7 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE |
10 |
8 STRICT_DOCTYPE = str(STRICT_DOCTYPE) |
11 STRICT_DOCTYPE = str(STRICT_DOCTYPE % CW_XHTML_EXTENSIONS).strip() |
9 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE) |
12 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE % CW_XHTML_EXTENSIONS).strip() |
|
13 |
10 |
14 ERR_COUNT = 0 |
11 ERR_COUNT = 0 |
15 |
12 |
16 class Validator(object): |
13 class Validator(object): |
17 |
14 |
18 def parse_string(self, data, sysid=None): |
15 def parse_string(self, data, sysid=None): |
19 try: |
16 try: |
20 data = self.preprocess_data(data) |
17 data = self.preprocess_data(data) |
21 return PageInfo(data, etree.fromstring(data, self.parser)) |
18 return PageInfo(data, etree.fromstring(data, self.parser)) |
22 except etree.XMLSyntaxError, exc: |
19 except etree.XMLSyntaxError, exc: |
53 blockquotes = tree.findall('.//blockquote') |
50 blockquotes = tree.findall('.//blockquote') |
54 # quick and dirty approach: remove all blockquotes |
51 # quick and dirty approach: remove all blockquotes |
55 for blockquote in blockquotes: |
52 for blockquote in blockquotes: |
56 parent = blockquote.getparent() |
53 parent = blockquote.getparent() |
57 parent.remove(blockquote) |
54 parent.remove(blockquote) |
58 ## # for each blockquote, wrap unauthorized child in a div |
|
59 ## for blockquote in blockquotes: |
|
60 ## if len(blockquote): |
|
61 ## needs_wrap = [(index, child) for index, child in enumerate(blockquote) |
|
62 ## if child.tag not in expected] |
|
63 ## for index, child in needs_wrap: |
|
64 ## # the child is automatically popped from blockquote when |
|
65 ## # its parent is changed |
|
66 ## div = E.div(child) |
|
67 ## blockquote.insert(index, div) |
|
68 ## elif blockquote.text: |
|
69 ## div = E.div(blockquote.text) |
|
70 ## blockquote.text = None |
|
71 ## blockquote.append(div) |
|
72 data = etree.tostring(tree) |
55 data = etree.tostring(tree) |
73 return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (STRICT_DOCTYPE, data) |
56 return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % ( |
|
57 STRICT_DOCTYPE, data) |
74 |
58 |
75 |
59 |
class SaxOnlyValidator(Validator):
    """Validator whose parser is a default-configured lxml ``XMLParser``."""

    def __init__(self):
        # initialise the base validator first, then install the parser
        # that the inherited parsing code hands to etree.fromstring()
        Validator.__init__(self)
        self.parser = etree.XMLParser()
83 |
67 |
def __init__(self):
    """Set up the base validator and use lxml's lenient ``HTMLParser``."""
    # NOTE(review): the enclosing class header is not visible in this
    # excerpt; presumably this is the HTML flavour of Validator.
    Validator.__init__(self)
    self.parser = etree.HTMLParser()
87 |
71 |
88 |
72 |
89 |
73 |
90 class PageInfo(object): |
74 class PageInfo(object): |
91 """holds various informations on the view's output""" |
75 """holds various informations on the view's output""" |
92 def __init__(self, source, root): |
76 def __init__(self, source, root): |
93 self.source = source |
77 self.source = source |
101 self.h2_tags = self.find_tag('h2') |
85 self.h2_tags = self.find_tag('h2') |
102 self.h3_tags = self.find_tag('h3') |
86 self.h3_tags = self.find_tag('h3') |
103 self.h4_tags = self.find_tag('h4') |
87 self.h4_tags = self.find_tag('h4') |
104 self.input_tags = self.find_tag('input') |
88 self.input_tags = self.find_tag('input') |
105 self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags] |
89 self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags] |
106 |
90 |
def find_tag(self, tag):
    """Collect information about every <tag> element found in the page.

    For ``a`` and ``input`` tags the result is a list of
    ``(text, attributes)`` pairs; for any other tag it is a list holding
    the concatenated text content of each matching element.
    """
    namespace = self.default_ns
    if namespace is not None:
        # qualify the tag name when the document declares a default namespace
        path = ".//{%s}%s" % (namespace, tag)
    else:
        path = ".//%s" % tag
    matches = self.etree.iterfind(path)
    if tag == 'a' or tag == 'input':
        return [(node.text, node.attrib) for node in matches]
    return [u''.join(node.xpath('.//text()')) for node in matches]
116 |
100 |
def appears(self, text):
    """Tell whether <text> occurs in the page's raw text (``self.raw_text``)."""
    page_text = self.raw_text
    return text in page_text
120 |
104 |
def __contains__(self, text):
    """Support ``text in page``: membership is tested against ``self.source``."""
    markup = self.source
    return text in markup
123 |
107 |
124 def has_title(self, text, level=None): |
108 def has_title(self, text, level=None): |
125 """returns True if <h?>text</h?> |
109 """returns True if <h?>text</h?> |
126 |
110 |
127 :param level: the title's level (1 for h1, 2 for h2, etc.) |
111 :param level: the title's level (1 for h1, 2 for h2, etc.) |
128 """ |
112 """ |
148 hlist = self.title_tags[level - 1] |
132 hlist = self.title_tags[level - 1] |
149 for title in hlist: |
133 for title in hlist: |
150 if sre.match(title): |
134 if sre.match(title): |
151 return True |
135 return True |
152 return False |
136 return False |
153 |
137 |
154 def has_link(self, text, url=None): |
138 def has_link(self, text, url=None): |
155 """returns True if <a href=url>text</a> was found in the page""" |
139 """returns True if <a href=url>text</a> was found in the page""" |
156 for link_text, attrs in self.a_tags: |
140 for link_text, attrs in self.a_tags: |
157 if text == link_text: |
141 if text == link_text: |
158 if url is None: |
142 if url is None: |
162 if href == url: |
146 if href == url: |
163 return True |
147 return True |
164 except KeyError: |
148 except KeyError: |
165 continue |
149 continue |
166 return False |
150 return False |
167 |
151 |
168 def has_link_regexp(self, pattern, url=None): |
152 def has_link_regexp(self, pattern, url=None): |
169 """returns True if <a href=url>pattern</a> was found in the page""" |
153 """returns True if <a href=url>pattern</a> was found in the page""" |
170 sre = re.compile(pattern) |
154 sre = re.compile(pattern) |
171 for link_text, attrs in self.a_tags: |
155 for link_text, attrs in self.a_tags: |
172 if sre.match(link_text): |
156 if sre.match(link_text): |