3 import re |
3 import re |
4 |
4 |
5 from lxml import etree |
5 from lxml import etree |
6 |
6 |
7 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE |
7 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE |
|
# Convert the imported doctype constants to plain ``str`` once, at import
# time, so code in this module can use them directly in ``in`` tests,
# ``str.replace`` and ``%`` formatting without wrapping each use in ``str()``.
STRICT_DOCTYPE = str(STRICT_DOCTYPE)
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)

ERR_COUNT = 0
10 |
12 |
11 class Validator(object): |
13 class Validator(object): |
12 |
14 |
13 def parse_string(self, data, sysid=None): |
15 def parse_string(self, data, sysid=None): |
14 try: |
16 try: |
15 data = self.preprocess_data(data) |
17 data = self.preprocess_data(data) |
16 return PageInfo(data, etree.fromstring(data, self.parser)) |
18 return PageInfo(data, etree.fromstring(data, self.parser)) |
17 except etree.XMLSyntaxError, exc: |
19 except etree.XMLSyntaxError, exc: |
30 Validator.__init__(self) |
32 Validator.__init__(self) |
31 self.parser = etree.XMLParser(dtd_validation=True) |
33 self.parser = etree.XMLParser(dtd_validation=True) |
32 |
34 |
33 def preprocess_data(self, data): |
35 def preprocess_data(self, data): |
34 """used to fix potential blockquote mess generated by docutils""" |
36 """used to fix potential blockquote mess generated by docutils""" |
35 if str(STRICT_DOCTYPE) not in data: |
37 if STRICT_DOCTYPE not in data: |
36 return data |
38 return data |
37 # parse using transitional DTD |
39 # parse using transitional DTD |
38 data = data.replace(str(STRICT_DOCTYPE), str(TRANSITIONAL_DOCTYPE)) |
40 data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE) |
39 tree = etree.fromstring(data, self.parser) |
41 tree = etree.fromstring(data, self.parser) |
40 namespace = tree.nsmap.get(None) |
42 namespace = tree.nsmap.get(None) |
41 # this is the list of authorized child tags for <blockquote> nodes |
43 # this is the list of authorized child tags for <blockquote> nodes |
42 expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \ |
44 expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \ |
43 'fieldset table form noscript ins del script'.split() |
45 'fieldset table form noscript ins del script'.split() |
49 # quick and dirty approach: remove all blockquotes |
51 # quick and dirty approach: remove all blockquotes |
50 for blockquote in blockquotes: |
52 for blockquote in blockquotes: |
51 parent = blockquote.getparent() |
53 parent = blockquote.getparent() |
52 parent.remove(blockquote) |
54 parent.remove(blockquote) |
53 data = etree.tostring(tree) |
55 data = etree.tostring(tree) |
54 return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (str(STRICT_DOCTYPE), data) |
56 return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % ( |
|
57 STRICT_DOCTYPE, data) |
55 |
58 |
56 |
59 |
class SaxOnlyValidator(Validator):
    """Validator that parses with a default ``etree.XMLParser()``.

    The parser is created without any option, so no DTD validation is
    requested when parsing.
    """

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser()
64 |
67 |
def __init__(self):
    """Initialise the base ``Validator`` and use lxml's HTML parser."""
    Validator.__init__(self)
    self.parser = etree.HTMLParser()
68 |
71 |
69 |
72 |
70 |
73 |
71 class PageInfo(object): |
74 class PageInfo(object): |
72 """holds various informations on the view's output""" |
75 """holds various informations on the view's output""" |
73 def __init__(self, source, root): |
76 def __init__(self, source, root): |
74 self.source = source |
77 self.source = source |
82 self.h2_tags = self.find_tag('h2') |
85 self.h2_tags = self.find_tag('h2') |
83 self.h3_tags = self.find_tag('h3') |
86 self.h3_tags = self.find_tag('h3') |
84 self.h4_tags = self.find_tag('h4') |
87 self.h4_tags = self.find_tag('h4') |
85 self.input_tags = self.find_tag('input') |
88 self.input_tags = self.find_tag('input') |
86 self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags] |
89 self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags] |
87 |
90 |
def find_tag(self, tag):
    """Return information about every ``tag`` element found in the page.

    The search covers all descendants of ``self.etree``. The tag name is
    qualified with ``self.default_ns`` when that attribute is not None.

    :param tag: unqualified element name, e.g. ``'h1'`` or ``'a'``
    :return: for ``'a'`` and ``'input'``, a list of ``(text, attrib)``
      tuples (the element's own ``text`` and its attribute mapping); for
      any other tag, a list of strings, each being the concatenation of
      all text nodes found under the element
    """
    if self.default_ns is None:
        iterstr = ".//%s" % tag
    else:
        iterstr = ".//{%s}%s" % (self.default_ns, tag)
    elements = self.etree.iterfind(iterstr)
    if tag in ('a', 'input'):
        # links and inputs are mostly identified by their attributes
        # (href, name, value...), so keep them alongside the text
        return [(elt.text, elt.attrib) for elt in elements]
    return [u''.join(elt.xpath('.//text()')) for elt in elements]
97 |
100 |
def appears(self, text):
    """Return True if ``text`` appears in the page's ``self.raw_text``."""
    return text in self.raw_text
101 |
104 |
def __contains__(self, text):
    """Support ``text in page``: True if ``text`` occurs in ``self.source``."""
    return text in self.source
104 |
107 |
105 def has_title(self, text, level=None): |
108 def has_title(self, text, level=None): |
106 """returns True if <h?>text</h?> |
109 """returns True if <h?>text</h?> |
107 |
110 |
108 :param level: the title's level (1 for h1, 2 for h2, etc.) |
111 :param level: the title's level (1 for h1, 2 for h2, etc.) |
109 """ |
112 """ |
129 hlist = self.title_tags[level - 1] |
132 hlist = self.title_tags[level - 1] |
130 for title in hlist: |
133 for title in hlist: |
131 if sre.match(title): |
134 if sre.match(title): |
132 return True |
135 return True |
133 return False |
136 return False |
134 |
137 |
135 def has_link(self, text, url=None): |
138 def has_link(self, text, url=None): |
136 """returns True if <a href=url>text</a> was found in the page""" |
139 """returns True if <a href=url>text</a> was found in the page""" |
137 for link_text, attrs in self.a_tags: |
140 for link_text, attrs in self.a_tags: |
138 if text == link_text: |
141 if text == link_text: |
139 if url is None: |
142 if url is None: |
143 if href == url: |
146 if href == url: |
144 return True |
147 return True |
145 except KeyError: |
148 except KeyError: |
146 continue |
149 continue |
147 return False |
150 return False |
148 |
151 |
149 def has_link_regexp(self, pattern, url=None): |
152 def has_link_regexp(self, pattern, url=None): |
150 """returns True if <a href=url>pattern</a> was found in the page""" |
153 """returns True if <a href=url>pattern</a> was found in the page""" |
151 sre = re.compile(pattern) |
154 sre = re.compile(pattern) |
152 for link_text, attrs in self.a_tags: |
155 for link_text, attrs in self.a_tags: |
153 if sre.match(link_text): |
156 if sre.match(link_text): |