# Reconstructed from the flattened diff of changeset b97547f5f1fa, which adds
# devtools/htmlparser.py (Wed Nov 05 15:52:50 2008 +0100).
"""defines a validating HTML parser used in web application tests"""

import re
from StringIO import StringIO

from lxml import etree
from lxml.builder import E

from cubicweb.common.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE, CW_XHTML_EXTENSIONS

# interpolate the CubicWeb XHTML extensions into both doctype templates once,
# so that they can be searched for / substituted in page sources below
STRICT_DOCTYPE = str(STRICT_DOCTYPE % CW_XHTML_EXTENSIONS).strip()
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE % CW_XHTML_EXTENSIONS).strip()

ERR_COUNT = 0


class Validator(object):
    """Base class of the validators.

    `parse_string` reads ``self.parser``, which this base class does not
    define: each subclass sets it in its ``__init__``.
    """

    def parse_string(self, data, sysid=None):
        """parse `data` and return a `PageInfo` wrapping the resulting tree

        :param data: the markup to parse (run through `preprocess_data` first)
        :param sysid: not used by this implementation
        :raise AssertionError: when lxml reports an XML syntax error; the
          raised exception carries the original error's `position` attribute
        """
        try:
            data = self.preprocess_data(data)
            return PageInfo(data, etree.fromstring(data, self.parser))
        except etree.XMLSyntaxError as exc:
            def save_in(fname=''):
                # NOTE(review): never called in this module; presumably kept
                # so the offending markup can be dumped to a file from a
                # post-mortem debugger session -- confirm before removing.
                with open(fname, 'w') as stream:
                    stream.write(data)
            new_exc = AssertionError(u'invalid xml %s' % exc)
            new_exc.position = exc.position
            raise new_exc

    def preprocess_data(self, data):
        """hook letting subclasses alter `data` before parsing; identity here"""
        return data


class DTDValidator(Validator):
    """validator using an XML parser with DTD validation turned on"""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser(dtd_validation=True)

    def preprocess_data(self, data):
        """used to fix potential blockquote mess generated by docutils

        Data which does not contain the strict doctype is returned untouched.
        Otherwise the document is parsed against the transitional doctype,
        every <blockquote> element is removed from the tree, and the tree is
        serialized again behind the strict doctype.
        """
        if STRICT_DOCTYPE not in data:
            return data
        # parse using transitional DTD
        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
        tree = etree.fromstring(data, self.parser)
        namespace = tree.nsmap.get(None)
        if namespace:
            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
        else:
            blockquotes = tree.findall('.//blockquote')
        # quick and dirty approach: remove all blockquotes.
        # A finer-grained alternative would keep each <blockquote> and wrap
        # the children which are not authorized directly under it in a <div>.
        for blockquote in blockquotes:
            blockquote.getparent().remove(blockquote)
        data = etree.tostring(tree)
        return '%s\n%s' % (STRICT_DOCTYPE, data)


class SaxOnlyValidator(Validator):
    """validator using a plain `etree.XMLParser` (no DTD validation requested)"""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser()


class HTMLValidator(Validator):
    """validator using lxml's `etree.HTMLParser`"""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.HTMLParser()


class PageInfo(object):
    """holds various informations on the view's output"""

    def __init__(self, source, root):
        """
        :param source: the page's source, as handed to the parser
        :param root: the root element of the parsed tree
        """
        self.source = source
        self.etree = root
        self.raw_text = u''.join(root.xpath('//text()'))
        self.namespace = self.etree.nsmap
        self.default_ns = self.namespace.get(None)
        self.a_tags = self.find_tag('a')
        self.h1_tags = self.find_tag('h1')
        self.h2_tags = self.find_tag('h2')
        self.h3_tags = self.find_tag('h3')
        self.h4_tags = self.find_tag('h4')
        self.input_tags = self.find_tag('input')
        # index i holds the titles of level i + 1
        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]

    def find_tag(self, tag):
        """return a list describing all `tag` elements of the page

        For 'a' and 'input', items are (text, attributes) pairs built from
        each element's `text` and `attrib`; for any other tag, items are the
        concatenation of all the text found under the element.
        """
        if self.default_ns is None:
            iterstr = ".//%s" % tag
        else:
            iterstr = ".//{%s}%s" % (self.default_ns, tag)
        if tag in ('a', 'input'):
            return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
        return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]

    def appears(self, text):
        """returns True if `text` appears in the page's text"""
        return text in self.raw_text

    def __contains__(self, text):
        """returns True if `text` appears in the page's source"""
        return text in self.source

    def _title_lists(self, level=None):
        """return the title lists to search: all of them or the one for `level`

        :raise IndexError: if `level` is not a valid title level; without
          this check, 0 or a negative level would silently wrap around to
          another level's titles
        """
        if level is None:
            return self.title_tags
        if not 1 <= level <= len(self.title_tags):
            raise IndexError('title level out of range: %r' % (level,))
        return [self.title_tags[level - 1]]

    def has_title(self, text, level=None):
        """returns True if `text` is exactly the text of one of the titles

        :param level: the title's level (1 for h1, 2 for h2, etc.); when
          None, every level is searched
        :raise IndexError: if `level` is not a valid title level
        """
        return any(text in titles for titles in self._title_lists(level))

    def has_title_regexp(self, pattern, level=None):
        """returns True if `pattern` matches the beginning of a title's text

        :param level: the title's level (1 for h1, 2 for h2, etc.); when
          None, every level is searched
        :raise IndexError: if `level` is not a valid title level
        """
        sre = re.compile(pattern)
        for titles in self._title_lists(level):
            for title in titles:
                if sre.match(title):
                    return True
        return False

    def _has_matching_link(self, text_matches, url=None):
        """returns True if a link satisfies `text_matches` and targets `url`

        :param text_matches: callable given the link's text (possibly None)
        :param url: expected value of the href attribute; when None the href
          is not checked. A link without href never matches a given url.
        """
        for link_text, attrs in self.a_tags:
            if not text_matches(link_text):
                continue
            if url is None or attrs.get('href') == url:
                return True
        return False

    def has_link(self, text, url=None):
        """returns True if a link whose text is `text` was found in the page

        :param url: when given, the link's href must also be equal to it
        """
        return self._has_matching_link(lambda link_text: text == link_text, url)

    def has_link_regexp(self, pattern, url=None):
        """returns True if a link whose text matches `pattern` was found

        Links without text (text is None) are skipped rather than handed to
        the regular expression, which would raise a TypeError.

        :param url: when given, the link's href must also be equal to it
        """
        sre = re.compile(pattern)

        def text_matches(link_text):
            return link_text is not None and sre.match(link_text) is not None

        return self._has_matching_link(text_matches, url)