author | Adrien Di Mascio <Adrien.DiMascio@logilab.fr> |
Mon, 10 Nov 2008 19:33:55 +0100 | |
changeset 16 | a70ece4d9d1a |
parent 0 | b97547f5f1fa |
child 781 | 323656dd85a9 |
permissions | -rw-r--r-- |
"""defines a validating HTML parser used in web application tests"""

import re
from StringIO import StringIO

from lxml import etree
from lxml.builder import E

from cubicweb.common.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE, CW_XHTML_EXTENSIONS

# the doctype templates are specialized once with the cubicweb XHTML extensions
STRICT_DOCTYPE = str(STRICT_DOCTYPE % CW_XHTML_EXTENSIONS).strip()
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE % CW_XHTML_EXTENSIONS).strip()

ERR_COUNT = 0


class Validator(object):
    """base class for validators: parse a page and wrap the result in a
    `PageInfo`

    Subclasses set `self.parser` to the lxml parser they want to use.
    """

    # etree.fromstring() uses lxml's default parser when given None, so a
    # bare Validator stays usable even though it installs no parser itself
    parser = None

    def parse_string(self, data, sysid=None):
        """parse `data` and return the matching `PageInfo`

        :param data: the page source to parse
        :param sysid: accepted but not used by this implementation
        :raise AssertionError: when lxml reports a syntax error; the
          exception carries lxml's `position` attribute
        """
        try:
            # preprocessing may itself parse the document, hence it has to
            # live inside the try block
            data = self.preprocess_data(data)
            return PageInfo(data, etree.fromstring(data, self.parser))
        except etree.XMLSyntaxError as exc:
            # NOTE(review): save_in is never called from this module;
            # presumably a helper to dump the faulty page from a debugger
            # -- confirm before removing
            def save_in(fname=''):
                stream = open(fname, 'w')
                try:
                    stream.write(data)
                finally:
                    stream.close()
            new_exc = AssertionError(u'invalid xml %s' % exc)
            new_exc.position = exc.position
            raise new_exc

    def preprocess_data(self, data):
        """hook called on the page source before parsing; returns `data`
        unchanged by default
        """
        return data


class DTDValidator(Validator):
    """validator parsing pages with DTD validation turned on"""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser(dtd_validation=True)

    def preprocess_data(self, data):
        """used to fix potential blockquote mess generated by docutils

        Pages which do not declare the strict doctype are returned untouched.
        Otherwise the page is parsed against the transitional DTD, every
        <blockquote> element is removed, and the result is serialized again
        under the strict doctype.
        """
        if STRICT_DOCTYPE not in data:
            return data
        # parse using transitional DTD
        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
        tree = etree.fromstring(data, self.parser)
        namespace = tree.nsmap.get(None)
        # this is the list of authorized child tags for <blockquote> nodes
        # (only consumed by the disabled wrapping code below)
        expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
                   'fieldset table form noscript ins del script'.split()
        if namespace:
            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
            expected = ['{%s}%s' % (namespace, tag) for tag in expected]
        else:
            blockquotes = tree.findall('.//blockquote')
        # quick and dirty approach: remove all blockquotes
        # NOTE: lxml's remove() also drops the tail text of the removed element
        for blockquote in blockquotes:
            parent = blockquote.getparent()
            parent.remove(blockquote)
##         # for each blockquote, wrap unauthorized child in a div
##         for blockquote in blockquotes:
##             if len(blockquote):
##                 needs_wrap = [(index, child) for index, child in enumerate(blockquote)
##                               if child.tag not in expected]
##                 for index, child in needs_wrap:
##                     # the child is automatically popped from blockquote when
##                     # its parent is changed
##                     div = E.div(child)
##                     blockquote.insert(index, div)
##             elif blockquote.text:
##                 div = E.div(blockquote.text)
##                 blockquote.text = None
##                 blockquote.append(div)
        data = etree.tostring(tree)
        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (STRICT_DOCTYPE, data)


class SaxOnlyValidator(Validator):
    """validator using a plain `etree.XMLParser` (no DTD validation requested)"""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser()


class HTMLValidator(Validator):
    """validator using lxml's `etree.HTMLParser`"""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.HTMLParser()


class PageInfo(object):
    """holds various informations on the view's output"""

    def __init__(self, source, root):
        """
        :param source: the page source, as given to the parser
        :param root: the root element of the parsed tree
        """
        self.source = source
        self.etree = root
        # every text node of the document, concatenated
        self.raw_text = u''.join(root.xpath('//text()'))
        self.namespace = self.etree.nsmap
        self.default_ns = self.namespace.get(None)
        self.a_tags = self.find_tag('a')
        self.h1_tags = self.find_tag('h1')
        self.h2_tags = self.find_tag('h2')
        self.h3_tags = self.find_tag('h3')
        self.h4_tags = self.find_tag('h4')
        self.input_tags = self.find_tag('input')
        # index i holds the texts of the <h(i+1)> elements
        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]

    def find_tag(self, tag):
        """return a list describing all "tag" elements

        For 'a' and 'input' each item is a `(text, attrib)` tuple, where
        `text` is the element's leading text (None when there is none). For
        any other tag each item is the concatenated text found in the element.
        """
        if self.default_ns is None:
            iterstr = ".//%s" % tag
        else:
            iterstr = ".//{%s}%s" % (self.default_ns, tag)
        if tag in ('a', 'input'):
            return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
        return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]

    def appears(self, text):
        """returns True if <text> appears in the page"""
        return text in self.raw_text

    def __contains__(self, text):
        """returns True if <text> appears in the page source (markup included)"""
        return text in self.source

    def _title_lists(self, level):
        """return the lists of title texts to search: all of them when level
        is None, else only the one for <h{level}>
        """
        if level is None:
            return self.title_tags
        return [self.title_tags[level - 1]]

    def has_title(self, text, level=None):
        """returns True if <h?>text</h?>

        The whole text of the title has to be equal to `text`.

        :param level: the title's level (1 for h1, 2 for h2, etc.)
        """
        for hlist in self._title_lists(level):
            if text in hlist:
                return True
        return False

    def has_title_regexp(self, pattern, level=None):
        """returns True if <h?>pattern</h?>

        `pattern` is matched from the beginning of the title's text.

        :param level: the title's level (1 for h1, 2 for h2, etc.)
        """
        sre = re.compile(pattern)
        for hlist in self._title_lists(level):
            for title in hlist:
                if sre.match(title):
                    return True
        return False

    def has_link(self, text, url=None):
        """returns True if <a href=url>text</a> was found in the page

        When `url` is None, the link's target is not checked.
        """
        for link_text, attrs in self.a_tags:
            if text != link_text:
                continue
            # a link without href can only match when no url is requested
            if url is None or attrs.get('href') == url:
                return True
        return False

    def has_link_regexp(self, pattern, url=None):
        """returns True if <a href=url>pattern</a> was found in the page

        `pattern` is matched from the beginning of the link's text. When
        `url` is None, the link's target is not checked.
        """
        sre = re.compile(pattern)
        for link_text, attrs in self.a_tags:
            # link_text is None for anchors without leading text (e.g. a link
            # wrapping only an image); re can't match against None
            if link_text is None or not sre.match(link_text):
                continue
            if url is None or attrs.get('href') == url:
                return True
        return False