devtools/htmlparser.py
changeset 0 b97547f5f1fa
child 781 323656dd85a9
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/devtools/htmlparser.py	Wed Nov 05 15:52:50 2008 +0100
@@ -0,0 +1,181 @@
+"""defines a validating HTML parser used in web application tests"""
+
+import re
+from StringIO import StringIO
+
+from lxml import etree
+from lxml.builder import E
+
+from cubicweb.common.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE, CW_XHTML_EXTENSIONS
+
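+# STRICT_DOCTYPE and TRANSITIONAL_DOCTYPE are format strings expecting extra
+# DTD declarations; interpolating CW_XHTML_EXTENSIONS should yield the exact
+# doctypes emitted in the generated pages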
+STRICT_DOCTYPE = str(STRICT_DOCTYPE % CW_XHTML_EXTENSIONS).strip()
+TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE % CW_XHTML_EXTENSIONS).strip()
+
+ERR_COUNT = 0
+
+class Validator(object):
+    
+    def parse_string(self, data, sysid=None):
+        try:
+            data = self.preprocess_data(data)
+            return PageInfo(data, etree.fromstring(data, self.parser))
+        except etree.XMLSyntaxError, exc:
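+            # debugging helper: never called automatically, presumably meant
+            # to be invoked by hand (e.g. from pdb) to dump the faulty output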
+            def save_in(fname=''):
+                file(fname, 'w').write(data)
+            new_exc = AssertionError(u'invalid xml %s' % exc)
+            new_exc.position = exc.position
+            raise new_exc
+
+    def preprocess_data(self, data):
+        return data
+
+
+class DTDValidator(Validator):
+    def __init__(self):
+        Validator.__init__(self)
+        self.parser = etree.XMLParser(dtd_validation=True)
+
+    def preprocess_data(self, data):
+        """used to fix potential blockquote mess generated by docutils"""
+        if STRICT_DOCTYPE not in data:
+            return data
+        # parse using transitional DTD
+        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
+        tree = etree.fromstring(data, self.parser)
+        namespace = tree.nsmap.get(None)
+        # this is the list of authorized child tags for <blockquote> nodes
+        expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
+                   'fieldset table form noscript ins del script'.split()
+        if namespace:
+            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
+            expected = ['{%s}%s' % (namespace, tag) for tag in expected]
+        else:
+            blockquotes = tree.findall('.//blockquote')
+        # quick and dirty approach: remove all blockquotes
+        for blockquote in blockquotes:
+            parent = blockquote.getparent()
+            parent.remove(blockquote)
+##         # for each blockquote, wrap unauthorized child in a div
+##         for blockquote in blockquotes:
+##             if len(blockquote):
+##                 needs_wrap = [(index, child) for index, child in enumerate(blockquote)
+##                               if child.tag not in expected]
+##                 for index, child in needs_wrap:
+##                     # the child is automatically popped from blockquote when
+##                     # its parent is changed
+##                     div = E.div(child)
+##                     blockquote.insert(index, div)
+##             elif blockquote.text:
+##                 div = E.div(blockquote.text)
+##                 blockquote.text = None
+##                 blockquote.append(div)
+        data = etree.tostring(tree)
+        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (STRICT_DOCTYPE, data)
+
+
+class SaxOnlyValidator(Validator):
+
+    def __init__(self):
+        Validator.__init__(self)
+        self.parser = etree.XMLParser()
+
+class HTMLValidator(Validator):
+
+    def __init__(self):
+        Validator.__init__(self)
+        self.parser = etree.HTMLParser()
+
+
+
+class PageInfo(object):
+    """holds various informations on the view's output"""
+    def __init__(self, source, root):
+        self.source = source
+        self.etree = root
+        self.raw_text = u''.join(root.xpath('//text()'))
+        self.namespace = self.etree.nsmap
+        self.default_ns = self.namespace.get(None)
+        self.a_tags = self.find_tag('a')
+        self.h1_tags = self.find_tag('h1')
+        self.h2_tags = self.find_tag('h2')
+        self.h3_tags = self.find_tag('h3')
+        self.h4_tags = self.find_tag('h4')
+        self.input_tags = self.find_tag('input')
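+        # note: title_tags is indexed by heading level - 1 (cf. has_title)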
+        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
+        
+    def find_tag(self, tag):
+        """return a list which contains text of all "tag" elements """
+        if self.default_ns is None:
+            iterstr = ".//%s" % tag
+        else:
+            iterstr = ".//{%s}%s" % (self.default_ns, tag)
+        if tag in ('a', 'input'):
+            return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
+        return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]
+         
+    def appears(self, text):
+        """returns True if <text> appears in the page"""
+        return text in self.raw_text
+
+    def __contains__(self, text):
+        return text in self.source
+    
+    def has_title(self, text, level=None):
+        """returns True if <h?>text</h?>
+
+        :param level: the title's level (1 for h1, 2 for h2, etc.)
+        """
+        if level is None:
+            for hlist in self.title_tags:
+                if text in hlist:
+                    return True
+            return False
+        else:
+            hlist = self.title_tags[level - 1]
+            return text in hlist
+
+    def has_title_regexp(self, pattern, level=None):
+        """returns True if <h?>pattern</h?>"""
+        sre = re.compile(pattern)
+        if level is None:
+            for hlist in self.title_tags:
+                for title in hlist:
+                    if sre.match(title):
+                        return True
+            return False
+        else:
+            hlist = self.title_tags[level - 1]
+            for title in hlist:
+                if sre.match(title):
+                    return True
+            return False
+    
+    def has_link(self, text, url=None):
+        """returns True if <a href=url>text</a> was found in the page"""
+        for link_text, attrs in self.a_tags:
+            if text == link_text:
+                if url is None:
+                    return True
+                try:
+                    href = attrs['href']
+                    if href == url:
+                        return True
+                except KeyError:
+                    continue
+        return False
+    
+    def has_link_regexp(self, pattern, url=None):
+        """returns True if <a href=url>pattern</a> was found in the page"""
+        sre = re.compile(pattern)
+        for link_text, attrs in self.a_tags:
+            if sre.match(link_text):
+                if url is None:
+                    return True
+                try:
+                    href = attrs['href']
+                    if href == url:
+                        return True
+                except KeyError:
+                    continue
+        return False
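+
+# Minimal usage sketch (illustrative only; the page output and expected
+# values below are hypothetical):
+#
+#   validator = DTDValidator()
+#   page = validator.parse_string(some_view_output)
+#   assert page.has_title(u'my title', level=1)
+#   assert page.has_link(u'home', url='http://example.org/')
+#   assert 'some text' in page          # uses PageInfo.__contains__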