devtools/htmlparser.py
changeset 0 b97547f5f1fa
child 781 323656dd85a9
equal deleted inserted replaced
-1:000000000000 0:b97547f5f1fa
       
     1 """defines a validating HTML parser used in web application tests"""
       
     2 
       
     3 import re
       
     4 from StringIO import StringIO
       
     5 
       
     6 from lxml import etree
       
     7 from lxml.builder import E
       
     8 
       
     9 from cubicweb.common.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE, CW_XHTML_EXTENSIONS
       
    10 
       
# Rebind the doctype templates imported above to their final form: each
# template is interpolated with CW_XHTML_EXTENSIONS, converted with str() and
# stripped of surrounding whitespace.  DTDValidator.preprocess_data searches
# for / substitutes these exact strings in the page source.
STRICT_DOCTYPE = str(STRICT_DOCTYPE % CW_XHTML_EXTENSIONS).strip()
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE % CW_XHTML_EXTENSIONS).strip()

# NOTE(review): not referenced anywhere in the visible part of this module;
# presumably an error counter used by other code -- confirm before removing.
ERR_COUNT = 0
       
    15 
       
    16 class Validator(object):
       
    17     
       
    18     def parse_string(self, data, sysid=None):
       
    19         try:
       
    20             data = self.preprocess_data(data)
       
    21             return PageInfo(data, etree.fromstring(data, self.parser))
       
    22         except etree.XMLSyntaxError, exc:
       
    23             def save_in(fname=''):
       
    24                 file(fname, 'w').write(data)
       
    25             new_exc = AssertionError(u'invalid xml %s' % exc)
       
    26             new_exc.position = exc.position
       
    27             raise new_exc
       
    28 
       
    29     def preprocess_data(self, data):
       
    30         return data
       
    31 
       
    32 
       
    33 class DTDValidator(Validator):
       
    34     def __init__(self):
       
    35         Validator.__init__(self)
       
    36         self.parser = etree.XMLParser(dtd_validation=True)
       
    37 
       
    38     def preprocess_data(self, data):
       
    39         """used to fix potential blockquote mess generated by docutils"""
       
    40         if STRICT_DOCTYPE not in data:
       
    41             return data
       
    42         # parse using transitional DTD
       
    43         data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
       
    44         tree = etree.fromstring(data, self.parser)
       
    45         namespace = tree.nsmap.get(None)
       
    46         # this is the list of authorized child tags for <blockquote> nodes
       
    47         expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
       
    48                    'fieldset table form noscript ins del script'.split()
       
    49         if namespace:
       
    50             blockquotes = tree.findall('.//{%s}blockquote' % namespace)
       
    51             expected = ['{%s}%s' % (namespace, tag) for tag in expected]
       
    52         else:
       
    53             blockquotes = tree.findall('.//blockquote')
       
    54         # quick and dirty approach: remove all blockquotes
       
    55         for blockquote in blockquotes:
       
    56             parent = blockquote.getparent()
       
    57             parent.remove(blockquote)
       
    58 ##         # for each blockquote, wrap unauthorized child in a div
       
    59 ##         for blockquote in blockquotes:
       
    60 ##             if len(blockquote):
       
    61 ##                 needs_wrap = [(index, child) for index, child in enumerate(blockquote)
       
    62 ##                               if child.tag not in expected]
       
    63 ##                 for index, child in needs_wrap:
       
    64 ##                     # the child is automatically popped from blockquote when
       
    65 ##                     # its parent is changed
       
    66 ##                     div = E.div(child)
       
    67 ##                     blockquote.insert(index, div)
       
    68 ##             elif blockquote.text:
       
    69 ##                 div = E.div(blockquote.text)
       
    70 ##                 blockquote.text = None
       
    71 ##                 blockquote.append(div)
       
    72         data = etree.tostring(tree)
       
    73         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (STRICT_DOCTYPE, data)
       
    74 
       
    75    
       
    76 class SaxOnlyValidator(Validator):
       
    77 
       
    78     def __init__(self):
       
    79         Validator.__init__(self)
       
    80         self.parser = etree.XMLParser()
       
    81 
       
    82 class HTMLValidator(Validator):
       
    83 
       
    84     def __init__(self):
       
    85         Validator.__init__(self)
       
    86         self.parser = etree.HTMLParser()
       
    87 
       
    88     
       
    89 
       
    90 class PageInfo(object):
       
    91     """holds various informations on the view's output"""
       
    92     def __init__(self, source, root):
       
    93         self.source = source
       
    94         self.etree = root
       
    95         self.source = source
       
    96         self.raw_text = u''.join(root.xpath('//text()'))
       
    97         self.namespace = self.etree.nsmap
       
    98         self.default_ns = self.namespace.get(None)
       
    99         self.a_tags = self.find_tag('a')
       
   100         self.h1_tags = self.find_tag('h1')
       
   101         self.h2_tags = self.find_tag('h2')
       
   102         self.h3_tags = self.find_tag('h3')
       
   103         self.h4_tags = self.find_tag('h4')
       
   104         self.input_tags = self.find_tag('input')
       
   105         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
       
   106         
       
   107     def find_tag(self, tag):
       
   108         """return a list which contains text of all "tag" elements """
       
   109         if self.default_ns is None:
       
   110             iterstr = ".//%s" % tag
       
   111         else:
       
   112             iterstr = ".//{%s}%s" % (self.default_ns, tag)
       
   113         if tag in ('a', 'input'):
       
   114             return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
       
   115         return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]
       
   116          
       
   117     def appears(self, text):
       
   118         """returns True if <text> appears in the page"""
       
   119         return text in self.raw_text
       
   120 
       
   121     def __contains__(self, text):
       
   122         return text in self.source
       
   123     
       
   124     def has_title(self, text, level=None):
       
   125         """returns True if <h?>text</h?>
       
   126 
       
   127         :param level: the title's level (1 for h1, 2 for h2, etc.)
       
   128         """
       
   129         if level is None:
       
   130             for hlist in self.title_tags:
       
   131                 if text in hlist:
       
   132                     return True
       
   133             return False
       
   134         else:
       
   135             hlist = self.title_tags[level - 1]
       
   136             return text in hlist
       
   137 
       
   138     def has_title_regexp(self, pattern, level=None):
       
   139         """returns True if <h?>pattern</h?>"""
       
   140         sre = re.compile(pattern)
       
   141         if level is None:
       
   142             for hlist in self.title_tags:
       
   143                 for title in hlist:
       
   144                     if sre.match(title):
       
   145                         return True
       
   146             return False
       
   147         else:
       
   148             hlist = self.title_tags[level - 1]
       
   149             for title in hlist:
       
   150                 if sre.match(title):
       
   151                     return True
       
   152             return False
       
   153     
       
   154     def has_link(self, text, url=None):
       
   155         """returns True if <a href=url>text</a> was found in the page"""
       
   156         for link_text, attrs in self.a_tags:
       
   157             if text == link_text:
       
   158                 if url is None:
       
   159                     return True
       
   160                 try:
       
   161                     href = attrs['href']
       
   162                     if href == url:
       
   163                         return True
       
   164                 except KeyError:
       
   165                     continue
       
   166         return False
       
   167     
       
   168     def has_link_regexp(self, pattern, url=None):
       
   169         """returns True if <a href=url>pattern</a> was found in the page"""
       
   170         sre = re.compile(pattern)
       
   171         for link_text, attrs in self.a_tags:
       
   172             if sre.match(link_text):
       
   173                 if url is None:
       
   174                     return True
       
   175                 try:
       
   176                     href = attrs['href']
       
   177                     if href == url:
       
   178                         return True
       
   179                 except KeyError:
       
   180                     continue
       
   181         return False