devtools/htmlparser.py
changeset 1808 aa09e20dd8c0
parent 1485 4d532f3c012e
child 1945 2b59d9ae17ae
equal deleted inserted replaced
1693:49075f57cf2c 1808:aa09e20dd8c0
     1 """defines a validating HTML parser used in web application tests"""
     1 """defines a validating HTML parser used in web application tests"""
     2 
     2 
     3 import re
     3 import re
     4 from StringIO import StringIO
       
     5 
     4 
     6 from lxml import etree
     5 from lxml import etree
     7 from lxml.builder import E
       
     8 
     6 
     9 from cubicweb.common.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE, CW_XHTML_EXTENSIONS
     7 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
    10 
     8 STRICT_DOCTYPE = str(STRICT_DOCTYPE)
    11 STRICT_DOCTYPE = str(STRICT_DOCTYPE % CW_XHTML_EXTENSIONS).strip()
     9 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
    12 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE % CW_XHTML_EXTENSIONS).strip()
       
    13 
    10 
    14 ERR_COUNT = 0
    11 ERR_COUNT = 0
    15 
    12 
    16 class Validator(object):
    13 class Validator(object):
    17     
    14 
    18     def parse_string(self, data, sysid=None):
    15     def parse_string(self, data, sysid=None):
    19         try:
    16         try:
    20             data = self.preprocess_data(data)
    17             data = self.preprocess_data(data)
    21             return PageInfo(data, etree.fromstring(data, self.parser))
    18             return PageInfo(data, etree.fromstring(data, self.parser))
    22         except etree.XMLSyntaxError, exc:
    19         except etree.XMLSyntaxError, exc:
    53             blockquotes = tree.findall('.//blockquote')
    50             blockquotes = tree.findall('.//blockquote')
    54         # quick and dirty approach: remove all blockquotes
    51         # quick and dirty approach: remove all blockquotes
    55         for blockquote in blockquotes:
    52         for blockquote in blockquotes:
    56             parent = blockquote.getparent()
    53             parent = blockquote.getparent()
    57             parent.remove(blockquote)
    54             parent.remove(blockquote)
    58 ##         # for each blockquote, wrap unauthorized child in a div
       
    59 ##         for blockquote in blockquotes:
       
    60 ##             if len(blockquote):
       
    61 ##                 needs_wrap = [(index, child) for index, child in enumerate(blockquote)
       
    62 ##                               if child.tag not in expected]
       
    63 ##                 for index, child in needs_wrap:
       
    64 ##                     # the child is automatically popped from blockquote when
       
    65 ##                     # its parent is changed
       
    66 ##                     div = E.div(child)
       
    67 ##                     blockquote.insert(index, div)
       
    68 ##             elif blockquote.text:
       
    69 ##                 div = E.div(blockquote.text)
       
    70 ##                 blockquote.text = None
       
    71 ##                 blockquote.append(div)
       
    72         data = etree.tostring(tree)
    55         data = etree.tostring(tree)
    73         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (STRICT_DOCTYPE, data)
    56         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
       
    57             STRICT_DOCTYPE, data)
    74 
    58 
    75    
    59 
    76 class SaxOnlyValidator(Validator):
    60 class SaxOnlyValidator(Validator):
    77 
    61 
    78     def __init__(self):
    62     def __init__(self):
    79         Validator.__init__(self)
    63         Validator.__init__(self)
    80         self.parser = etree.XMLParser()
    64         self.parser = etree.XMLParser()
    83 
    67 
    84     def __init__(self):
    68     def __init__(self):
    85         Validator.__init__(self)
    69         Validator.__init__(self)
    86         self.parser = etree.HTMLParser()
    70         self.parser = etree.HTMLParser()
    87 
    71 
    88     
    72 
    89 
    73 
    90 class PageInfo(object):
    74 class PageInfo(object):
    91     """holds various informations on the view's output"""
    75     """holds various informations on the view's output"""
    92     def __init__(self, source, root):
    76     def __init__(self, source, root):
    93         self.source = source
    77         self.source = source
   101         self.h2_tags = self.find_tag('h2')
    85         self.h2_tags = self.find_tag('h2')
   102         self.h3_tags = self.find_tag('h3')
    86         self.h3_tags = self.find_tag('h3')
   103         self.h4_tags = self.find_tag('h4')
    87         self.h4_tags = self.find_tag('h4')
   104         self.input_tags = self.find_tag('input')
    88         self.input_tags = self.find_tag('input')
   105         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
    89         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
   106         
    90 
   107     def find_tag(self, tag):
    91     def find_tag(self, tag):
   108         """return a list which contains text of all "tag" elements """
    92         """return a list which contains text of all "tag" elements """
   109         if self.default_ns is None:
    93         if self.default_ns is None:
   110             iterstr = ".//%s" % tag
    94             iterstr = ".//%s" % tag
   111         else:
    95         else:
   112             iterstr = ".//{%s}%s" % (self.default_ns, tag)
    96             iterstr = ".//{%s}%s" % (self.default_ns, tag)
   113         if tag in ('a', 'input'):
    97         if tag in ('a', 'input'):
   114             return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
    98             return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
   115         return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]
    99         return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]
   116          
   100 
   117     def appears(self, text):
   101     def appears(self, text):
   118         """returns True if <text> appears in the page"""
   102         """returns True if <text> appears in the page"""
   119         return text in self.raw_text
   103         return text in self.raw_text
   120 
   104 
   121     def __contains__(self, text):
   105     def __contains__(self, text):
   122         return text in self.source
   106         return text in self.source
   123     
   107 
   124     def has_title(self, text, level=None):
   108     def has_title(self, text, level=None):
   125         """returns True if <h?>text</h?>
   109         """returns True if <h?>text</h?>
   126 
   110 
   127         :param level: the title's level (1 for h1, 2 for h2, etc.)
   111         :param level: the title's level (1 for h1, 2 for h2, etc.)
   128         """
   112         """
   148             hlist = self.title_tags[level - 1]
   132             hlist = self.title_tags[level - 1]
   149             for title in hlist:
   133             for title in hlist:
   150                 if sre.match(title):
   134                 if sre.match(title):
   151                     return True
   135                     return True
   152             return False
   136             return False
   153     
   137 
   154     def has_link(self, text, url=None):
   138     def has_link(self, text, url=None):
   155         """returns True if <a href=url>text</a> was found in the page"""
   139         """returns True if <a href=url>text</a> was found in the page"""
   156         for link_text, attrs in self.a_tags:
   140         for link_text, attrs in self.a_tags:
   157             if text == link_text:
   141             if text == link_text:
   158                 if url is None:
   142                 if url is None:
   162                     if href == url:
   146                     if href == url:
   163                         return True
   147                         return True
   164                 except KeyError:
   148                 except KeyError:
   165                     continue
   149                     continue
   166         return False
   150         return False
   167     
   151 
   168     def has_link_regexp(self, pattern, url=None):
   152     def has_link_regexp(self, pattern, url=None):
   169         """returns True if <a href=url>pattern</a> was found in the page"""
   153         """returns True if <a href=url>pattern</a> was found in the page"""
   170         sre = re.compile(pattern)
   154         sre = re.compile(pattern)
   171         for link_text, attrs in self.a_tags:
   155         for link_text, attrs in self.a_tags:
   172             if sre.match(link_text):
   156             if sre.match(link_text):