devtools/htmlparser.py
branch tls-sprint
changeset 1485 4d532f3c012e
parent 1480 d3e3d527daf5
child 1945 2b59d9ae17ae
equal deleted inserted replaced
1484:183da3addf0e 1485:4d532f3c012e
     3 import re
     3 import re
     4 
     4 
     5 from lxml import etree
     5 from lxml import etree
     6 
     6 
     7 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
     7 from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
       
     8 STRICT_DOCTYPE = str(STRICT_DOCTYPE)
       
     9 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
     8 
    10 
     9 ERR_COUNT = 0
    11 ERR_COUNT = 0
    10 
    12 
    11 class Validator(object):
    13 class Validator(object):
    12     
    14 
    13     def parse_string(self, data, sysid=None):
    15     def parse_string(self, data, sysid=None):
    14         try:
    16         try:
    15             data = self.preprocess_data(data)
    17             data = self.preprocess_data(data)
    16             return PageInfo(data, etree.fromstring(data, self.parser))
    18             return PageInfo(data, etree.fromstring(data, self.parser))
    17         except etree.XMLSyntaxError, exc:
    19         except etree.XMLSyntaxError, exc:
    30         Validator.__init__(self)
    32         Validator.__init__(self)
    31         self.parser = etree.XMLParser(dtd_validation=True)
    33         self.parser = etree.XMLParser(dtd_validation=True)
    32 
    34 
    33     def preprocess_data(self, data):
    35     def preprocess_data(self, data):
    34         """used to fix potential blockquote mess generated by docutils"""
    36         """used to fix potential blockquote mess generated by docutils"""
    35         if str(STRICT_DOCTYPE) not in data:
    37         if STRICT_DOCTYPE not in data:
    36             return data
    38             return data
    37         # parse using transitional DTD
    39         # parse using transitional DTD
    38         data = data.replace(str(STRICT_DOCTYPE), str(TRANSITIONAL_DOCTYPE))
    40         data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
    39         tree = etree.fromstring(data, self.parser)
    41         tree = etree.fromstring(data, self.parser)
    40         namespace = tree.nsmap.get(None)
    42         namespace = tree.nsmap.get(None)
    41         # this is the list of authorized child tags for <blockquote> nodes
    43         # this is the list of authorized child tags for <blockquote> nodes
    42         expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
    44         expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
    43                    'fieldset table form noscript ins del script'.split()
    45                    'fieldset table form noscript ins del script'.split()
    49         # quick and dirty approach: remove all blockquotes
    51         # quick and dirty approach: remove all blockquotes
    50         for blockquote in blockquotes:
    52         for blockquote in blockquotes:
    51             parent = blockquote.getparent()
    53             parent = blockquote.getparent()
    52             parent.remove(blockquote)
    54             parent.remove(blockquote)
    53         data = etree.tostring(tree)
    55         data = etree.tostring(tree)
    54         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (str(STRICT_DOCTYPE), data)
    56         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
       
    57             STRICT_DOCTYPE, data)
    55 
    58 
    56    
    59 
    57 class SaxOnlyValidator(Validator):
    60 class SaxOnlyValidator(Validator):
    58 
    61 
    59     def __init__(self):
    62     def __init__(self):
    60         Validator.__init__(self)
    63         Validator.__init__(self)
    61         self.parser = etree.XMLParser()
    64         self.parser = etree.XMLParser()
    64 
    67 
    65     def __init__(self):
    68     def __init__(self):
    66         Validator.__init__(self)
    69         Validator.__init__(self)
    67         self.parser = etree.HTMLParser()
    70         self.parser = etree.HTMLParser()
    68 
    71 
    69     
    72 
    70 
    73 
    71 class PageInfo(object):
    74 class PageInfo(object):
    72     """holds various informations on the view's output"""
    75     """holds various informations on the view's output"""
    73     def __init__(self, source, root):
    76     def __init__(self, source, root):
    74         self.source = source
    77         self.source = source
    82         self.h2_tags = self.find_tag('h2')
    85         self.h2_tags = self.find_tag('h2')
    83         self.h3_tags = self.find_tag('h3')
    86         self.h3_tags = self.find_tag('h3')
    84         self.h4_tags = self.find_tag('h4')
    87         self.h4_tags = self.find_tag('h4')
    85         self.input_tags = self.find_tag('input')
    88         self.input_tags = self.find_tag('input')
    86         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
    89         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
    87         
    90 
    88     def find_tag(self, tag):
    91     def find_tag(self, tag):
    89         """return a list which contains text of all "tag" elements """
    92         """return a list which contains text of all "tag" elements """
    90         if self.default_ns is None:
    93         if self.default_ns is None:
    91             iterstr = ".//%s" % tag
    94             iterstr = ".//%s" % tag
    92         else:
    95         else:
    93             iterstr = ".//{%s}%s" % (self.default_ns, tag)
    96             iterstr = ".//{%s}%s" % (self.default_ns, tag)
    94         if tag in ('a', 'input'):
    97         if tag in ('a', 'input'):
    95             return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
    98             return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
    96         return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]
    99         return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]
    97          
   100 
    98     def appears(self, text):
   101     def appears(self, text):
    99         """returns True if <text> appears in the page"""
   102         """returns True if <text> appears in the page"""
   100         return text in self.raw_text
   103         return text in self.raw_text
   101 
   104 
   102     def __contains__(self, text):
   105     def __contains__(self, text):
   103         return text in self.source
   106         return text in self.source
   104     
   107 
   105     def has_title(self, text, level=None):
   108     def has_title(self, text, level=None):
   106         """returns True if <h?>text</h?>
   109         """returns True if <h?>text</h?>
   107 
   110 
   108         :param level: the title's level (1 for h1, 2 for h2, etc.)
   111         :param level: the title's level (1 for h1, 2 for h2, etc.)
   109         """
   112         """
   129             hlist = self.title_tags[level - 1]
   132             hlist = self.title_tags[level - 1]
   130             for title in hlist:
   133             for title in hlist:
   131                 if sre.match(title):
   134                 if sre.match(title):
   132                     return True
   135                     return True
   133             return False
   136             return False
   134     
   137 
   135     def has_link(self, text, url=None):
   138     def has_link(self, text, url=None):
   136         """returns True if <a href=url>text</a> was found in the page"""
   139         """returns True if <a href=url>text</a> was found in the page"""
   137         for link_text, attrs in self.a_tags:
   140         for link_text, attrs in self.a_tags:
   138             if text == link_text:
   141             if text == link_text:
   139                 if url is None:
   142                 if url is None:
   143                     if href == url:
   146                     if href == url:
   144                         return True
   147                         return True
   145                 except KeyError:
   148                 except KeyError:
   146                     continue
   149                     continue
   147         return False
   150         return False
   148     
   151 
   149     def has_link_regexp(self, pattern, url=None):
   152     def has_link_regexp(self, pattern, url=None):
   150         """returns True if <a href=url>pattern</a> was found in the page"""
   153         """returns True if <a href=url>pattern</a> was found in the page"""
   151         sre = re.compile(pattern)
   154         sre = re.compile(pattern)
   152         for link_text, attrs in self.a_tags:
   155         for link_text, attrs in self.a_tags:
   153             if sre.match(link_text):
   156             if sre.match(link_text):