devtools/htmlparser.py
branch tls-sprint
changeset 1480 d3e3d527daf5
parent 1421 77ee26df178f
child 1485 4d532f3c012e
equal deleted inserted replaced
1477:b056a49c16dc 1480:d3e3d527daf5
    30         Validator.__init__(self)
    30         Validator.__init__(self)
    31         self.parser = etree.XMLParser(dtd_validation=True)
    31         self.parser = etree.XMLParser(dtd_validation=True)
    32 
    32 
    33     def preprocess_data(self, data):
    33     def preprocess_data(self, data):
    34         """used to fix potential blockquote mess generated by docutils"""
    34         """used to fix potential blockquote mess generated by docutils"""
    35         if STRICT_DOCTYPE not in data:
    35         if str(STRICT_DOCTYPE) not in data:
    36             return data
    36             return data
    37         # parse using transitional DTD
    37         # parse using transitional DTD
    38         data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
    38         data = data.replace(str(STRICT_DOCTYPE), str(TRANSITIONAL_DOCTYPE))
    39         tree = etree.fromstring(data, self.parser)
    39         tree = etree.fromstring(data, self.parser)
    40         namespace = tree.nsmap.get(None)
    40         namespace = tree.nsmap.get(None)
    41         # this is the list of authorized child tags for <blockquote> nodes
    41         # this is the list of authorized child tags for <blockquote> nodes
    42         expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
    42         expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
    43                    'fieldset table form noscript ins del script'.split()
    43                    'fieldset table form noscript ins del script'.split()
    49         # quick and dirty approach: remove all blockquotes
    49         # quick and dirty approach: remove all blockquotes
    50         for blockquote in blockquotes:
    50         for blockquote in blockquotes:
    51             parent = blockquote.getparent()
    51             parent = blockquote.getparent()
    52             parent.remove(blockquote)
    52             parent.remove(blockquote)
    53         data = etree.tostring(tree)
    53         data = etree.tostring(tree)
    54         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (STRICT_DOCTYPE, data)
    54         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (str(STRICT_DOCTYPE), data)
    55 
    55 
    56    
    56    
    57 class SaxOnlyValidator(Validator):
    57 class SaxOnlyValidator(Validator):
    58 
    58 
    59     def __init__(self):
    59     def __init__(self):