30 Validator.__init__(self) |
30 Validator.__init__(self) |
31 self.parser = etree.XMLParser(dtd_validation=True) |
31 self.parser = etree.XMLParser(dtd_validation=True) |
32 |
32 |
33 def preprocess_data(self, data): |
33 def preprocess_data(self, data): |
34 """used to fix potential blockquote mess generated by docutils""" |
34 """used to fix potential blockquote mess generated by docutils""" |
35 if STRICT_DOCTYPE not in data: |
35 if str(STRICT_DOCTYPE) not in data: |
36 return data |
36 return data |
37 # parse using transitional DTD |
37 # parse using transitional DTD |
38 data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE) |
38 data = data.replace(str(STRICT_DOCTYPE), str(TRANSITIONAL_DOCTYPE)) |
39 tree = etree.fromstring(data, self.parser) |
39 tree = etree.fromstring(data, self.parser) |
40 namespace = tree.nsmap.get(None) |
40 namespace = tree.nsmap.get(None) |
41 # this is the list of authorized child tags for <blockquote> nodes |
41 # this is the list of authorized child tags for <blockquote> nodes |
42 expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \ |
42 expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \ |
43 'fieldset table form noscript ins del script'.split() |
43 'fieldset table form noscript ins del script'.split() |
49 # quick and dirty approach: remove all blockquotes |
49 # quick and dirty approach: remove all blockquotes |
50 for blockquote in blockquotes: |
50 for blockquote in blockquotes: |
51 parent = blockquote.getparent() |
51 parent = blockquote.getparent() |
52 parent.remove(blockquote) |
52 parent.remove(blockquote) |
53 data = etree.tostring(tree) |
53 data = etree.tostring(tree) |
54 return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (STRICT_DOCTYPE, data) |
54 return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (str(STRICT_DOCTYPE), data) |
55 |
55 |
56 |
56 |
57 class SaxOnlyValidator(Validator): |
57 class SaxOnlyValidator(Validator): |
58 |
58 |
59 def __init__(self): |
59 def __init__(self): |