[htmlparser] store unaltered source in pageinfo.source
author Pierre-Yves David <pierre-yves.david@logilab.fr>
Fri, 26 Apr 2013 11:52:35 +0200
changeset 8939 30ce8eccfe3f
parent 8938 198fdadafed6
child 8940 ae898a084da2
[htmlparser] store unaltered source in pageinfo.source. Do not store the preprocessed data, so that the source is preserved unaltered; store the initial input in the ``source`` argument instead. This is very useful when parsing HTML: in such a case we need to drop ``<script>`` tags.
devtools/htmlparser.py
--- a/devtools/htmlparser.py	Thu Apr 25 18:38:47 2013 +0200
+++ b/devtools/htmlparser.py	Fri Apr 26 11:52:35 2013 +0200
@@ -35,8 +35,7 @@
 
     def parse_string(self, data, sysid=None):
         try:
-            data = self.preprocess_data(data)
-            return PageInfo(data, etree.fromstring(data, self.parser))
+            return PageInfo(self, data)
         except etree.XMLSyntaxError as exc:
             def save_in(fname=''):
                 file(fname, 'w').write(data)
@@ -119,8 +118,9 @@
 
 class PageInfo(object):
     """holds various informations on the view's output"""
-    def __init__(self, source, root):
+    def __init__(self, validator, source):
         self.source = source
+        root = etree.fromstring(validator.preprocess_data(source), validator.parser)
         self.etree = root
         self.source = source
         self.raw_text = u''.join(root.xpath('//text()'))