devtools/htmlparser.py
changeset 8940 ae898a084da2
parent 8939 30ce8eccfe3f
child 8941 7b26fe71404f
equal deleted inserted replaced
8939:30ce8eccfe3f 8940:ae898a084da2
   106         else:
   106         else:
   107             self.parser = etree.HTMLParser()
   107             self.parser = etree.HTMLParser()
   108         return data
   108         return data
   109 
   109 
   110 
   110 
       
   111 REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
       
   112 
   111 class HTMLValidator(Validator):
   113 class HTMLValidator(Validator):
   112 
   114 
   113     def __init__(self):
   115     def __init__(self):
   114         Validator.__init__(self)
   116         Validator.__init__(self)
   115         self.parser = etree.HTMLParser()
   117         self.parser = etree.HTMLParser(recover=False)
   116 
   118 
       
   119     def preprocess_data(self, data):
       
   120         """ Here we essentially wipe the javascript tags to help the HTMLParser
       
   121         do its job. Without that, it chokes on tags embedded in JS strings.
       
   122         """
       
   123         # Note: we might want to use lxml's Cleaner, but it is far too intrusive:
       
   124         #
       
   125         # cleaner = Cleaner(scripts=True,
       
   126         #                   javascript=False,
       
   127         #                   comments=False,
       
   128         #                   style=False,
       
   129         #                   links=False,
       
   130         #                   meta=False,
       
   131         #                   page_structure=False,
       
   132         #                   processing_instructions=False,
       
   133         #                   embedded=False,
       
   134         #                   frames=False,
       
   135         #                   forms=False,
       
   136         #                   annoying_tags=False,
       
   137         #                   remove_tags=(),
       
   138         #                   remove_unknown_tags=False,
       
   139         #                   safe_attrs_only=False,
       
   140         #                   add_nofollow=False)
       
   141         # >>> cleaner.clean_html('<body></body>')
       
   142         # '<span></span>'
       
   143         # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
       
   144         # '<html><body></body></html>'
       
   145         # >>> cleaner.clean_html('<body><div/></body>')
       
   146         # '<div></div>'
       
   147         # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
       
   148         # '<html><body><div></div><br></body></html>'
       
   149         # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
       
   150         # '<html><body><div></div><br><span></span></body></html>'
       
   151         #
       
   152         # using that, we'll miss most of the actual validation errors we want to
       
   153         # catch. For now, use a dumb regexp.
       
   154         return REM_SCRIPT_RGX.sub('', data)
   117 
   155 
   118 
   156 
   119 class PageInfo(object):
   157 class PageInfo(object):
   120     """holds various informations on the view's output"""
   158     """holds various informations on the view's output"""
   121     def __init__(self, validator, source):
   159     def __init__(self, validator, source):