devtools/htmlparser.py
changeset 10696 4ba4be5553cf
parent 10662 10942ed172de
equal deleted inserted replaced
10695:321b99973b69 10696:4ba4be5553cf
    31 STRICT_DOCTYPE = str(STRICT_DOCTYPE)
    31 STRICT_DOCTYPE = str(STRICT_DOCTYPE)
    32 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
    32 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
    33 
    33 
    34 ERR_COUNT = 0
    34 ERR_COUNT = 0
    35 
    35 
    36 _REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
    36 _REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M|re.I|re.S)
    37 def _remove_script_tags(data):
    37 def _remove_script_tags(data):
    38     """Remove the script (usually javascript) tags to help the lxml
    38     """Remove the script (usually javascript) tags to help the lxml
    39     XMLParser / HTMLParser do their job. Without that, they choke on
    39     XMLParser / HTMLParser do their job. Without that, they choke on
    40     tags embedded in JS strings.
    40     tags embedded in JS strings.
    41     """
    41     """
    68     # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
    68     # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
    69     # '<html><body><div></div><br><span></span></body></html>'
    69     # '<html><body><div></div><br><span></span></body></html>'
    70     #
    70     #
    71     # using that, we'll miss most actual validation error we want to
    71     # using that, we'll miss most actual validation error we want to
    72     # catch. For now, use dumb regexp
    72     # catch. For now, use dumb regexp
    73     return _REM_SCRIPT_RGX.sub('', data)
    73     return _REM_SCRIPT_RGX.sub(b'', data)
    74 
    74 
    75 
    75 
    76 class Validator(object):
    76 class Validator(object):
    77     """ base validator API """
    77     """ base validator API """
    78     parser = None
    78     parser = None