equal
deleted
inserted
replaced
31 STRICT_DOCTYPE = str(STRICT_DOCTYPE) |
31 STRICT_DOCTYPE = str(STRICT_DOCTYPE) |
32 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE) |
32 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE) |
33 |
33 |
34 ERR_COUNT = 0 |
34 ERR_COUNT = 0 |
35 |
35 |
36 _REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S) |
36 _REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M|re.I|re.S) |
37 def _remove_script_tags(data): |
37 def _remove_script_tags(data): |
38 """Remove the script (usually javascript) tags to help the lxml |
38 """Remove the script (usually javascript) tags to help the lxml |
39 XMLParser / HTMLParser do their job. Without that, they choke on |
39 XMLParser / HTMLParser do their job. Without that, they choke on |
40 tags embedded in JS strings. |
40 tags embedded in JS strings. |
41 """ |
41 """ |
68 # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>') |
68 # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>') |
69 # '<html><body><div></div><br><span></span></body></html>' |
69 # '<html><body><div></div><br><span></span></body></html>' |
70 # |
70 # |
71 # using that, we'll miss most actual validation error we want to |
71 # using that, we'll miss most actual validation error we want to |
72 # catch. For now, use dumb regexp |
72 # catch. For now, use dumb regexp |
73 return _REM_SCRIPT_RGX.sub('', data) |
73 return _REM_SCRIPT_RGX.sub(b'', data) |
74 |
74 |
75 |
75 |
76 class Validator(object): |
76 class Validator(object): |
77 """ base validator API """ |
77 """ base validator API """ |
78 parser = None |
78 parser = None |