devtools/htmlparser.py
changeset 8977 57e564c0118e
parent 8973 6711f78c18be
child 8979 8f5416b1562a

--- a/devtools/htmlparser.py
+++ b/devtools/htmlparser.py
@@ -17,10 +17,12 @@
 # with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
 """defines a validating HTML parser used in web application tests"""
 
 import re
 import sys
+from xml import sax
+from cStringIO import StringIO
 
 from lxml import etree
 
 from logilab.common.deprecation import class_deprecated, class_renamed
 
@@ -29,11 +31,52 @@
 STRICT_DOCTYPE = str(STRICT_DOCTYPE)
 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
 
 ERR_COUNT = 0
 
+_REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
+def _remove_script_tags(data):
+    """Remove the script (usually javascript) tags to help the lxml
+    XMLParser / HTMLParser do their job. Without that, they choke on
+    tags embedded in JS strings.
+    """
+    # Notice we may want to use lxml cleaner, but it's far too intrusive:
+    #
+    # cleaner = Cleaner(scripts=True,
+    #                   javascript=False,
+    #                   comments=False,
+    #                   style=False,
+    #                   links=False,
+    #                   meta=False,
+    #                   page_structure=False,
+    #                   processing_instructions=False,
+    #                   embedded=False,
+    #                   frames=False,
+    #                   forms=False,
+    #                   annoying_tags=False,
+    #                   remove_tags=(),
+    #                   remove_unknown_tags=False,
+    #                   safe_attrs_only=False,
+    #                   add_nofollow=False)
+    # >>> cleaner.clean_html('<body></body>')
+    # '<span></span>'
+    # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
+    # '<html><body></body></html>'
+    # >>> cleaner.clean_html('<body><div/></body>')
+    # '<div></div>'
+    # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
+    # '<html><body><div></div><br></body></html>'
+    # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
+    # '<html><body><div></div><br><span></span></body></html>'
+    #
+    # using that, we'll miss most actual validation error we want to
+    # catch. For now, use dumb regexp
+    return _REM_SCRIPT_RGX.sub('', data)
+
+
 class Validator(object):
+    """ base validator API """
     parser = None
 
     def parse_string(self, source):
         etree = self._parse(self.preprocess_data(source))
         return PageInfo(source, etree)
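
The hunk above moves the script-stripping regexp into a single module-level helper so every validator can share it. Below is a small standalone sketch, not part of the changeset, showing what the preprocessing does; the sample page string is invented for illustration.

import re

_REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)

def _remove_script_tags(data):
    # drop whole <script>...</script> blocks so that markup-like strings
    # inside javascript never reach the XML/HTML parser
    return _REM_SCRIPT_RGX.sub('', data)

page = '<div><script type="text/javascript">var s = "<p>";</script><p>ok</p></div>'
print _remove_script_tags(page)   # -> <div><p>ok</p></div>
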
@@ -82,20 +125,58 @@
         return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
             STRICT_DOCTYPE, data)
 
 
 class XMLValidator(Validator):
-    """ A fully compliant XML parser """
+    """XML validator, checks that XML is well-formed and used XMLNS are defined"""
 
     def __init__(self):
         Validator.__init__(self)
         self.parser = etree.XMLParser()
 
 SaxOnlyValidator = class_renamed('SaxOnlyValidator',
                                  XMLValidator,
                                  '[3.17] you should use the '
                                  'XMLValidator class instead')
+
+
+class XMLSyntaxValidator(Validator):
+    """XML syntax validator, check XML is well-formed"""
+
+    class MySaxErrorHandler(sax.ErrorHandler):
+        """override default handler to avoid choking because of unknown entity"""
+        def fatalError(self, exception):
+            # XXX check entity in htmlentitydefs
+            if not str(exception).endswith('undefined entity'):
+                raise exception
+    _parser = sax.make_parser()
+    _parser.setContentHandler(sax.handler.ContentHandler())
+    _parser.setErrorHandler(MySaxErrorHandler())
+
+    def __init__(self):
+        super(XMLSyntaxValidator, self).__init__()
+        # XMLParser() wants xml namespaces defined
+        # XMLParser(recover=True) will accept almost anything
+        #
+        # -> use the later but preprocess will check xml well-formness using a
+        #    dumb SAX parser
+        self.parser = etree.XMLParser(recover=True)
+
+    def preprocess_data(self, data):
+        return _remove_script_tags(data)
+
+    def _parse(self, data):
+        inpsrc = sax.InputSource()
+        inpsrc.setByteStream(StringIO(data))
+        try:
+            self._parser.parse(inpsrc)
+        except sax.SAXParseException, exc:
+            new_exc = AssertionError(u'invalid document: %s' % exc)
+            new_exc.position = (exc._linenum, exc._colnum)
+            raise new_exc
+        return super(XMLSyntaxValidator, self)._parse(data)
+
 
 class XMLDemotingValidator(XMLValidator):
     """ some views produce html instead of xhtml, using demote_to_html
 
     this is typically related to the use of external dependencies
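
The new XMLSyntaxValidator validates in two passes: a plain SAX parse rejects documents that are not well-formed, then lxml's recovering XMLParser builds the tree even when XML namespaces are missing. The sketch below condenses that idea outside the Validator/PageInfo machinery; the helper name check_well_formed and the sample strings are invented for illustration.

from cStringIO import StringIO
from xml import sax
from lxml import etree

def check_well_formed(data):
    # first pass: a dumb SAX parse raises on malformed input
    # (the class above additionally ignores 'undefined entity' errors)
    parser = sax.make_parser()
    parser.setContentHandler(sax.handler.ContentHandler())
    inpsrc = sax.InputSource()
    inpsrc.setByteStream(StringIO(data))
    try:
        parser.parse(inpsrc)
    except sax.SAXParseException, exc:
        raise AssertionError(u'invalid document: %s' % exc)
    # second pass: the forgiving parser still builds a usable tree
    return etree.fromstring(data, etree.XMLParser(recover=True))

check_well_formed('<div><p>fine</p></div>')   # returns an _Element
check_well_formed('<div><p>oops</div>')       # raises AssertionError
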
@@ -110,54 +191,18 @@
         else:
             self.parser = etree.HTMLParser()
         return data
 
 
-REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
-
 class HTMLValidator(Validator):
 
     def __init__(self):
         Validator.__init__(self)
         self.parser = etree.HTMLParser(recover=False)
 
     def preprocess_data(self, data):
-        """ Here we essentially wipe the javascript tags to help the HTMLParser
-        do its job. Without that, it chokes on tags embedded in JS strings.
-        """
-        # Notice we may want to use lxml cleaner, but it's far too intrusive:
-        #
-        # cleaner = Cleaner(scripts=True,
-        #                   javascript=False,
-        #                   comments=False,
-        #                   style=False,
-        #                   links=False,
-        #                   meta=False,
-        #                   page_structure=False,
-        #                   processing_instructions=False,
-        #                   embedded=False,
-        #                   frames=False,
-        #                   forms=False,
-        #                   annoying_tags=False,
-        #                   remove_tags=(),
-        #                   remove_unknown_tags=False,
-        #                   safe_attrs_only=False,
-        #                   add_nofollow=False)
-        # >>> cleaner.clean_html('<body></body>')
-        # '<span></span>'
-        # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
-        # '<html><body></body></html>'
-        # >>> cleaner.clean_html('<body><div/></body>')
-        # '<div></div>'
-        # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
-        # '<html><body><div></div><br></body></html>'
-        # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
-        # '<html><body><div></div><br><span></span></body></html>'
-        #
-        # using that, we'll miss most actual validation error we want to
-        # catch. For now, use dumb regexp
-        return REM_SCRIPT_RGX.sub('', data)
-
+        return _remove_script_tags(data)
 
 
 class PageInfo(object):
     """holds various informations on the view's output"""
     def __init__(self, source, root):