--- a/devtools/htmlparser.py Fri Jun 14 16:13:24 2013 +0200
+++ b/devtools/htmlparser.py Fri Jun 14 16:26:25 2013 +0200
@@ -1,4 +1,4 @@
-# copyright 2003-2010 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of CubicWeb.
@@ -19,10 +19,12 @@
import re
import sys
+from xml import sax
+from cStringIO import StringIO
from lxml import etree
-from logilab.common.deprecation import class_deprecated
+from logilab.common.deprecation import class_deprecated, class_renamed
from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
@@ -31,22 +33,67 @@
ERR_COUNT = 0
-class Validator(object):
+_REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
+def _remove_script_tags(data):
+ """Remove the script (usually javascript) tags to help the lxml
+ XMLParser / HTMLParser do their job. Without that, they choke on
+ tags embedded in JS strings.
+ """
+ # Notice we may want to use lxml cleaner, but it's far too intrusive:
+ #
+ # cleaner = Cleaner(scripts=True,
+ # javascript=False,
+ # comments=False,
+ # style=False,
+ # links=False,
+ # meta=False,
+ # page_structure=False,
+ # processing_instructions=False,
+ # embedded=False,
+ # frames=False,
+ # forms=False,
+ # annoying_tags=False,
+ # remove_tags=(),
+ # remove_unknown_tags=False,
+ # safe_attrs_only=False,
+ # add_nofollow=False)
+ # >>> cleaner.clean_html('<body></body>')
+ # '<span></span>'
+ # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
+ # '<html><body></body></html>'
+ # >>> cleaner.clean_html('<body><div/></body>')
+ # '<div></div>'
+ # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
+ # '<html><body><div></div><br></body></html>'
+ # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
+ # '<html><body><div></div><br><span></span></body></html>'
+ #
+    # using that, we'd miss most of the actual validation errors we want
+    # to catch. For now, use a dumb regexp
+ return _REM_SCRIPT_RGX.sub('', data)
- def parse_string(self, data, sysid=None):
+
+class Validator(object):
+ """ base validator API """
+ parser = None
+
+ def parse_string(self, source):
+ etree = self._parse(self.preprocess_data(source))
+ return PageInfo(source, etree)
+
+ def preprocess_data(self, data):
+ return data
+
+ def _parse(self, pdata):
try:
- data = self.preprocess_data(data)
- return PageInfo(data, etree.fromstring(data, self.parser))
+ return etree.fromstring(pdata, self.parser)
except etree.XMLSyntaxError as exc:
def save_in(fname=''):
-            file(fname, 'w').write(data)
+            file(fname, 'w').write(pdata)
- new_exc = AssertionError(u'invalid xml %s' % exc)
+ new_exc = AssertionError(u'invalid document: %s' % exc)
new_exc.position = exc.position
raise new_exc
- def preprocess_data(self, data):
- return data
-
class DTDValidator(Validator):
def __init__(self):
@@ -60,7 +107,7 @@
return data
# parse using transitional DTD
data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
- tree = etree.fromstring(data, self.parser)
+ tree = self._parse(data)
namespace = tree.nsmap.get(None)
# this is the list of authorized child tags for <blockquote> nodes
expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
@@ -79,20 +126,64 @@
STRICT_DOCTYPE, data)
-class SaxOnlyValidator(Validator):
+class XMLValidator(Validator):
+ """XML validator, checks that XML is well-formed and used XMLNS are defined"""
def __init__(self):
Validator.__init__(self)
self.parser = etree.XMLParser()
+SaxOnlyValidator = class_renamed('SaxOnlyValidator',
+ XMLValidator,
+ '[3.17] you should use the '
+ 'XMLValidator class instead')
-class XMLDemotingValidator(SaxOnlyValidator):
+
+class XMLSyntaxValidator(Validator):
+ """XML syntax validator, check XML is well-formed"""
+
+ class MySaxErrorHandler(sax.ErrorHandler):
+ """override default handler to avoid choking because of unknown entity"""
+ def fatalError(self, exception):
+ # XXX check entity in htmlentitydefs
+ if not str(exception).endswith('undefined entity'):
+ raise exception
+ _parser = sax.make_parser()
+ _parser.setContentHandler(sax.handler.ContentHandler())
+ _parser.setErrorHandler(MySaxErrorHandler())
+
+ def __init__(self):
+ super(XMLSyntaxValidator, self).__init__()
+ # XMLParser() wants xml namespaces defined
+ # XMLParser(recover=True) will accept almost anything
+ #
+    # -> use the latter but preprocess will check xml well-formedness using a
+ # dumb SAX parser
+ self.parser = etree.XMLParser(recover=True)
+
+ def preprocess_data(self, data):
+ return _remove_script_tags(data)
+
+ def _parse(self, data):
+ inpsrc = sax.InputSource()
+ inpsrc.setByteStream(StringIO(data))
+ try:
+ self._parser.parse(inpsrc)
+        except sax.SAXParseException as exc:
+ new_exc = AssertionError(u'invalid document: %s' % exc)
+ new_exc.position = (exc._linenum, exc._colnum)
+ raise new_exc
+ return super(XMLSyntaxValidator, self)._parse(data)
+
+
+class XMLDemotingValidator(XMLValidator):
""" some views produce html instead of xhtml, using demote_to_html
this is typically related to the use of external dependencies
which do not produce valid xhtml (google maps, ...)
"""
__metaclass__ = class_deprecated
+ __deprecation_warning__ = '[3.10] this is now handled in testlib.py'
def preprocess_data(self, data):
if data.startswith('<?xml'):
@@ -106,8 +197,10 @@
def __init__(self):
Validator.__init__(self)
- self.parser = etree.HTMLParser()
+ self.parser = etree.HTMLParser(recover=False)
+ def preprocess_data(self, data):
+ return _remove_script_tags(data)
class PageInfo(object):
@@ -115,7 +208,6 @@
def __init__(self, source, root):
self.source = source
self.etree = root
- self.source = source
self.raw_text = u''.join(root.xpath('//text()'))
self.namespace = self.etree.nsmap
self.default_ns = self.namespace.get(None)
@@ -234,4 +326,8 @@
continue
return False
-VALMAP = {None: None, 'dtd': DTDValidator, 'xml': SaxOnlyValidator}
+VALMAP = {None: None,
+ 'dtd': DTDValidator,
+ 'xml': XMLValidator,
+ 'html': HTMLValidator,
+ }