# HG changeset patch
# User Sylvain Thénault
# Date 1368620571 -7200
# Node ID 57e564c0118eadfcf98c97d430caeb56aa53eedc
# Parent aeb7d400ee92a51d77114e0beccd7f8f08219cf3
[testlib] introduce a validator that checks XML well-formedness

This validator simply checks that the XML is well formed and accepts any
entity (think HTML, which defines many more entities than bare XML).

diff -r aeb7d400ee92 -r 57e564c0118e devtools/htmlparser.py
--- a/devtools/htmlparser.py Wed May 15 15:37:22 2013 +0200
+++ b/devtools/htmlparser.py Wed May 15 14:22:51 2013 +0200
@@ -19,6 +19,8 @@
 import re
 import sys
 
+from xml import sax
+from cStringIO import StringIO
 from lxml import etree
 
@@ -31,7 +33,48 @@
 
 ERR_COUNT = 0
 
+_REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
+
+def _remove_script_tags(data):
+    """Remove the script (usually javascript) tags to help the lxml
+    XMLParser / HTMLParser do their job. Without that, they choke on
+    tags embedded in JS strings.
+    """
+    # Notice we may want to use lxml cleaner, but it's far too intrusive:
+    #
+    # cleaner = Cleaner(scripts=True,
+    #                   javascript=False,
+    #                   comments=False,
+    #                   style=False,
+    #                   links=False,
+    #                   meta=False,
+    #                   page_structure=False,
+    #                   processing_instructions=False,
+    #                   embedded=False,
+    #                   frames=False,
+    #                   forms=False,
+    #                   annoying_tags=False,
+    #                   remove_tags=(),
+    #                   remove_unknown_tags=False,
+    #                   safe_attrs_only=False,
+    #                   add_nofollow=False)
+    # >>> cleaner.clean_html('<body></body>')
+    # '<body></body>'
+    # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
+    # '<html><body></body></html>'
+    # >>> cleaner.clean_html('<body><div/></body>')
+    # '<div></div>'
+    # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
+    # '<html><body><div></div><br></body></html>'
+    # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
+    # '<html><body><div></div><br><span></span></body></html>'
+    #
+    # using that, we'll miss most actual validation error we want to
+    # catch. For now, use dumb regexp
+    return _REM_SCRIPT_RGX.sub('', data)
+
+
 class Validator(object):
+    """ base validator API """
     parser = None
 
     def parse_string(self, source):
@@ -84,7 +127,7 @@
 
 
 class XMLValidator(Validator):
-    """ A fully compliant XML parser """
+    """XML validator, checks that XML is well-formed and that used XMLNS are defined"""
 
     def __init__(self):
         Validator.__init__(self)
@@ -95,6 +138,44 @@
                                 '[3.17] you should use the '
                                 'XMLValidator class instead')
 
+
+class XMLSyntaxValidator(Validator):
+    """XML syntax validator, checks that XML is well-formed"""
+
+    class MySaxErrorHandler(sax.ErrorHandler):
+        """override default handler to avoid choking because of unknown entities"""
+        def fatalError(self, exception):
+            # XXX check entity in htmlentitydefs
+            if not str(exception).endswith('undefined entity'):
+                raise exception
+    _parser = sax.make_parser()
+    _parser.setContentHandler(sax.handler.ContentHandler())
+    _parser.setErrorHandler(MySaxErrorHandler())
+
+    def __init__(self):
+        super(XMLSyntaxValidator, self).__init__()
+        # XMLParser() wants xml namespaces defined
+        # XMLParser(recover=True) will accept almost anything
+        #
+        # -> use the latter, but preprocess will check xml well-formedness using a
+        #    dumb SAX parser
+        self.parser = etree.XMLParser(recover=True)
+
+    def preprocess_data(self, data):
+        return _remove_script_tags(data)
+
+    def _parse(self, data):
+        inpsrc = sax.InputSource()
+        inpsrc.setByteStream(StringIO(data))
+        try:
+            self._parser.parse(inpsrc)
+        except sax.SAXParseException, exc:
+            new_exc = AssertionError(u'invalid document: %s' % exc)
+            new_exc.position = (exc._linenum, exc._colnum)
+            raise new_exc
+        return super(XMLSyntaxValidator, self)._parse(data)
+
+
 class XMLDemotingValidator(XMLValidator):
     """ some views produce html instead of xhtml, using demote_to_html
@@ -112,8 +193,6 @@
         return data
 
 
-REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
-
 class HTMLValidator(Validator):
 
     def __init__(self):
@@ -121,41 +200,7 @@
         self.parser = etree.HTMLParser(recover=False)
 
     def preprocess_data(self, data):
-        """ Here we essentially wipe the javascript tags to help the HTMLParser
-        do its job. Without that, it chokes on tags embedded in JS strings.
-        """
-        # Notice we may want to use lxml cleaner, but it's far too intrusive:
-        #
-        # cleaner = Cleaner(scripts=True,
-        #                   javascript=False,
-        #                   comments=False,
-        #                   style=False,
-        #                   links=False,
-        #                   meta=False,
-        #                   page_structure=False,
-        #                   processing_instructions=False,
-        #                   embedded=False,
-        #                   frames=False,
-        #                   forms=False,
-        #                   annoying_tags=False,
-        #                   remove_tags=(),
-        #                   remove_unknown_tags=False,
-        #                   safe_attrs_only=False,
-        #                   add_nofollow=False)
-        # >>> cleaner.clean_html('<body></body>')
-        # '<body></body>'
-        # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
-        # '<html><body></body></html>'
-        # >>> cleaner.clean_html('<body><div/></body>')
-        # '<div></div>'
-        # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
-        # '<html><body><div></div><br></body></html>'
-        # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
-        # '<html><body><div></div><br><span></span></body></html>'
-        #
-        # using that, we'll miss most actual validation error we want to
-        # catch. For now, use dumb regexp
-        return REM_SCRIPT_RGX.sub('', data)
+        return _remove_script_tags(data)
 
 
 class PageInfo(object):
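
Note (not part of the changeset): a minimal usage sketch of the new XMLSyntaxValidator, assuming the module is importable as cubicweb.devtools.htmlparser (the full import path is not shown in the patch) and that Validator.parse_string() feeds the data through preprocess_data() and _parse() as the existing validators do. Per the commit message, HTML-only entities such as &nbsp; are tolerated, while XML that is not well-formed makes the SAX pre-pass raise an AssertionError carrying the error position.

    # sketch only -- import path and parse_string() behaviour assumed as above
    from cubicweb.devtools.htmlparser import XMLSyntaxValidator

    validator = XMLSyntaxValidator()

    # well-formed markup using an entity undefined in bare XML: accepted
    validator.parse_string('<div><p>hello&nbsp;world</p></div>')

    # unbalanced tags: the SAX pre-pass turns the SAXParseException
    # into an AssertionError with a .position attribute
    try:
        validator.parse_string('<div><p>oops</div>')
    except AssertionError, exc:
        print 'rejected:', exc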