diff -r 058bb3dc685f -r 0b59724cb3f2 devtools/htmlparser.py
--- a/devtools/htmlparser.py	Mon Jan 04 18:40:30 2016 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,314 +0,0 @@
-# copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
-#
-# This file is part of CubicWeb.
-#
-# CubicWeb is free software: you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with CubicWeb. If not, see <http://www.gnu.org/licenses/>.
-"""defines a validating HTML parser used in web application tests"""
-
-import re
-import sys
-from xml import sax
-from io import BytesIO
-
-from lxml import etree
-
-from logilab.common.deprecation import class_deprecated, class_renamed
-
-from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
-
-STRICT_DOCTYPE = str(STRICT_DOCTYPE)
-TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
-
-ERR_COUNT = 0
-
-_REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M|re.I|re.S)
-def _remove_script_tags(data):
-    """Remove the script (usually javascript) tags to help the lxml
-    XMLParser / HTMLParser do their job. Without that, they choke on
-    tags embedded in JS strings.
-    """
-    # Notice we may want to use lxml cleaner, but it's far too intrusive:
-    #
-    # cleaner = Cleaner(scripts=True,
-    #                   javascript=False,
-    #                   comments=False,
-    #                   style=False,
-    #                   links=False,
-    #                   meta=False,
-    #                   page_structure=False,
-    #                   processing_instructions=False,
-    #                   embedded=False,
-    #                   frames=False,
-    #                   forms=False,
-    #                   annoying_tags=False,
-    #                   remove_tags=(),
-    #                   remove_unknown_tags=False,
-    #                   safe_attrs_only=False,
-    #                   add_nofollow=False)
-    # >>> cleaner.clean_html('')
-    # ''
-    # >>> cleaner.clean_html('')
-    # ''
-    # >>> cleaner.clean_html('')
-    # ''
-    # >>> cleaner.clean_html('')
-    # ''
-    # >>> cleaner.clean_html('')
-    # ''
-    #
-    # using that, we'll miss most actual validation errors we want to
-    # catch. For now, use a dumb regexp.
-    return _REM_SCRIPT_RGX.sub(b'', data)
-
-
-class Validator(object):
-    """ base validator API """
-    parser = None
-
-    def parse_string(self, source):
-        etree = self._parse(self.preprocess_data(source))
-        return PageInfo(source, etree)
-
-    def preprocess_data(self, data):
-        return data
-
-    def _parse(self, pdata):
-        try:
-            return etree.fromstring(pdata, self.parser)
-        except etree.XMLSyntaxError as exc:
-            new_exc = AssertionError(u'invalid document: %s' % exc)
-            new_exc.position = exc.position
-            raise new_exc
-
-
-class DTDValidator(Validator):
-    def __init__(self):
-        Validator.__init__(self)
-        # XXX understand what's happening under windows
-        self.parser = etree.XMLParser(dtd_validation=sys.platform != 'win32')
-
-    def preprocess_data(self, data):
-        """used to fix potential blockquote mess generated by docutils"""
-        if STRICT_DOCTYPE not in data:
-            return data
-        # parse using transitional DTD
-        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
-        tree = self._parse(data)
-        namespace = tree.nsmap.get(None)
-        # this is the list of authorized child tags for <blockquote> nodes
-        expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
-                   'fieldset table form noscript ins del script'.split()
-        if namespace:
-            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
-            expected = ['{%s}%s' % (namespace, tag) for tag in expected]
-        else:
-            blockquotes = tree.findall('.//blockquote')
-        # quick and dirty approach: remove all blockquotes
-        for blockquote in blockquotes:
-            parent = blockquote.getparent()
-            parent.remove(blockquote)
-        data = etree.tostring(tree)
-        return '%s\n%s' % (
-            STRICT_DOCTYPE, data)
-
-
-class XMLValidator(Validator):
-    """XML validator, checks that the XML is well-formed and that the XML
-    namespaces used are defined"""
-
-    def __init__(self):
-        Validator.__init__(self)
-        self.parser = etree.XMLParser()
-
-SaxOnlyValidator = class_renamed('SaxOnlyValidator',
-                                 XMLValidator,
-                                 '[3.17] you should use the '
-                                 'XMLValidator class instead')
-
-
-class XMLSyntaxValidator(Validator):
-    """XML syntax validator, checks the XML is well-formed"""
-
-    class MySaxErrorHandler(sax.ErrorHandler):
-        """override the default handler to avoid choking on unknown entities"""
-        def fatalError(self, exception):
-            # XXX check entity in htmlentitydefs
-            if not str(exception).endswith('undefined entity'):
-                raise exception
-    _parser = sax.make_parser()
-    _parser.setContentHandler(sax.handler.ContentHandler())
-    _parser.setErrorHandler(MySaxErrorHandler())
-
-    def __init__(self):
-        super(XMLSyntaxValidator, self).__init__()
-        # XMLParser() wants xml namespaces defined
-        # XMLParser(recover=True) will accept almost anything
-        #
-        # -> use the latter, but preprocess_data will check XML well-formedness
-        #    using a dumb SAX parser
-        self.parser = etree.XMLParser(recover=True)
-
-    def preprocess_data(self, data):
-        return _remove_script_tags(data)
-
-    def _parse(self, data):
-        inpsrc = sax.InputSource()
-        inpsrc.setByteStream(BytesIO(data))
-        try:
-            self._parser.parse(inpsrc)
-        except sax.SAXParseException as exc:
-            new_exc = AssertionError(u'invalid document: %s' % exc)
-            new_exc.position = (exc._linenum, exc._colnum)
-            raise new_exc
-        return super(XMLSyntaxValidator, self)._parse(data)
-
-
-class HTMLValidator(Validator):
-
-    def __init__(self):
-        Validator.__init__(self)
-        self.parser = etree.HTMLParser(recover=False)
-
-    def preprocess_data(self, data):
-        return _remove_script_tags(data)
-
-
-class PageInfo(object):
-    """holds various information on the view's output"""
-    def __init__(self, source, root):
-        self.source = source
-        self.etree = root
-        self.raw_text = u''.join(root.xpath('//text()'))
-        self.namespace = self.etree.nsmap
-        self.default_ns = self.namespace.get(None)
-        self.a_tags = self.find_tag('a')
-        self.h1_tags = self.find_tag('h1')
-        self.h2_tags = self.find_tag('h2')
-        self.h3_tags = self.find_tag('h3')
-        self.h4_tags = self.find_tag('h4')
-        self.input_tags = self.find_tag('input')
-        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
-
-    def _iterstr(self, tag):
-        if self.default_ns is None:
-            return ".//%s" % tag
-        else:
-            return ".//{%s}%s" % (self.default_ns, tag)
-
-    def matching_nodes(self, tag, **attrs):
-        for elt in self.etree.iterfind(self._iterstr(tag)):
-            eltattrs = elt.attrib
-            for attr, value in attrs.items():
-                try:
-                    if eltattrs[attr] != value:
-                        break
-                except KeyError:
-                    break
-            else:  # all attributes match
-                yield elt
-
-    def has_tag(self, tag, nboccurs=1, **attrs):
-        """returns True if a tag with the given attributes appears in the page
-        `nboccurs` times (any number of times if None)
-        """
-        for elt in self.matching_nodes(tag, **attrs):
-            if nboccurs is None:  # no need to check the number of occurrences
-                return True
-            if not nboccurs:  # too many occurrences
-                return False
-            nboccurs -= 1
-        if nboccurs == 0:  # correct number of occurrences
-            return True
-        return False  # no matching tag/attrs
-
-    def find_tag(self, tag, gettext=True):
-        """return a list which contains the text of all "tag" elements"""
-        iterstr = self._iterstr(tag)
-        if not gettext or tag in ('a', 'input'):
-            return [(elt.text, elt.attrib)
-                    for elt in self.etree.iterfind(iterstr)]
-        return [u''.join(elt.xpath('.//text()'))
-                for elt in self.etree.iterfind(iterstr)]
-
-    def appears(self, text):
-        """returns True if `text` appears in the page"""
-        return text in self.raw_text
-
-    def __contains__(self, text):
-        return text in self.source
-
-    def has_title(self, text, level=None):
-        """returns True if a title with the given text is found
-
-        :param level: the title's level (1 for h1, 2 for h2, etc.)
-        """
-        if level is None:
-            for hlist in self.title_tags:
-                if text in hlist:
-                    return True
-            return False
-        else:
-            hlist = self.title_tags[level - 1]
-            return text in hlist
-
-    def has_title_regexp(self, pattern, level=None):
-        """returns True if a title matching `pattern` is found"""
-        sre = re.compile(pattern)
-        if level is None:
-            for hlist in self.title_tags:
-                for title in hlist:
-                    if sre.match(title):
-                        return True
-            return False
-        else:
-            hlist = self.title_tags[level - 1]
-            for title in hlist:
-                if sre.match(title):
-                    return True
-            return False
-
-    def has_link(self, text, url=None):
-        """returns True if a link with the given text (and, optionally, the
-        given url) was found in the page"""
-        for link_text, attrs in self.a_tags:
-            if text == link_text:
-                if url is None:
-                    return True
-                try:
-                    href = attrs['href']
-                    if href == url:
-                        return True
-                except KeyError:
-                    continue
-        return False
-
-    def has_link_regexp(self, pattern, url=None):
-        """returns True if a link whose text matches `pattern` was found in
-        the page"""
-        sre = re.compile(pattern)
-        for link_text, attrs in self.a_tags:
-            if sre.match(link_text):
-                if url is None:
-                    return True
-                try:
-                    href = attrs['href']
-                    if href == url:
-                        return True
-                except KeyError:
-                    continue
-        return False
-
-VALMAP = {None: None,
-          'dtd': DTDValidator,
-          'xml': XMLValidator,
-          'html': HTMLValidator,
-          }
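
For reference, a minimal sketch of how the removed validators were driven from tests. The import path mirrors the deleted file's location; the sample markup, variable names and assertions are illustrative, not taken from CubicWeb's test suite:

    from cubicweb.devtools.htmlparser import HTMLValidator

    # made-up page source; a real test would validate a rendered view's output
    page_source = (b'<html><head><title>demo</title></head><body>'
                   b'<h1>Hello</h1><a href="/where">here</a>'
                   b'<script type="text/javascript">var s = "<p>markup?</p>";</script>'
                   b'</body></html>')

    validator = HTMLValidator()
    # preprocess_data() strips the <script> element, then lxml parses the rest
    page = validator.parse_string(page_source)    # returns a PageInfo instance

    assert page.appears(u'Hello')                 # substring of the page's raw text
    assert page.has_title(u'Hello', level=1)      # found among the h1 texts
    assert page.has_link(u'here', url='/where')   # link text and href both match
    assert page.has_tag('a', nboccurs=1, href='/where')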
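
A standalone check of the script-stripping approach. The pattern below is a reconstruction of _REM_SCRIPT_RGX (the principle, not necessarily the byte-exact original), and the sample input is invented:

    import re

    _REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M | re.I | re.S)

    # a strict parser would choke on the stray "</b>" inside the JS string;
    # removing whole <script> elements up front sidesteps the problem
    html = b'<div><script type="text/javascript">var s = "</b>";</script><p>kept</p></div>'
    print(_REM_SCRIPT_RGX.sub(b'', html))
    # b'<div><p>kept</p></div>'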
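
XMLSyntaxValidator's two-pass strategy, a strict SAX pass for well-formedness followed by lxml parsing with recover=True, can also be seen in isolation (a sketch; the bad input is illustrative):

    from io import BytesIO
    from xml import sax
    from lxml import etree

    bad = b'<root><oops></root>'      # not well-formed: mismatched tag

    inpsrc = sax.InputSource()
    inpsrc.setByteStream(BytesIO(bad))
    parser = sax.make_parser()
    parser.setContentHandler(sax.handler.ContentHandler())
    try:
        parser.parse(inpsrc)          # the SAX pass reports the syntax error...
    except sax.SAXParseException as exc:
        print('line %s, column %s' % (exc.getLineNumber(), exc.getColumnNumber()))

    # ...whereas XMLParser(recover=True) alone would silently accept the input
    tree = etree.fromstring(bad, etree.XMLParser(recover=True))
    print(etree.tostring(tree))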