diff -r 058bb3dc685f -r 0b59724cb3f2 cubicweb/devtools/htmlparser.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cubicweb/devtools/htmlparser.py	Sat Jan 16 13:48:51 2016 +0100
@@ -0,0 +1,314 @@
+# copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
+#
+# This file is part of CubicWeb.
+#
+# CubicWeb is free software: you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
+"""defines a validating HTML parser used in web application tests"""
+
+import re
+import sys
+from xml import sax
+from io import BytesIO
+
+from lxml import etree
+
+from logilab.common.deprecation import class_deprecated, class_renamed
+
+from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
+
+STRICT_DOCTYPE = str(STRICT_DOCTYPE)
+TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
+
+ERR_COUNT = 0
+
+_REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M | re.I | re.S)
+
+
+def _remove_script_tags(data):
+    """Remove the script (usually javascript) tags to help the lxml
+    XMLParser / HTMLParser do their job. Without that, they choke on
+    tags embedded in JS strings.
+    """
+    # Notice we may want to use the lxml cleaner, but it's far too intrusive:
+    #
+    # cleaner = Cleaner(scripts=True,
+    #                   javascript=False,
+    #                   comments=False,
+    #                   style=False,
+    #                   links=False,
+    #                   meta=False,
+    #                   page_structure=False,
+    #                   processing_instructions=False,
+    #                   embedded=False,
+    #                   frames=False,
+    #                   forms=False,
+    #                   annoying_tags=False,
+    #                   remove_tags=(),
+    #                   remove_unknown_tags=False,
+    #                   safe_attrs_only=False,
+    #                   add_nofollow=False)
+    # >>> cleaner.clean_html(...)
+    # (examples elided: even configured as above, clean_html rewrites and
+    #  normalizes the markup it is given instead of returning it unchanged)
+    #
+    # Using that, we would miss most of the actual validation errors we want
+    # to catch. For now, use a dumb regexp.
+    return _REM_SCRIPT_RGX.sub(b'', data)
+
+
+class Validator(object):
+    """base validator API"""
+    parser = None
+
+    def parse_string(self, source):
+        etree = self._parse(self.preprocess_data(source))
+        return PageInfo(source, etree)
+
+    def preprocess_data(self, data):
+        return data
+
+    def _parse(self, pdata):
+        try:
+            return etree.fromstring(pdata, self.parser)
+        except etree.XMLSyntaxError as exc:
+            new_exc = AssertionError(u'invalid document: %s' % exc)
+            new_exc.position = exc.position
+            raise new_exc
+
+
+class DTDValidator(Validator):
+    def __init__(self):
+        Validator.__init__(self)
+        # XXX understand what's happening under windows
+        self.parser = etree.XMLParser(dtd_validation=sys.platform != 'win32')
+
+    def preprocess_data(self, data):
+        """used to fix potential blockquote mess generated by docutils"""
+        if STRICT_DOCTYPE not in data:
+            return data
+        # parse using transitional DTD
+        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
+        tree = self._parse(data)
+        namespace = tree.nsmap.get(None)
+        # this is the list of authorized child tags for <blockquote> nodes
+        expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
+                   'fieldset table form noscript ins del script'.split()
+        if namespace:
+            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
+            expected = ['{%s}%s' % (namespace, tag) for tag in expected]
+        else:
+            blockquotes = tree.findall('.//blockquote')
+        # quick and dirty approach: remove all blockquotes
+        for blockquote in blockquotes:
+            parent = blockquote.getparent()
+            parent.remove(blockquote)
+        data = etree.tostring(tree)
+        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
+            STRICT_DOCTYPE, data)
+
+
+class XMLValidator(Validator):
+    """XML validator, checks that XML is well-formed and that the used XML
+    namespaces are defined"""
+
+    def __init__(self):
+        Validator.__init__(self)
+        self.parser = etree.XMLParser()
+
+
+SaxOnlyValidator = class_renamed('SaxOnlyValidator',
+                                 XMLValidator,
+                                 '[3.17] you should use the '
+                                 'XMLValidator class instead')
+
+
+class XMLSyntaxValidator(Validator):
+    """XML syntax validator, checks that XML is well-formed"""
+
+    class MySaxErrorHandler(sax.ErrorHandler):
+        """override the default handler to avoid choking on unknown entities"""
+        def fatalError(self, exception):
+            # XXX check entity in htmlentitydefs
+            if not str(exception).endswith('undefined entity'):
+                raise exception
+    _parser = sax.make_parser()
+    _parser.setContentHandler(sax.handler.ContentHandler())
+    _parser.setErrorHandler(MySaxErrorHandler())
+
+    def __init__(self):
+        super(XMLSyntaxValidator, self).__init__()
+        # XMLParser() wants xml namespaces defined
+        # XMLParser(recover=True) will accept almost anything
+        #
+        # -> use the latter, but first check XML well-formedness with a dumb
+        #    SAX parser (see _parse below)
+        self.parser = etree.XMLParser(recover=True)
+
+    def preprocess_data(self, data):
+        return _remove_script_tags(data)
+
+    def _parse(self, data):
+        inpsrc = sax.InputSource()
+        inpsrc.setByteStream(BytesIO(data))
+        try:
+            self._parser.parse(inpsrc)
+        except sax.SAXParseException as exc:
+            new_exc = AssertionError(u'invalid document: %s' % exc)
+            new_exc.position = (exc._linenum, exc._colnum)
+            raise new_exc
+        return super(XMLSyntaxValidator, self)._parse(data)
+
+
+class HTMLValidator(Validator):
+
+    def __init__(self):
+        Validator.__init__(self)
+        self.parser = etree.HTMLParser(recover=False)
+
+    def preprocess_data(self, data):
+        return _remove_script_tags(data)
+
+
+class PageInfo(object):
+    """holds various information on the view's output"""
+    def __init__(self, source, root):
+        self.source = source
+        self.etree = root
+        self.raw_text = u''.join(root.xpath('//text()'))
+        self.namespace = self.etree.nsmap
+        self.default_ns = self.namespace.get(None)
+        self.a_tags = self.find_tag('a')
+        self.h1_tags = self.find_tag('h1')
+        self.h2_tags = self.find_tag('h2')
+        self.h3_tags = self.find_tag('h3')
+        self.h4_tags = self.find_tag('h4')
+        self.input_tags = self.find_tag('input')
+        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
+
+    def _iterstr(self, tag):
+        if self.default_ns is None:
+            return ".//%s" % tag
+        else:
+            return ".//{%s}%s" % (self.default_ns, tag)
+
+    def matching_nodes(self, tag, **attrs):
+        for elt in self.etree.iterfind(self._iterstr(tag)):
+            eltattrs = elt.attrib
+            for attr, value in attrs.items():
+                try:
+                    if eltattrs[attr] != value:
+                        break
+                except KeyError:
+                    break
+            else:  # all attributes match
+                yield elt
+
+    def has_tag(self, tag, nboccurs=1, **attrs):
+        """returns True if a tag with the given attributes appears in the page
+        `nboccurs` times (any number of times if None)
+        """
+        for elt in self.matching_nodes(tag, **attrs):
+            if nboccurs is None:  # no need to check the number of occurrences
+                return True
+            if not nboccurs:  # too many occurrences
+                return False
+            nboccurs -= 1
+        if nboccurs == 0:  # correct number of occurrences
+            return True
+        return False  # no matching tag/attrs
+
+    def find_tag(self, tag, gettext=True):
+        """return a list which contains the text of all "tag" elements"""
+        iterstr = self._iterstr(tag)
+        if not gettext or tag in ('a', 'input'):
+            return [(elt.text, elt.attrib)
+                    for elt in self.etree.iterfind(iterstr)]
+        return [u''.join(elt.xpath('.//text()'))
+                for elt in self.etree.iterfind(iterstr)]
+
+    def appears(self, text):
+        """returns True if <text> appears in the page"""
+        return text in self.raw_text
+
+    def __contains__(self, text):
+        return text in self.source
+
+    def has_title(self, text, level=None):
+        """returns True if <h?>text</h?> was found in the page
+
+        :param level: the title's level (1 for h1, 2 for h2, etc.)
+        """
+        if level is None:
+            for hlist in self.title_tags:
+                if text in hlist:
+                    return True
+            return False
+        else:
+            hlist = self.title_tags[level - 1]
+            return text in hlist
+
+    def has_title_regexp(self, pattern, level=None):
+        """returns True if <h?>pattern</h?> was found in the page"""
+        sre = re.compile(pattern)
+        if level is None:
+            for hlist in self.title_tags:
+                for title in hlist:
+                    if sre.match(title):
+                        return True
+            return False
+        else:
+            hlist = self.title_tags[level - 1]
+            for title in hlist:
+                if sre.match(title):
+                    return True
+            return False
+
+    def has_link(self, text, url=None):
+        """returns True if <a href=url>text</a> was found in the page"""
+        for link_text, attrs in self.a_tags:
+            if text == link_text:
+                if url is None:
+                    return True
+                try:
+                    href = attrs['href']
+                    if href == url:
+                        return True
+                except KeyError:
+                    continue
+        return False
+
+    def has_link_regexp(self, pattern, url=None):
+        """returns True if <a href=url>pattern</a> was found in the page"""
+        sre = re.compile(pattern)
+        for link_text, attrs in self.a_tags:
+            if sre.match(link_text):
+                if url is None:
+                    return True
+                try:
+                    href = attrs['href']
+                    if href == url:
+                        return True
+                except KeyError:
+                    continue
+        return False
+
+
+VALMAP = {None: None,
+          'dtd': DTDValidator,
+          'xml': XMLValidator,
+          'html': HTMLValidator,
+          }
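
A note on _remove_script_tags, since both XMLSyntaxValidator and HTMLValidator funnel their input through it: markup embedded in a JS string is exactly what makes the strict parsers choke. A minimal sketch of the regexp's effect (the input snippet is invented for illustration; the regexp is the one defined above):

    import re

    _REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M | re.I | re.S)

    # a '<p>' inside a JS string is not well-formed XML, so the whole
    # script element is stripped before parsing
    data = b'<div><script type="text/javascript">var t = "<p>hi</p>";</script></div>'
    print(_REM_SCRIPT_RGX.sub(b'', data))  # b'<div></div>'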
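The blockquote surgery in DTDValidator.preprocess_data is easier to see in isolation: docutils may emit blockquote content that the strict DTD forbids, and the validator simply drops every <blockquote> element before re-validating against the strict doctype. A standalone sketch of that tree manipulation, with an invented document:

    from lxml import etree

    tree = etree.fromstring('<body><blockquote>bare text, invalid under the '
                            'strict DTD</blockquote><p>kept</p></body>')
    # quick and dirty, as the comment above says: drop the blockquotes wholesale
    for blockquote in tree.findall('.//blockquote'):
        blockquote.getparent().remove(blockquote)
    print(etree.tostring(tree))  # b'<body><p>kept</p></body>'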
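XMLSyntaxValidator deliberately parses twice: a plain SAX pass that turns any well-formedness error into an AssertionError carrying a (line, column) position, then lxml's recovering parser to build the tree for PageInfo. A sketch of the resulting behaviour, with invented documents:

    from cubicweb.devtools.htmlparser import XMLSyntaxValidator

    validator = XMLSyntaxValidator()
    # well-formed: both passes succeed and a PageInfo is returned
    page = validator.parse_string(
        b'<div xmlns="http://www.w3.org/1999/xhtml"><p>ok</p></div>')

    # not well-formed: the SAX pass fails before lxml ever sees the data
    try:
        validator.parse_string(b'<div><p>unclosed</div>')
    except AssertionError as exc:
        print(exc.position)  # (line, column) as reported by the SAX parser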
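PageInfo is what tests actually interact with once parsing succeeds. A minimal sketch of its query helpers, using HTMLValidator on an invented page:

    from cubicweb.devtools.htmlparser import HTMLValidator

    html = (b'<html><body><h1>welcome</h1>'
            b'<a href="http://example.org/">example</a>'
            b'<input type="hidden" name="__form_id" value="x">'
            b'</body></html>')

    page = HTMLValidator().parse_string(html)  # AssertionError if unparsable
    assert page.has_title('welcome', level=1)
    assert page.has_link('example', url='http://example.org/')
    assert page.has_tag('input', nboccurs=1, type='hidden')
    assert page.appears('welcome')                # searches the extracted text
    assert b'href="http://example.org/"' in page  # __contains__ searches raw source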
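Finally, VALMAP keys the validator classes by the short names used in test configuration, with None meaning no validation. A sketch of the kind of dispatch a test harness can build on it; the validate() helper is invented, only the dict itself comes from this module:

    from cubicweb.devtools.htmlparser import VALMAP

    def validate(output_type, data):
        # hypothetical helper: pick the validator registered for this output
        # type ('dtd', 'xml', 'html', or None to skip validation entirely)
        cls = VALMAP[output_type]
        if cls is None:
            return None
        return cls().parse_string(data)

    page = validate('html', b'<html><body><h1>hello</h1></body></html>')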