--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cubicweb/devtools/htmlparser.py Sat Jan 16 13:48:51 2016 +0100
@@ -0,0 +1,314 @@
+# copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
+#
+# This file is part of CubicWeb.
+#
+# CubicWeb is free software: you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with CubicWeb. If not, see <http://www.gnu.org/licenses/>.
+"""defines a validating HTML parser used in web application tests"""
+
+import re
+import sys
+from xml import sax
+from io import BytesIO
+
+from lxml import etree
+
+from logilab.common.deprecation import class_deprecated, class_renamed
+
+from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
+
+STRICT_DOCTYPE = str(STRICT_DOCTYPE)
+TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
+
+ERR_COUNT = 0
+
+_REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M|re.I|re.S)
+def _remove_script_tags(data):
+ """Remove the script (usually javascript) tags to help the lxml
+ XMLParser / HTMLParser do their job. Without that, they choke on
+ tags embedded in JS strings.
+ """
+ # Notice we may want to use lxml cleaner, but it's far too intrusive:
+ #
+ # cleaner = Cleaner(scripts=True,
+ # javascript=False,
+ # comments=False,
+ # style=False,
+ # links=False,
+ # meta=False,
+ # page_structure=False,
+ # processing_instructions=False,
+ # embedded=False,
+ # frames=False,
+ # forms=False,
+ # annoying_tags=False,
+ # remove_tags=(),
+ # remove_unknown_tags=False,
+ # safe_attrs_only=False,
+ # add_nofollow=False)
+ # >>> cleaner.clean_html('<body></body>')
+ # '<span></span>'
+ # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
+ # '<html><body></body></html>'
+ # >>> cleaner.clean_html('<body><div/></body>')
+ # '<div></div>'
+ # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
+ # '<html><body><div></div><br></body></html>'
+ # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
+ # '<html><body><div></div><br><span></span></body></html>'
+ #
+ # using that, we'll miss most actual validation error we want to
+ # catch. For now, use dumb regexp
+ return _REM_SCRIPT_RGX.sub(b'', data)
+
+
+class Validator(object):
+ """ base validator API """
+ parser = None
+
+ def parse_string(self, source):
+ etree = self._parse(self.preprocess_data(source))
+ return PageInfo(source, etree)
+
+ def preprocess_data(self, data):
+ return data
+
+ def _parse(self, pdata):
+ try:
+ return etree.fromstring(pdata, self.parser)
+ except etree.XMLSyntaxError as exc:
+ new_exc = AssertionError(u'invalid document: %s' % exc)
+ new_exc.position = exc.position
+ raise new_exc
+
+
+class DTDValidator(Validator):
+ def __init__(self):
+ Validator.__init__(self)
+ # XXX understand what's happening under windows
+ self.parser = etree.XMLParser(dtd_validation=sys.platform != 'win32')
+
+ def preprocess_data(self, data):
+ """used to fix potential blockquote mess generated by docutils"""
+ if STRICT_DOCTYPE not in data:
+ return data
+ # parse using transitional DTD
+ data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
+ tree = self._parse(data)
+ namespace = tree.nsmap.get(None)
+ # this is the list of authorized child tags for <blockquote> nodes
+ expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
+ 'fieldset table form noscript ins del script'.split()
+ if namespace:
+ blockquotes = tree.findall('.//{%s}blockquote' % namespace)
+ expected = ['{%s}%s' % (namespace, tag) for tag in expected]
+ else:
+ blockquotes = tree.findall('.//blockquote')
+ # quick and dirty approach: remove all blockquotes
+ for blockquote in blockquotes:
+ parent = blockquote.getparent()
+ parent.remove(blockquote)
+ data = etree.tostring(tree)
+ return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
+ STRICT_DOCTYPE, data)
+
+
+class XMLValidator(Validator):
+ """XML validator, checks that XML is well-formed and used XMLNS are defined"""
+
+ def __init__(self):
+ Validator.__init__(self)
+ self.parser = etree.XMLParser()
+
+SaxOnlyValidator = class_renamed('SaxOnlyValidator',
+ XMLValidator,
+ '[3.17] you should use the '
+ 'XMLValidator class instead')
+
+
+class XMLSyntaxValidator(Validator):
+ """XML syntax validator, check XML is well-formed"""
+
+ class MySaxErrorHandler(sax.ErrorHandler):
+ """override default handler to avoid choking because of unknown entity"""
+ def fatalError(self, exception):
+ # XXX check entity in htmlentitydefs
+ if not str(exception).endswith('undefined entity'):
+ raise exception
+ _parser = sax.make_parser()
+ _parser.setContentHandler(sax.handler.ContentHandler())
+ _parser.setErrorHandler(MySaxErrorHandler())
+
+ def __init__(self):
+ super(XMLSyntaxValidator, self).__init__()
+ # XMLParser() wants xml namespaces defined
+ # XMLParser(recover=True) will accept almost anything
+ #
+ # -> use the later but preprocess will check xml well-formness using a
+ # dumb SAX parser
+ self.parser = etree.XMLParser(recover=True)
+
+ def preprocess_data(self, data):
+ return _remove_script_tags(data)
+
+ def _parse(self, data):
+ inpsrc = sax.InputSource()
+ inpsrc.setByteStream(BytesIO(data))
+ try:
+ self._parser.parse(inpsrc)
+ except sax.SAXParseException as exc:
+ new_exc = AssertionError(u'invalid document: %s' % exc)
+ new_exc.position = (exc._linenum, exc._colnum)
+ raise new_exc
+ return super(XMLSyntaxValidator, self)._parse(data)
+
+
+class HTMLValidator(Validator):
+
+ def __init__(self):
+ Validator.__init__(self)
+ self.parser = etree.HTMLParser(recover=False)
+
+ def preprocess_data(self, data):
+ return _remove_script_tags(data)
+
+
+class PageInfo(object):
+ """holds various informations on the view's output"""
+ def __init__(self, source, root):
+ self.source = source
+ self.etree = root
+ self.raw_text = u''.join(root.xpath('//text()'))
+ self.namespace = self.etree.nsmap
+ self.default_ns = self.namespace.get(None)
+ self.a_tags = self.find_tag('a')
+ self.h1_tags = self.find_tag('h1')
+ self.h2_tags = self.find_tag('h2')
+ self.h3_tags = self.find_tag('h3')
+ self.h4_tags = self.find_tag('h4')
+ self.input_tags = self.find_tag('input')
+ self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
+
+ def _iterstr(self, tag):
+ if self.default_ns is None:
+ return ".//%s" % tag
+ else:
+ return ".//{%s}%s" % (self.default_ns, tag)
+
+ def matching_nodes(self, tag, **attrs):
+ for elt in self.etree.iterfind(self._iterstr(tag)):
+ eltattrs = elt.attrib
+ for attr, value in attrs.items():
+ try:
+ if eltattrs[attr] != value:
+ break
+ except KeyError:
+ break
+ else: # all attributes match
+ yield elt
+
+ def has_tag(self, tag, nboccurs=1, **attrs):
+ """returns True if tag with given attributes appears in the page
+ `nbtimes` (any if None)
+ """
+ for elt in self.matching_nodes(tag, **attrs):
+ if nboccurs is None: # no need to check number of occurences
+ return True
+ if not nboccurs: # too much occurences
+ return False
+ nboccurs -= 1
+ if nboccurs == 0: # correct number of occurences
+ return True
+ return False # no matching tag/attrs
+
+ def find_tag(self, tag, gettext=True):
+ """return a list which contains text of all "tag" elements """
+ iterstr = self._iterstr(tag)
+ if not gettext or tag in ('a', 'input'):
+ return [(elt.text, elt.attrib)
+ for elt in self.etree.iterfind(iterstr)]
+ return [u''.join(elt.xpath('.//text()'))
+ for elt in self.etree.iterfind(iterstr)]
+
+ def appears(self, text):
+ """returns True if <text> appears in the page"""
+ return text in self.raw_text
+
+ def __contains__(self, text):
+ return text in self.source
+
+ def has_title(self, text, level=None):
+ """returns True if <h?>text</h?>
+
+ :param level: the title's level (1 for h1, 2 for h2, etc.)
+ """
+ if level is None:
+ for hlist in self.title_tags:
+ if text in hlist:
+ return True
+ return False
+ else:
+ hlist = self.title_tags[level - 1]
+ return text in hlist
+
+ def has_title_regexp(self, pattern, level=None):
+ """returns True if <h?>pattern</h?>"""
+ sre = re.compile(pattern)
+ if level is None:
+ for hlist in self.title_tags:
+ for title in hlist:
+ if sre.match(title):
+ return True
+ return False
+ else:
+ hlist = self.title_tags[level - 1]
+ for title in hlist:
+ if sre.match(title):
+ return True
+ return False
+
+ def has_link(self, text, url=None):
+ """returns True if <a href=url>text</a> was found in the page"""
+ for link_text, attrs in self.a_tags:
+ if text == link_text:
+ if url is None:
+ return True
+ try:
+ href = attrs['href']
+ if href == url:
+ return True
+ except KeyError:
+ continue
+ return False
+
+ def has_link_regexp(self, pattern, url=None):
+ """returns True if <a href=url>pattern</a> was found in the page"""
+ sre = re.compile(pattern)
+ for link_text, attrs in self.a_tags:
+ if sre.match(link_text):
+ if url is None:
+ return True
+ try:
+ href = attrs['href']
+ if href == url:
+ return True
+ except KeyError:
+ continue
+ return False
+
+VALMAP = {None: None,
+ 'dtd': DTDValidator,
+ 'xml': XMLValidator,
+ 'html': HTMLValidator,
+ }