--- a/devtools/htmlparser.py Mon Jan 04 18:40:30 2016 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,314 +0,0 @@
-# copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
-#
-# This file is part of CubicWeb.
-#
-# CubicWeb is free software: you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with CubicWeb. If not, see <http://www.gnu.org/licenses/>.
-"""defines a validating HTML parser used in web application tests"""
-
-import re
-import sys
-from xml import sax
-from io import BytesIO
-
-from lxml import etree
-
-from logilab.common.deprecation import class_deprecated, class_renamed
-
-from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
-
-# coerce both doctype declarations imported from cubicweb.view with str(),
-# so the membership test and replace() done on them below use `str` objects
-STRICT_DOCTYPE = str(STRICT_DOCTYPE)
-TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
-
-# NOTE(review): not referenced anywhere else in the visible part of this
-# module -- presumably a leftover or read by external code, to be confirmed
-ERR_COUNT = 0
-
-_REM_SCRIPT_RGX = re.compile(br"<script[^>]*>.*?</script>", re.M|re.I|re.S)
-# same pattern, for pages which have already been decoded to text
-_REM_SCRIPT_TEXT_RGX = re.compile(u"<script[^>]*>.*?</script>", re.M|re.I|re.S)
-
-
-def _remove_script_tags(data):
-    """Remove the script (usually javascript) tags to help the lxml
-    XMLParser / HTMLParser do their job. Without that, they choke on
-    tags embedded in JS strings.
-
-    `data` may be a byte string or a text string, the result is of the
-    same kind as the input. Matching is case insensitive and a script
-    element may span several lines.
-    """
-    # Notice we may want to use lxml cleaner, but it's far too intrusive:
-    #
-    # cleaner = Cleaner(scripts=True,
-    #                   javascript=False,
-    #                   comments=False,
-    #                   style=False,
-    #                   links=False,
-    #                   meta=False,
-    #                   page_structure=False,
-    #                   processing_instructions=False,
-    #                   embedded=False,
-    #                   frames=False,
-    #                   forms=False,
-    #                   annoying_tags=False,
-    #                   remove_tags=(),
-    #                   remove_unknown_tags=False,
-    #                   safe_attrs_only=False,
-    #                   add_nofollow=False)
-    # >>> cleaner.clean_html('<body></body>')
-    # '<span></span>'
-    # >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
-    # '<html><body></body></html>'
-    # >>> cleaner.clean_html('<body><div/></body>')
-    # '<div></div>'
-    # >>> cleaner.clean_html('<html><body><div/><br></body><html>')
-    # '<html><body><div></div><br></body></html>'
-    # >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
-    # '<html><body><div></div><br><span></span></body></html>'
-    #
-    # using that, we'll miss most of the actual validation errors we want
-    # to catch. For now, use a dumb regexp
-    if isinstance(data, type(u'')):
-        # text input: a bytes pattern can't be applied to it
-        return _REM_SCRIPT_TEXT_RGX.sub(u'', data)
-    return _REM_SCRIPT_RGX.sub(b'', data)
-
-
-class Validator(object):
-    """Common API of the page validators.
-
-    Concrete validators set `parser` to the lxml parser given to
-    ``etree.fromstring`` and may override `preprocess_data` to massage
-    the page before it gets parsed.
-    """
-    parser = None
-
-    def preprocess_data(self, data):
-        """Hook called on the page before parsing; returns it untouched here."""
-        return data
-
-    def _parse(self, pdata):
-        """Parse `pdata` using `self.parser` and return the root element.
-
-        A lxml syntax error is turned into an AssertionError whose
-        `position` attribute holds the position reported by lxml.
-        """
-        try:
-            root = etree.fromstring(pdata, self.parser)
-        except etree.XMLSyntaxError as exc:
-            failure = AssertionError(u'invalid document: %s' % exc)
-            failure.position = exc.position
-            raise failure
-        return root
-
-    def parse_string(self, source):
-        """Parse `source` and return a PageInfo built from the original
-        source and the resulting tree.
-        """
-        prepared = self.preprocess_data(source)
-        return PageInfo(source, self._parse(prepared))
-
-
-class DTDValidator(Validator):
-    """Validator parsing pages with a DTD validating XML parser."""
-
-    def __init__(self):
-        Validator.__init__(self)
-        # XXX understand what's happening under windows
-        # (DTD validation is switched off on that platform)
-        self.parser = etree.XMLParser(dtd_validation=sys.platform != 'win32')
-
-    def preprocess_data(self, data):
-        """used to fix potential blockquote mess generated by docutils
-
-        Pages which don't contain the strict doctype are returned as is.
-        Otherwise the page is parsed once using the transitional doctype,
-        every <blockquote> element is removed from the tree, and the tree
-        is serialized again below an XML declaration and the strict
-        doctype.
-        """
-        if STRICT_DOCTYPE not in data:
-            return data
-        # parse using transitional DTD
-        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
-        tree = self._parse(data)
-        namespace = tree.nsmap.get(None)
-        if namespace:
-            path = './/{%s}blockquote' % namespace
-        else:
-            path = './/blockquote'
-        # quick and dirty approach: remove all blockquotes
-        for blockquote in tree.findall(path):
-            blockquote.getparent().remove(blockquote)
-        # NOTE(review): under python 3 etree.tostring() returns bytes by
-        # default, the interpolation below would then embed its repr --
-        # to be confirmed before relying on this code path there
-        data = etree.tostring(tree)
-        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
-            STRICT_DOCTYPE, data)
-
-
-class XMLValidator(Validator):
-    """XML validator: the page must be well-formed XML and every XML
-    namespace it uses must be defined.
-    """
-
-    def __init__(self):
-        super(XMLValidator, self).__init__()
-        # plain lxml XML parser, default options
-        self.parser = etree.XMLParser()
-
-# former name of XMLValidator, built by logilab's class_renamed helper
-# together with the message telling which class to use instead
-SaxOnlyValidator = class_renamed(
-    'SaxOnlyValidator', XMLValidator,
-    '[3.17] you should use the XMLValidator class instead')
-
-
-class XMLSyntaxValidator(Validator):
-    """XML syntax validator, check XML is well-formed"""
-
-    class MySaxErrorHandler(sax.ErrorHandler):
-        """override default handler to avoid choking because of unknown entity"""
-        def fatalError(self, exception):
-            # XXX check entity in htmlentitydefs
-            if not str(exception).endswith('undefined entity'):
-                raise exception
-
-    # SAX parser stored on the class, hence shared by every instance; it
-    # is only used to check well-formedness (no-op content handler)
-    _parser = sax.make_parser()
-    _parser.setContentHandler(sax.handler.ContentHandler())
-    _parser.setErrorHandler(MySaxErrorHandler())
-
-    def __init__(self):
-        super(XMLSyntaxValidator, self).__init__()
-        # XMLParser() wants xml namespaces defined
-        # XMLParser(recover=True) will accept almost anything
-        #
-        # -> use the latter but _parse will first check xml
-        #    well-formedness using a dumb SAX parser
-        self.parser = etree.XMLParser(recover=True)
-
-    def preprocess_data(self, data):
-        """Drop <script> elements before the page gets parsed."""
-        return _remove_script_tags(data)
-
-    def _parse(self, data):
-        """Check `data` is well-formed, then build its tree with lxml.
-
-        `data` is fed to the SAX parser as a byte stream. A SAX parse
-        error is turned into an AssertionError whose `position`
-        attribute is the (line, column) pair reported by the exception.
-        """
-        inpsrc = sax.InputSource()
-        inpsrc.setByteStream(BytesIO(data))
-        try:
-            self._parser.parse(inpsrc)
-        except sax.SAXParseException as exc:
-            new_exc = AssertionError(u'invalid document: %s' % exc)
-            # use the public locator accessors, not the private attributes
-            new_exc.position = (exc.getLineNumber(), exc.getColumnNumber())
-            raise new_exc
-        return super(XMLSyntaxValidator, self)._parse(data)
-
-
-class HTMLValidator(Validator):
-    """Validator parsing pages with lxml's HTML parser, created with
-    error recovery turned off.
-    """
-
-    def __init__(self):
-        super(HTMLValidator, self).__init__()
-        self.parser = etree.HTMLParser(recover=False)
-
-    def preprocess_data(self, data):
-        """Drop <script> elements before the page gets parsed."""
-        cleaned = _remove_script_tags(data)
-        return cleaned
-
-
-class PageInfo(object):
-    """holds various pieces of information on the view's output
-
-    Attributes set at construction time:
-
-    * `source`, the page as given to the validator
-    * `etree`, the root element of the parsed page
-    * `raw_text`, the concatenation of every text node of the page
-    * `namespace` (the root's namespace map) and `default_ns`
-    * `a_tags` and `input_tags`, lists of (text, attributes) pairs
-    * `h1_tags` ... `h4_tags`, lists of title texts, also gathered by
-      level in `title_tags`
-    """
-    def __init__(self, source, root):
-        self.source = source
-        self.etree = root
-        self.raw_text = u''.join(root.xpath('//text()'))
-        self.namespace = self.etree.nsmap
-        self.default_ns = self.namespace.get(None)
-        self.a_tags = self.find_tag('a')
-        self.h1_tags = self.find_tag('h1')
-        self.h2_tags = self.find_tag('h2')
-        self.h3_tags = self.find_tag('h3')
-        self.h4_tags = self.find_tag('h4')
-        self.input_tags = self.find_tag('input')
-        # index `level - 1` gives the titles of that level
-        self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
-
-    def _iterstr(self, tag):
-        """return the path matching every `tag` element of the page,
-        qualified with the default namespace when there is one
-        """
-        if self.default_ns is None:
-            return ".//%s" % tag
-        return ".//{%s}%s" % (self.default_ns, tag)
-
-    def matching_nodes(self, tag, **attrs):
-        """yield `tag` elements having every given attribute set to the
-        given value
-        """
-        for elt in self.etree.iterfind(self._iterstr(tag)):
-            eltattrs = elt.attrib
-            if all(attr in eltattrs and eltattrs[attr] == value
-                   for attr, value in attrs.items()):
-                yield elt
-
-    def has_tag(self, tag, nboccurs=1, **attrs):
-        """returns True if tag with given attributes appears in the page
-        exactly `nboccurs` times (any number of times, but at least once,
-        if `nboccurs` is None)
-        """
-        matching = self.matching_nodes(tag, **attrs)
-        if nboccurs is None:  # no need to check number of occurrences
-            # compare to None: an element without children is falsy
-            return next(matching, None) is not None
-        found = 0
-        for _ in matching:
-            found += 1
-            if found > nboccurs:  # too many occurrences, stop here
-                return False
-        # also handles the "no matching tag/attrs" case
-        return found == nboccurs
-
-    def find_tag(self, tag, gettext=True):
-        """return a list with one item per `tag` element
-
-        Items are the whole text found below the element, except for
-        'a' and 'input' tags or when `gettext` is false: items are then
-        (direct text, attributes) pairs, where the text may be None.
-        """
-        iterstr = self._iterstr(tag)
-        if not gettext or tag in ('a', 'input'):
-            return [(elt.text, elt.attrib)
-                    for elt in self.etree.iterfind(iterstr)]
-        return [u''.join(elt.xpath('.//text()'))
-                for elt in self.etree.iterfind(iterstr)]
-
-    def appears(self, text):
-        """returns True if <text> appears in the page's text nodes"""
-        return text in self.raw_text
-
-    def __contains__(self, text):
-        # unlike `appears`, this looks into the page source (markup included)
-        return text in self.source
-
-    def has_title(self, text, level=None):
-        """returns True if <h?>text</h?>
-
-        :param level: the title's level (1 for h1, 2 for h2, etc.), every
-                      level from h1 to h4 is searched when None
-        """
-        if level is None:
-            return any(text in hlist for hlist in self.title_tags)
-        return text in self.title_tags[level - 1]
-
-    def has_title_regexp(self, pattern, level=None):
-        """returns True if <h?>pattern</h?>
-
-        The pattern is matched from the beginning of the title text.
-        `level` is handled as in `has_title`.
-        """
-        match = re.compile(pattern).match
-        if level is None:
-            hlists = self.title_tags
-        else:
-            hlists = [self.title_tags[level - 1]]
-        return any(match(title) for hlist in hlists for title in hlist)
-
-    def has_link(self, text, url=None):
-        """returns True if <a href=url>text</a> was found in the page
-
-        The href attribute is not checked when `url` is None.
-        """
-        for link_text, attrs in self.a_tags:
-            if text == link_text and (url is None or attrs.get('href') == url):
-                return True
-        return False
-
-    def has_link_regexp(self, pattern, url=None):
-        """returns True if <a href=url>pattern</a> was found in the page
-
-        The pattern is matched from the beginning of the link text, the
-        href attribute is not checked when `url` is None.
-        """
-        match = re.compile(pattern).match
-        for link_text, attrs in self.a_tags:
-            # links without direct text (e.g. wrapping an image) have a
-            # None text, which can't be given to the regexp
-            if link_text is None or not match(link_text):
-                continue
-            if url is None or attrs.get('href') == url:
-                return True
-        return False
-
-# validator class for each validation mode name (None maps to no validator)
-VALMAP = dict([
-    (None, None),
-    ('dtd', DTDValidator),
-    ('xml', XMLValidator),
-    ('html', HTMLValidator),
-])