# copyright 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of CubicWeb.
#
# CubicWeb is free software: you can redistribute it and/or modify it under the
# terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with CubicWeb. If not, see <http://www.gnu.org/licenses/>.
"""defines a validating HTML parser used in web application tests"""
import re
import sys
from xml import sax
from cStringIO import StringIO
from lxml import etree
from logilab.common.deprecation import class_deprecated, class_renamed
from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
STRICT_DOCTYPE = str(STRICT_DOCTYPE)
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
ERR_COUNT = 0
_REM_SCRIPT_RGX = re.compile(r"<script[^>]*>.*?</script>", re.U|re.M|re.I|re.S)
def _remove_script_tags(data):
"""Remove the script (usually javascript) tags to help the lxml
XMLParser / HTMLParser do their job. Without that, they choke on
tags embedded in JS strings.
"""
    # Note: we could use lxml's Cleaner here, but it's far too intrusive:
#
# cleaner = Cleaner(scripts=True,
# javascript=False,
# comments=False,
# style=False,
# links=False,
# meta=False,
# page_structure=False,
# processing_instructions=False,
# embedded=False,
# frames=False,
# forms=False,
# annoying_tags=False,
# remove_tags=(),
# remove_unknown_tags=False,
# safe_attrs_only=False,
# add_nofollow=False)
# >>> cleaner.clean_html('<body></body>')
# '<span></span>'
# >>> cleaner.clean_html('<!DOCTYPE html><body></body>')
# '<html><body></body></html>'
# >>> cleaner.clean_html('<body><div/></body>')
# '<div></div>'
# >>> cleaner.clean_html('<html><body><div/><br></body><html>')
# '<html><body><div></div><br></body></html>'
# >>> cleaner.clean_html('<html><body><div/><br><span></body><html>')
# '<html><body><div></div><br><span></span></body></html>'
#
    # using that, we'd miss most of the actual validation errors we want
    # to catch. For now, use a dumb regexp.
return _REM_SCRIPT_RGX.sub('', data)
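# Illustration (a sketch, not taken from the test suite): the regexp above turns
# '<div><script type="text/javascript">var s = "<p>";</script></div>'
# into '<div></div>', so the '<p>' embedded in the JS string can no longer
# confuse the XML / HTML parsers.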
class Validator(object):
""" base validator API """
parser = None
def parse_string(self, source):
etree = self._parse(self.preprocess_data(source))
return PageInfo(source, etree)
def preprocess_data(self, data):
return data
def _parse(self, pdata):
try:
return etree.fromstring(pdata, self.parser)
except etree.XMLSyntaxError as exc:
new_exc = AssertionError(u'invalid document: %s' % exc)
new_exc.position = exc.position
raise new_exc
class DTDValidator(Validator):
def __init__(self):
Validator.__init__(self)
# XXX understand what's happening under windows
self.parser = etree.XMLParser(dtd_validation=sys.platform != 'win32')
def preprocess_data(self, data):
"""used to fix potential blockquote mess generated by docutils"""
if STRICT_DOCTYPE not in data:
return data
# parse using transitional DTD
data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
tree = self._parse(data)
namespace = tree.nsmap.get(None)
# this is the list of authorized child tags for <blockquote> nodes
expected = 'p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr blockquote address ' \
'fieldset table form noscript ins del script'.split()
if namespace:
blockquotes = tree.findall('.//{%s}blockquote' % namespace)
expected = ['{%s}%s' % (namespace, tag) for tag in expected]
else:
blockquotes = tree.findall('.//blockquote')
# quick and dirty approach: remove all blockquotes
for blockquote in blockquotes:
parent = blockquote.getparent()
parent.remove(blockquote)
data = etree.tostring(tree)
return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
STRICT_DOCTYPE, data)
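    # For instance (illustrative markup, not from actual docutils output):
    # <blockquote><img src="..."/></blockquote> is rejected by the strict DTD;
    # preprocess_data drops such <blockquote> nodes (children included) and
    # re-emits the document with the strict doctype before validation.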
class XMLValidator(Validator):
"""XML validator, checks that XML is well-formed and used XMLNS are defined"""
def __init__(self):
Validator.__init__(self)
self.parser = etree.XMLParser()
SaxOnlyValidator = class_renamed('SaxOnlyValidator',
XMLValidator,
'[3.17] you should use the '
'XMLValidator class instead')
class XMLSyntaxValidator(Validator):
"""XML syntax validator, check XML is well-formed"""
class MySaxErrorHandler(sax.ErrorHandler):
"""override default handler to avoid choking because of unknown entity"""
def fatalError(self, exception):
# XXX check entity in htmlentitydefs
if not str(exception).endswith('undefined entity'):
raise exception
_parser = sax.make_parser()
_parser.setContentHandler(sax.handler.ContentHandler())
_parser.setErrorHandler(MySaxErrorHandler())
def __init__(self):
super(XMLSyntaxValidator, self).__init__()
        # XMLParser() wants XML namespaces to be defined,
        # XMLParser(recover=True) will accept almost anything
        #
        # -> use the latter, but first check XML well-formedness with a dumb
        #    SAX parser (see _parse below)
self.parser = etree.XMLParser(recover=True)
def preprocess_data(self, data):
return _remove_script_tags(data)
def _parse(self, data):
inpsrc = sax.InputSource()
inpsrc.setByteStream(StringIO(data))
try:
self._parser.parse(inpsrc)
        except sax.SAXParseException as exc:
new_exc = AssertionError(u'invalid document: %s' % exc)
            new_exc.position = (exc.getLineNumber(), exc.getColumnNumber())
raise new_exc
return super(XMLSyntaxValidator, self)._parse(data)
class HTMLValidator(Validator):
def __init__(self):
Validator.__init__(self)
self.parser = etree.HTMLParser(recover=False)
def preprocess_data(self, data):
return _remove_script_tags(data)
class PageInfo(object):
"""holds various informations on the view's output"""
def __init__(self, source, root):
self.source = source
self.etree = root
self.raw_text = u''.join(root.xpath('//text()'))
self.namespace = self.etree.nsmap
self.default_ns = self.namespace.get(None)
self.a_tags = self.find_tag('a')
self.h1_tags = self.find_tag('h1')
self.h2_tags = self.find_tag('h2')
self.h3_tags = self.find_tag('h3')
self.h4_tags = self.find_tag('h4')
self.input_tags = self.find_tag('input')
self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
def _iterstr(self, tag):
if self.default_ns is None:
return ".//%s" % tag
else:
return ".//{%s}%s" % (self.default_ns, tag)
def matching_nodes(self, tag, **attrs):
for elt in self.etree.iterfind(self._iterstr(tag)):
eltattrs = elt.attrib
for attr, value in attrs.iteritems():
try:
if eltattrs[attr] != value:
break
except KeyError:
break
else: # all attributes match
yield elt
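    # e.g. (hypothetical attribute values) matching_nodes('input', type='hidden')
    # yields only the <input> elements whose 'type' attribute is exactly 'hidden'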
    def has_tag(self, tag, nboccurs=1, **attrs):
        """returns True if a tag with the given attributes appears in the page
        exactly `nboccurs` times (any number of occurrences if None)
        """
        for elt in self.matching_nodes(tag, **attrs):
            if nboccurs is None: # no need to check the number of occurrences
                return True
            if not nboccurs: # too many occurrences
                return False
            nboccurs -= 1
        return nboccurs == 0 # True when the expected number of occurrences was found
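    # e.g. page.has_tag('h1') checks that the page contains exactly one <h1>;
    # pass nboccurs=None to only check that the tag is present at all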
def find_tag(self, tag, gettext=True):
"""return a list which contains text of all "tag" elements """
iterstr = self._iterstr(tag)
if not gettext or tag in ('a', 'input'):
return [(elt.text, elt.attrib)
for elt in self.etree.iterfind(iterstr)]
return [u''.join(elt.xpath('.//text()'))
for elt in self.etree.iterfind(iterstr)]
def appears(self, text):
"""returns True if <text> appears in the page"""
return text in self.raw_text
def __contains__(self, text):
return text in self.source
def has_title(self, text, level=None):
"""returns True if <h?>text</h?>
:param level: the title's level (1 for h1, 2 for h2, etc.)
"""
if level is None:
for hlist in self.title_tags:
if text in hlist:
return True
return False
else:
hlist = self.title_tags[level - 1]
return text in hlist
def has_title_regexp(self, pattern, level=None):
"""returns True if <h?>pattern</h?>"""
sre = re.compile(pattern)
if level is None:
for hlist in self.title_tags:
for title in hlist:
if sre.match(title):
return True
return False
else:
hlist = self.title_tags[level - 1]
for title in hlist:
if sre.match(title):
return True
return False
def has_link(self, text, url=None):
"""returns True if <a href=url>text</a> was found in the page"""
for link_text, attrs in self.a_tags:
if text == link_text:
if url is None:
return True
try:
href = attrs['href']
if href == url:
return True
except KeyError:
continue
return False
def has_link_regexp(self, pattern, url=None):
"""returns True if <a href=url>pattern</a> was found in the page"""
sre = re.compile(pattern)
for link_text, attrs in self.a_tags:
if sre.match(link_text):
if url is None:
return True
try:
href = attrs['href']
if href == url:
return True
except KeyError:
continue
return False
VALMAP = {None: None,
'dtd': DTDValidator,
'xml': XMLValidator,
'html': HTMLValidator,
}
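# Minimal usage sketch (hypothetical markup, not part of this module):
#
#   validator = VALMAP['html']()
#   page = validator.parse_string('<html><body><h1>hello</h1>'
#                                 '<a href="http://example.org/">link</a>'
#                                 '</body></html>')
#   assert page.has_title(u'hello', level=1)
#   assert u'hello' in page
#   assert page.has_link(u'link', url='http://example.org/')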