"""defines a validating HTML parser used in web application tests

:organization: Logilab
:copyright: 2001-2009 LOGILAB S.A. (Paris, FRANCE), license is LGPL v2.
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
:license: GNU Lesser General Public License, v2.1 - http://www.gnu.org/licenses
"""

import re

from lxml import etree

from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE

# the doctypes are used below for substring tests and string formatting,
# so make sure they are plain `str` objects
STRICT_DOCTYPE = str(STRICT_DOCTYPE)
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)

ERR_COUNT = 0


class Validator(object):
    """base class for validators

    Concrete subclasses are expected to set `self.parser` to the lxml
    parser used by `parse_string`.
    """

    def parse_string(self, data, sysid=None):
        """parse `data` and return a `PageInfo` describing the result

        :param data: the markup to parse
        :param sysid: unused by this implementation
        :raise AssertionError: if lxml reports a syntax error; the raised
          exception carries the original error's `position` attribute
        """
        try:
            data = self.preprocess_data(data)
            return PageInfo(data, etree.fromstring(data, self.parser))
        except etree.XMLSyntaxError as exc:
            def save_in(fname=''):
                # never called from this module; presumably kept as a
                # debugging aid to dump the offending markup by hand
                with open(fname, 'w') as stream:
                    stream.write(data)
            new_exc = AssertionError(u'invalid xml %s' % exc)
            new_exc.position = exc.position
            raise new_exc

    def preprocess_data(self, data):
        """hook called before parsing; the default returns `data` unchanged"""
        return data


class DTDValidator(Validator):
    """validator using an XML parser with DTD validation turned on"""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser(dtd_validation=True)

    def preprocess_data(self, data):
        """used to fix potential blockquote mess generated by docutils

        If `data` contains the strict doctype, it is parsed against the
        transitional doctype instead, every <blockquote> element is removed,
        and the tree is serialized again under the strict doctype. Any other
        input is returned untouched.
        """
        if STRICT_DOCTYPE not in data:
            return data
        # parse using transitional DTD
        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
        tree = etree.fromstring(data, self.parser)
        namespace = tree.nsmap.get(None)
        if namespace:
            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
        else:
            blockquotes = tree.findall('.//blockquote')
        # quick and dirty approach: remove all blockquotes
        for blockquote in blockquotes:
            blockquote.getparent().remove(blockquote)
        data = etree.tostring(tree)
        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
            STRICT_DOCTYPE, data)


class SaxOnlyValidator(Validator):
    """validator using a plain XML parser (no DTD validation requested)"""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser()


class HTMLValidator(Validator):
    """validator using lxml's HTML parser"""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.HTMLParser()


class PageInfo(object):
    """holds various informations on the view's output"""

    def __init__(self, source, root):
        """
        :param source: the markup that was parsed
        :param root: the root element obtained by parsing `source`
        """
        self.source = source
        self.etree = root
        self.raw_text = u''.join(root.xpath('//text()'))
        self.namespace = self.etree.nsmap
        self.default_ns = self.namespace.get(None)
        self.a_tags = self.find_tag('a')
        self.h1_tags = self.find_tag('h1')
        self.h2_tags = self.find_tag('h2')
        self.h3_tags = self.find_tag('h3')
        self.h4_tags = self.find_tag('h4')
        self.input_tags = self.find_tag('input')
        # indexed by title level - 1
        self.title_tags = [self.h1_tags, self.h2_tags,
                           self.h3_tags, self.h4_tags]

    def find_tag(self, tag, gettext=True):
        """return a list which contains text of all "tag" elements

        For 'a' and 'input' tags, or when `gettext` is false, the list holds
        `(element.text, element.attrib)` tuples instead of the concatenated
        text of each element.
        """
        if self.default_ns is None:
            iterstr = ".//%s" % tag
        else:
            iterstr = ".//{%s}%s" % (self.default_ns, tag)
        if not gettext or tag in ('a', 'input'):
            return [(elt.text, elt.attrib)
                    for elt in self.etree.iterfind(iterstr)]
        return [u''.join(elt.xpath('.//text()'))
                for elt in self.etree.iterfind(iterstr)]

    def appears(self, text):
        """returns True if <text> appears in the page"""
        return text in self.raw_text

    def __contains__(self, text):
        """returns True if <text> appears in the page's source"""
        return text in self.source

    def _title_lists(self, level):
        """return the title lists to inspect for `level` (all if None)"""
        if level is None:
            return self.title_tags
        return [self.title_tags[level - 1]]

    @staticmethod
    def _href_matches(attrs, url):
        """return True if `url` is None or equals the 'href' of `attrs`"""
        return url is None or attrs.get('href') == url

    def has_title(self, text, level=None):
        """returns True if <h?>text</h?>

        :param level: the title's level (1 for h1, 2 for h2, etc.)
        """
        for hlist in self._title_lists(level):
            if text in hlist:
                return True
        return False

    def has_title_regexp(self, pattern, level=None):
        """returns True if <h?>pattern</h?>

        :param level: the title's level (1 for h1, 2 for h2, etc.)
        """
        sre = re.compile(pattern)
        for hlist in self._title_lists(level):
            for title in hlist:
                if sre.match(title):
                    return True
        return False

    def has_link(self, text, url=None):
        """returns True if <a href=url>text</a> was found in the page"""
        for link_text, attrs in self.a_tags:
            if text == link_text and self._href_matches(attrs, url):
                return True
        return False

    def has_link_regexp(self, pattern, url=None):
        """returns True if <a href=url>pattern</a> was found in the page"""
        sre = re.compile(pattern)
        for link_text, attrs in self.a_tags:
            if link_text is None:
                # the anchor has no leading text (e.g. it only wraps an
                # image): there is nothing to match the pattern against
                continue
            if sre.match(link_text) and self._href_matches(attrs, url):
                return True
        return False