AdaptedList -> SameETypeList
*NO BW COMPAT*: take advantage of the cw 3.6 release of folder, file and blog,
which use it, so that they get updated at the same time.
CMHN and PEGASE will need an update (but they won't move to 3.6 without one,
so this seems fine).
"""defines a validating HTML parser used in web application tests

:organization: Logilab
:copyright: 2001-2010 LOGILAB S.A. (Paris, FRANCE), license is LGPL v2.
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
:license: GNU Lesser General Public License, v2.1 - http://www.gnu.org/licenses
"""

import re
import sys

from lxml import etree

from cubicweb.view import STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE
STRICT_DOCTYPE = str(STRICT_DOCTYPE)
TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)

ERR_COUNT = 0


class Validator(object):
    """base class for validators

    Concrete classes are expected to set `self.parser` (an lxml parser) in
    their constructor: `parse_string` hands it to `etree.fromstring`.
    """

    def parse_string(self, data, sysid=None):
        """parse `data` and return a :class:`PageInfo` wrapping the result

        :param data: the page's source
        :param sysid: unused by this implementation
        :raise AssertionError: if lxml reports a syntax error; the exception
          carries the `position` attribute of the original
          `etree.XMLSyntaxError`
        """
        try:
            data = self.preprocess_data(data)
            return PageInfo(data, etree.fromstring(data, self.parser))
        except etree.XMLSyntaxError as exc:
            # NOTE(review): `save_in` is never called from this module; it
            # looks like a helper meant to be invoked by hand (e.g. from a
            # debugger) to dump the offending data -- confirm it is still
            # wanted.
            def save_in(fname=''):
                with open(fname, 'w') as stream:
                    stream.write(data)
            new_exc = AssertionError(u'invalid xml %s' % exc)
            new_exc.position = exc.position
            raise new_exc

    def preprocess_data(self, data):
        """hook called before parsing, returns `data` unchanged by default"""
        return data


class DTDValidator(Validator):
    """validator using an XML parser with DTD validation turned on
    (except on windows, where it is disabled)
    """

    def __init__(self):
        Validator.__init__(self)
        # XXX understand what's happening under windows
        validate = sys.platform != 'win32'
        self.parser = etree.XMLParser(dtd_validation=validate)

    def preprocess_data(self, data):
        """used to fix potential blockquote mess generated by docutils

        If `data` declares the strict doctype, it is parsed against the
        transitional one, every <blockquote> node is dropped and the
        resulting tree is serialized back under the strict doctype. Data
        which doesn't contain the strict doctype is returned untouched.
        """
        if STRICT_DOCTYPE not in data:
            return data
        # parse using transitional DTD
        data = data.replace(STRICT_DOCTYPE, TRANSITIONAL_DOCTYPE)
        tree = etree.fromstring(data, self.parser)
        namespace = tree.nsmap.get(None)
        if namespace:
            blockquotes = tree.findall('.//{%s}blockquote' % namespace)
        else:
            blockquotes = tree.findall('.//blockquote')
        # quick and dirty approach: remove all blockquotes, without checking
        # whether their children belong to the list of authorized child tags
        # for <blockquote> nodes (p h1 h2 h3 h4 h5 h6 div ul ol dl pre hr
        # blockquote address fieldset table form noscript ins del script)
        for blockquote in blockquotes:
            blockquote.getparent().remove(blockquote)
        data = etree.tostring(tree)
        return '<?xml version="1.0" encoding="UTF-8"?>%s\n%s' % (
            STRICT_DOCTYPE, data)


class SaxOnlyValidator(Validator):
    """validator using a plain `etree.XMLParser` (default options)"""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.XMLParser()


class HTMLValidator(Validator):
    """validator using lxml's `etree.HTMLParser`"""

    def __init__(self):
        Validator.__init__(self)
        self.parser = etree.HTMLParser()


class PageInfo(object):
    """holds various informations on the view's output"""

    def __init__(self, source, root):
        """
        :param source: the page's source, as given to the parser
        :param root: root element of the parsed tree
        """
        self.source = source
        self.etree = root
        # concatenation of every text node found in the tree
        self.raw_text = u''.join(root.xpath('//text()'))
        self.namespace = self.etree.nsmap
        self.default_ns = self.namespace.get(None)
        # <a> and <input> tags are stored as (text, attributes) couples,
        # titles as plain text (see `find_tag`)
        self.a_tags = self.find_tag('a')
        self.h1_tags = self.find_tag('h1')
        self.h2_tags = self.find_tag('h2')
        self.h3_tags = self.find_tag('h3')
        self.h4_tags = self.find_tag('h4')
        self.input_tags = self.find_tag('input')
        # indexed by level - 1, only h1 to h4 are collected
        self.title_tags = [self.h1_tags, self.h2_tags,
                           self.h3_tags, self.h4_tags]

    def find_tag(self, tag, gettext=True):
        """return a list which contains text of all "tag" elements

        If `gettext` is false, or if `tag` is 'a' or 'input', the list
        contains (element's own text, element's attributes) couples instead;
        the text is then None when the element has no leading text.
        """
        if self.default_ns is None:
            iterstr = ".//%s" % tag
        else:
            iterstr = ".//{%s}%s" % (self.default_ns, tag)
        if not gettext or tag in ('a', 'input'):
            return [(elt.text, elt.attrib)
                    for elt in self.etree.iterfind(iterstr)]
        return [u''.join(elt.xpath('.//text()'))
                for elt in self.etree.iterfind(iterstr)]

    def appears(self, text):
        """returns True if <text> appears in the page"""
        return text in self.raw_text

    def __contains__(self, text):
        """returns True if <text> appears in the page's source"""
        return text in self.source

    def has_title(self, text, level=None):
        """returns True if <h?>text</h?>

        :param level: the title's level (1 for h1, 2 for h2, etc.), all
          collected levels (h1 to h4) are searched when None
        """
        if level is None:
            for hlist in self.title_tags:
                if text in hlist:
                    return True
            return False
        return text in self.title_tags[level - 1]

    def has_title_regexp(self, pattern, level=None):
        """returns True if <h?>pattern</h?>

        `pattern` is matched from the beginning of the title (`re.match`).

        :param level: the title's level (1 for h1, 2 for h2, etc.), all
          collected levels (h1 to h4) are searched when None
        """
        sre = re.compile(pattern)
        if level is None:
            hlists = self.title_tags
        else:
            hlists = [self.title_tags[level - 1]]
        for hlist in hlists:
            for title in hlist:
                if sre.match(title):
                    return True
        return False

    def has_link(self, text, url=None):
        """returns True if <a href=url>text</a> was found in the page

        When `url` is None, the link's target isn't checked.
        """
        for link_text, attrs in self.a_tags:
            if text != link_text:
                continue
            # `url` isn't None in the second test, hence a link without
            # href can't match
            if url is None or attrs.get('href') == url:
                return True
        return False

    def has_link_regexp(self, pattern, url=None):
        """returns True if <a href=url>pattern</a> was found in the page

        `pattern` is matched from the beginning of the link's text
        (`re.match`). When `url` is None, the link's target isn't checked.
        """
        sre = re.compile(pattern)
        for link_text, attrs in self.a_tags:
            # the text is None for links without leading text (e.g.
            # <a><img/></a>): there is nothing to match against, and
            # giving None to `sre.match` would raise a TypeError
            if link_text is None or not sre.match(link_text):
                continue
            if url is None or attrs.get('href') == url:
                return True
        return False


VALMAP = {None: None, 'dtd': DTDValidator, 'xml': SaxOnlyValidator}