devtools/htmlparser.py
branch stable
changeset 6977 cb78108bf603
parent 6772 68bb0943d192
child 7014 7e3e80f4179a
equal deleted inserted replaced
6976:1e0cd8cfa987 6977:cb78108bf603
   125         self.h3_tags = self.find_tag('h3')
   125         self.h3_tags = self.find_tag('h3')
   126         self.h4_tags = self.find_tag('h4')
   126         self.h4_tags = self.find_tag('h4')
   127         self.input_tags = self.find_tag('input')
   127         self.input_tags = self.find_tag('input')
   128         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
   128         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
   129 
   129 
       
   130     def iterstr(self, tag):
       
   131         if self.default_ns is None:
       
   132             return ".//%s" % tag
       
   133         else:
       
   134             return ".//{%s}%s" % (self.default_ns, tag)
       
   135 
   130     def find_tag(self, tag, gettext=True):
   136     def find_tag(self, tag, gettext=True):
   131         """return a list which contains text of all "tag" elements """
   137         """return a list which contains text of all "tag" elements """
   132         if self.default_ns is None:
   138         iterstr = self.iterstr(tag)
   133             iterstr = ".//%s" % tag
       
   134         else:
       
   135             iterstr = ".//{%s}%s" % (self.default_ns, tag)
       
   136         if not gettext or tag in ('a', 'input'):
   139         if not gettext or tag in ('a', 'input'):
   137             return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
   140             return [(elt.text, elt.attrib)
   138         return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]
   141                     for elt in self.etree.iterfind(iterstr)]
       
   142         return [u''.join(elt.xpath('.//text()'))
       
   143                 for elt in self.etree.iterfind(iterstr)]
   139 
   144 
   140     def appears(self, text):
   145     def appears(self, text):
   141         """returns True if <text> appears in the page"""
   146         """returns True if <text> appears in the page"""
   142         return text in self.raw_text
   147         return text in self.raw_text
       
   148 
       
   149     def has_tag(self, tag, nboccurs=1, **attrs):
       
   150         """returns True if tag with given attributes appears in the page
       
   151         `nbtimes` (any if None)
       
   152         """
       
   153         for elt in self.etree.iterfind(self.iterstr(tag)):
       
   154             eltattrs  = elt.attrib
       
   155             for attr, value in attrs.iteritems():
       
   156                 try:
       
   157                     if eltattrs[attr] != value:
       
   158                         break
       
   159                 except KeyError:
       
   160                     break
       
   161             else: # all attributes match
       
   162                 if nboccurs is None: # no need to check number of occurences
       
   163                     return True
       
   164                 if not nboccurs: # too much occurences
       
   165                     return False
       
   166                 nboccurs -= 1
       
   167         if nboccurs == 0: # correct number of occurences
       
   168             return True
       
   169         return False # no matching tag/attrs
   143 
   170 
   144     def __contains__(self, text):
   171     def __contains__(self, text):
   145         return text in self.source
   172         return text in self.source
   146 
   173 
   147     def has_title(self, text, level=None):
   174     def has_title(self, text, level=None):