devtools/htmlparser.py
branchstable
changeset 7014 7e3e80f4179a
parent 6977 cb78108bf603
child 8695 358d8bed9626
equal deleted inserted replaced
7012:5ff6cb8bd2b3 7014:7e3e80f4179a
   125         self.h3_tags = self.find_tag('h3')
   125         self.h3_tags = self.find_tag('h3')
   126         self.h4_tags = self.find_tag('h4')
   126         self.h4_tags = self.find_tag('h4')
   127         self.input_tags = self.find_tag('input')
   127         self.input_tags = self.find_tag('input')
   128         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
   128         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
   129 
   129 
   130     def iterstr(self, tag):
   130     def _iterstr(self, tag):
   131         if self.default_ns is None:
   131         if self.default_ns is None:
   132             return ".//%s" % tag
   132             return ".//%s" % tag
   133         else:
   133         else:
   134             return ".//{%s}%s" % (self.default_ns, tag)
   134             return ".//{%s}%s" % (self.default_ns, tag)
   135 
   135 
   136     def find_tag(self, tag, gettext=True):
   136     def matching_nodes(self, tag, **attrs):
   137         """return a list which contains text of all "tag" elements """
   137         for elt in self.etree.iterfind(self._iterstr(tag)):
   138         iterstr = self.iterstr(tag)
       
   139         if not gettext or tag in ('a', 'input'):
       
   140             return [(elt.text, elt.attrib)
       
   141                     for elt in self.etree.iterfind(iterstr)]
       
   142         return [u''.join(elt.xpath('.//text()'))
       
   143                 for elt in self.etree.iterfind(iterstr)]
       
   144 
       
   145     def appears(self, text):
       
   146         """returns True if <text> appears in the page"""
       
   147         return text in self.raw_text
       
   148 
       
   149     def has_tag(self, tag, nboccurs=1, **attrs):
       
   150         """returns True if tag with given attributes appears in the page
       
   151         `nbtimes` (any if None)
       
   152         """
       
   153         for elt in self.etree.iterfind(self.iterstr(tag)):
       
   154             eltattrs  = elt.attrib
   138             eltattrs  = elt.attrib
   155             for attr, value in attrs.iteritems():
   139             for attr, value in attrs.iteritems():
   156                 try:
   140                 try:
   157                     if eltattrs[attr] != value:
   141                     if eltattrs[attr] != value:
   158                         break
   142                         break
   159                 except KeyError:
   143                 except KeyError:
   160                     break
   144                     break
   161             else: # all attributes match
   145             else: # all attributes match
   162                 if nboccurs is None: # no need to check number of occurences
   146                 yield elt
   163                     return True
   147 
   164                 if not nboccurs: # too much occurences
   148     def has_tag(self, tag, nboccurs=1, **attrs):
   165                     return False
   149         """returns True if tag with given attributes appears in the page
   166                 nboccurs -= 1
   150         `nbtimes` (any if None)
       
   151         """
       
   152         for elt in self.matching_nodes(tag, **attrs):
       
   153             if nboccurs is None: # no need to check number of occurences
       
   154                 return True
       
   155             if not nboccurs: # too much occurences
       
   156                 return False
       
   157             nboccurs -= 1
   167         if nboccurs == 0: # correct number of occurences
   158         if nboccurs == 0: # correct number of occurences
   168             return True
   159             return True
   169         return False # no matching tag/attrs
   160         return False # no matching tag/attrs
       
   161 
       
   162     def find_tag(self, tag, gettext=True):
       
   163         """return a list which contains text of all "tag" elements """
       
   164         iterstr = self._iterstr(tag)
       
   165         if not gettext or tag in ('a', 'input'):
       
   166             return [(elt.text, elt.attrib)
       
   167                     for elt in self.etree.iterfind(iterstr)]
       
   168         return [u''.join(elt.xpath('.//text()'))
       
   169                 for elt in self.etree.iterfind(iterstr)]
       
   170 
       
   171     def appears(self, text):
       
   172         """returns True if <text> appears in the page"""
       
   173         return text in self.raw_text
   170 
   174 
   171     def __contains__(self, text):
   175     def __contains__(self, text):
   172         return text in self.source
   176         return text in self.source
   173 
   177 
   174     def has_title(self, text, level=None):
   178     def has_title(self, text, level=None):