devtools/htmlparser.py
branch stable
changeset 1945 2b59d9ae17ae
parent 1485 4d532f3c012e
child 1977 606923dff11b
equal deleted inserted replaced
1944:a1b1d4f8482c 1945:2b59d9ae17ae
    86         self.h3_tags = self.find_tag('h3')
    86         self.h3_tags = self.find_tag('h3')
    87         self.h4_tags = self.find_tag('h4')
    87         self.h4_tags = self.find_tag('h4')
    88         self.input_tags = self.find_tag('input')
    88         self.input_tags = self.find_tag('input')
    89         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
    89         self.title_tags = [self.h1_tags, self.h2_tags, self.h3_tags, self.h4_tags]
    90 
    90 
    91     def find_tag(self, tag):
    91     def find_tag(self, tag, gettext=True):
    92         """return a list which contains text of all "tag" elements """
    92         """return a list which contains text of all "tag" elements """
    93         if self.default_ns is None:
    93         if self.default_ns is None:
    94             iterstr = ".//%s" % tag
    94             iterstr = ".//%s" % tag
    95         else:
    95         else:
    96             iterstr = ".//{%s}%s" % (self.default_ns, tag)
    96             iterstr = ".//{%s}%s" % (self.default_ns, tag)
    97         if tag in ('a', 'input'):
    97         if not gettext or tag in ('a', 'input'):
    98             return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
    98             return [(elt.text, elt.attrib) for elt in self.etree.iterfind(iterstr)]
    99         return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]
    99         return [u''.join(elt.xpath('.//text()')) for elt in self.etree.iterfind(iterstr)]
   100 
   100 
   101     def appears(self, text):
   101     def appears(self, text):
   102         """returns True if <text> appears in the page"""
   102         """returns True if <text> appears in the page"""