devtools/htmlparser.py
changeset 8973 6711f78c18be
parent 8972 771337c3a754
child 8977 57e564c0118e
equal deleted inserted replaced
8972:771337c3a754 8973:6711f78c18be
    30 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
    30 TRANSITIONAL_DOCTYPE = str(TRANSITIONAL_DOCTYPE)
    31 
    31 
    32 ERR_COUNT = 0
    32 ERR_COUNT = 0
    33 
    33 
    34 class Validator(object):
    34 class Validator(object):
    35 
    35     parser = None
    36     def parse_string(self, data, sysid=None):
    36 
       
    37     def parse_string(self, source):
       
    38         etree = self._parse(self.preprocess_data(source))
       
    39         return PageInfo(source, etree)
       
    40 
       
    41     def preprocess_data(self, data):
       
    42         return data
       
    43 
       
    44     def _parse(self, pdata):
    37         try:
    45         try:
    38             return PageInfo(self, data)
    46             return etree.fromstring(pdata, self.parser)
    39         except etree.XMLSyntaxError as exc:
    47         except etree.XMLSyntaxError as exc:
    40             def save_in(fname=''):
    48             def save_in(fname=''):
    41                 file(fname, 'w').write(data)
    49                 file(fname, 'w').write(data)
    42             new_exc = AssertionError(u'invalid document: %s' % exc)
    50             new_exc = AssertionError(u'invalid document: %s' % exc)
    43             new_exc.position = exc.position
    51             new_exc.position = exc.position
    44             raise new_exc
    52             raise new_exc
    45 
       
    46     def preprocess_data(self, data):
       
    47         return data
       
    48 
    53 
    49 
    54 
    50 class DTDValidator(Validator):
    55 class DTDValidator(Validator):
    51     def __init__(self):
    56     def __init__(self):
    52         Validator.__init__(self)
    57         Validator.__init__(self)
   153         return REM_SCRIPT_RGX.sub('', data)
   158         return REM_SCRIPT_RGX.sub('', data)
   154 
   159 
   155 
   160 
   156 class PageInfo(object):
   161 class PageInfo(object):
   157     """holds various informations on the view's output"""
   162     """holds various informations on the view's output"""
   158     def __init__(self, validator, source):
   163     def __init__(self, source, root):
   159         self.source = source
   164         self.source = source
   160         root = etree.fromstring(validator.preprocess_data(source), validator.parser)
       
   161         self.etree = root
   165         self.etree = root
   162         self.source = source
       
   163         self.raw_text = u''.join(root.xpath('//text()'))
   166         self.raw_text = u''.join(root.xpath('//text()'))
   164         self.namespace = self.etree.nsmap
   167         self.namespace = self.etree.nsmap
   165         self.default_ns = self.namespace.get(None)
   168         self.default_ns = self.namespace.get(None)
   166         self.a_tags = self.find_tag('a')
   169         self.a_tags = self.find_tag('a')
   167         self.h1_tags = self.find_tag('h1')
   170         self.h1_tags = self.find_tag('h1')