uilib.py
changeset 7894 ad0eeb0f7a8d
parent 7879 9aae456abab5
child 7913 d0c6a7993cec
equal deleted inserted replaced
7889:6cebeb1f386a 7894:ad0eeb0f7a8d
   159 
   159 
   160 fallback_safe_cut = safe_cut
   160 fallback_safe_cut = safe_cut
   161 
   161 
   162 REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
   162 REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
   163 
   163 
   164 try:
   164 from lxml import etree, html
   165     from lxml import etree, html
   165 from lxml.html import clean, defs
   166     from lxml.html import clean, defs
   166 
   167 
   167 ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
   168     ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
   168                 defs.phrase_tags | defs.font_style_tags |
   169                     defs.phrase_tags | defs.font_style_tags |
   169                 set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
   170                     set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
   170                 )
   171                     )
   171 
   172 
   172 CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
   173     CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
   173                         style=True, safe_attrs_only=True,
   174                             style=True, safe_attrs_only=True,
   174                         add_nofollow=False,
   175                             add_nofollow=False,
   175                         )
   176                             )
   176 
   177 
   177 def soup2xhtml(data, encoding):
   178     def soup2xhtml(data, encoding):
   178     """tidy html soup by allowing some element tags and return the result
   179         """tidy html soup by allowing some element tags and return the result
   179     """
       
   180     # remove spurious </body> and </html> tags, then normalize line break
       
   181     # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
       
   182     data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
       
   183     xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
       
   184     # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
       
   185     # tried I got weird results (lxml 2.2.8)
       
   186     body = etree.tostring(xmltree[0], encoding=encoding)
       
   187     # remove <body> and </body> and decode to unicode
       
   188     snippet = body[6:-7].decode(encoding)
       
   189     # take care of bad xhtml (for instance starting with </div>) which
       
   190     # may mess with the <div> we added above. Only remove it if it's
       
   191     # still there...
       
   192     if snippet.startswith('<div>') and snippet.endswith('</div>'):
       
   193         snippet = snippet[5:-6]
       
   194     return snippet
       
   195 
       
   196     # lxml.Cleaner envelops text elements by internal logic (not accessible)
       
   197     # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
       
   198     # TODO drop attributes in elements
       
   199     # TODO add policy configuration (content only, embedded content, ...)
       
   200     # XXX this is buggy for "<p>text1</p><p>text2</p>"...
       
   201     # XXX drop these two snippets action and follow the lxml behaviour
       
   202     # XXX (tests need to be updated)
       
   203     # if snippet.startswith('<div>') and snippet.endswith('</div>'):
       
   204     #     snippet = snippet[5:-6]
       
   205     # if snippet.startswith('<p>') and snippet.endswith('</p>'):
       
   206     #     snippet = snippet[3:-4]
       
   207     return snippet.decode(encoding)
       
   208 
       
   209 if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
       
   210     # pylint: disable=E0102
       
   211     def safe_cut(text, length):
       
   212         """returns an html document of length <length> based on <text>,
       
   213         and cut if necessary.
   180         """
   214         """
   181         # remove spurious </body> and </html> tags, then normalize line break
   215         if text is None:
   182         # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
   216             return u''
   183         data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
   217         dom = etree.HTML(text)
   184         xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
   218         curlength = 0
   185         # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
   219         add_ellipsis = False
   186         # tried I got weird results (lxml 2.2.8)
   220         for element in dom.iter():
   187         body = etree.tostring(xmltree[0], encoding=encoding)
   221             if curlength >= length:
   188         # remove <body> and </body> and decode to unicode
   222                 parent = element.getparent()
   189         snippet = body[6:-7].decode(encoding)
   223                 parent.remove(element)
   190         # take care of bad xhtml (for instance starting with </div>) which
   224                 if curlength == length and (element.text or element.tail):
   191         # may mess with the <div> we added above. Only remove it if it's
   225                     add_ellipsis = True
   192         # still there...
   226             else:
   193         if snippet.startswith('<div>') and snippet.endswith('</div>'):
   227                 if element.text is not None:
   194             snippet = snippet[5:-6]
   228                     element.text = cut(element.text, length - curlength)
   195         return snippet
   229                     curlength += len(element.text)
   196 
   230                 if element.tail is not None:
   197         # lxml.Cleaner envelops text elements by internal logic (not accessible)
   231                     if curlength < length:
   198         # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
   232                         element.tail = cut(element.tail, length - curlength)
   199         # TODO drop attributes in elements
   233                         curlength += len(element.tail)
   200         # TODO add policy configuration (content only, embedded content, ...)
   234                     elif curlength == length:
   201         # XXX this is buggy for "<p>text1</p><p>text2</p>"...
   235                         element.tail = '...'
   202         # XXX drop these two snippets action and follow the lxml behaviour
   236                     else:
   203         # XXX (tests need to be updated)
   237                         element.tail = ''
   204         # if snippet.startswith('<div>') and snippet.endswith('</div>'):
   238         text = etree.tounicode(dom[0])[6:-7] # remove wrapping <body></body>
   205         #     snippet = snippet[5:-6]
   239         if add_ellipsis:
   206         # if snippet.startswith('<p>') and snippet.endswith('</p>'):
   240             return text + u'...'
   207         #     snippet = snippet[3:-4]
   241         return text
   208         return snippet.decode(encoding)
       
   209 
       
   210 except (ImportError, AttributeError):
       
   211     # gae environment: lxml not available
       
   212     # fallback implementation
       
   213     def soup2xhtml(data, encoding):
       
   214         # normalize line break
       
   215         # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
       
   216         return u'\n'.join(data.splitlines())
       
   217 else:
       
   218 
       
   219     if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
       
   220 
       
   221         def safe_cut(text, length):
       
   222             """returns an html document of length <length> based on <text>,
       
   223             and cut if necessary.
       
   224             """
       
   225             if text is None:
       
   226                 return u''
       
   227             dom = etree.HTML(text)
       
   228             curlength = 0
       
   229             add_ellipsis = False
       
   230             for element in dom.iter():
       
   231                 if curlength >= length:
       
   232                     parent = element.getparent()
       
   233                     parent.remove(element)
       
   234                     if curlength == length and (element.text or element.tail):
       
   235                         add_ellipsis = True
       
   236                 else:
       
   237                     if element.text is not None:
       
   238                         element.text = cut(element.text, length - curlength)
       
   239                         curlength += len(element.text)
       
   240                     if element.tail is not None:
       
   241                         if curlength < length:
       
   242                             element.tail = cut(element.tail, length - curlength)
       
   243                             curlength += len(element.tail)
       
   244                         elif curlength == length:
       
   245                             element.tail = '...'
       
   246                         else:
       
   247                             element.tail = ''
       
   248             text = etree.tounicode(dom[0])[6:-7] # remove wrapping <body></body>
       
   249             if add_ellipsis:
       
   250                 return text + u'...'
       
   251             return text
       
   252 
   242 
   253 def text_cut(text, nbwords=30, gotoperiod=True):
   243 def text_cut(text, nbwords=30, gotoperiod=True):
   254     """from the given plain text, return a text with at least <nbwords> words,
   244     """from the given plain text, return a text with at least <nbwords> words,
   255     trying to go to the end of the current sentence.
   245     trying to go to the end of the current sentence.
   256 
   246