diff -r 6cebeb1f386a -r ad0eeb0f7a8d uilib.py
--- a/uilib.py	Thu Sep 29 14:07:37 2011 +0200
+++ b/uilib.py	Thu Sep 29 14:47:04 2011 +0200
@@ -161,94 +161,84 @@
 REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
 
-try:
-    from lxml import etree, html
-    from lxml.html import clean, defs
+from lxml import etree, html
+from lxml.html import clean, defs
 
-    ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
-                    defs.phrase_tags | defs.font_style_tags |
-                    set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
-                    )
+ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
+                defs.phrase_tags | defs.font_style_tags |
+                set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
+                )
 
-    CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
-                            style=True, safe_attrs_only=True,
-                            add_nofollow=False,
-                            )
+CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
+                        style=True, safe_attrs_only=True,
+                        add_nofollow=False,
+                        )
 
-    def soup2xhtml(data, encoding):
-        """tidy html soup by allowing some element tags and return the result
-        """
-        # remove spurious </html> and </body> tags, then normalize line break
-        # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
-        data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
-        xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
-        # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
-        # tried I got weird results (lxml 2.2.8)
-        body = etree.tostring(xmltree[0], encoding=encoding)
-        # remove <body> and </body> and decode to unicode
-        snippet = body[6:-7].decode(encoding)
-        # take care to bad xhtml (for instance starting with </div>) which
-        # may mess with the <div> we added below. Only remove it if it's
-        # still there...
-        if snippet.startswith('<div>') and snippet.endswith('</div>'):
-            snippet = snippet[5:-6]
-        return snippet
+def soup2xhtml(data, encoding):
+    """tidy html soup by allowing some element tags and return the result
+    """
+    # remove spurious </html> and </body> tags, then normalize line break
+    # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
+    data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
+    xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
+    # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
+    # tried I got weird results (lxml 2.2.8)
+    body = etree.tostring(xmltree[0], encoding=encoding)
+    # remove <body> and </body> and decode to unicode
+    snippet = body[6:-7].decode(encoding)
+    # take care to bad xhtml (for instance starting with </div>) which
+    # may mess with the <div> we added below. Only remove it if it's
+    # still there...
+    if snippet.startswith('<div>') and snippet.endswith('</div>'):
+        snippet = snippet[5:-6]
+    return snippet
 
-        # lxml.Cleaner envelops text elements by internal logic (not accessible)
-        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-        # TODO drop attributes in elements
-        # TODO add policy configuration (content only, embedded content, ...)
-        # XXX this is buggy for "<div>text1</div> <div>text2</div>"...
-        # XXX drop these two snippets action and follow the lxml behaviour
-        # XXX (tests need to be updated)
-        # if snippet.startswith('<div>') and snippet.endswith('</div>'):
-        #     snippet = snippet[5:-6]
-        # if snippet.startswith('<p>') and snippet.endswith('</p>'):
-        #     snippet = snippet[3:-4]
-        return snippet.decode(encoding)
-
-except (ImportError, AttributeError):
-    # gae environment: lxml not available
-    # fallback implementation
-    def soup2xhtml(data, encoding):
-        # normalize line break
-        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-        return u'\n'.join(data.splitlines())
-else:
-
-    if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
+    # lxml.Cleaner envelops text elements by internal logic (not accessible)
+    # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+    # TODO drop attributes in elements
+    # TODO add policy configuration (content only, embedded content, ...)
+    # XXX this is buggy for "<div>text1</div> <div>text2</div>"...
+    # XXX drop these two snippets action and follow the lxml behaviour
+    # XXX (tests need to be updated)
+    # if snippet.startswith('<div>') and snippet.endswith('</div>'):
+    #     snippet = snippet[5:-6]
+    # if snippet.startswith('<p>') and snippet.endswith('</p>'):
+    #     snippet = snippet[3:-4]
+    return snippet.decode(encoding)
 
-        def safe_cut(text, length):
-            """returns an html document of length <length> based on <text>,
-            and cut is necessary.
-            """
-            if text is None:
-                return u''
-            dom = etree.HTML(text)
-            curlength = 0
-            add_ellipsis = False
-            for element in dom.iter():
-                if curlength >= length:
-                    parent = element.getparent()
-                    parent.remove(element)
-                    if curlength == length and (element.text or element.tail):
-                        add_ellipsis = True
-                else:
-                    if element.text is not None:
-                        element.text = cut(element.text, length - curlength)
-                        curlength += len(element.text)
-                    if element.tail is not None:
-                        if curlength < length:
-                            element.tail = cut(element.tail, length - curlength)
-                            curlength += len(element.tail)
-                        elif curlength == length:
-                            element.tail = '...'
-                        else:
-                            element.tail = ''
-            text = etree.tounicode(dom[0])[6:-7] # remove wrapping <body></body>
-            if add_ellipsis:
-                return text + u'...'
-            return text
+if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
+    # pylint: disable=E0102
+    def safe_cut(text, length):
+        """returns an html document of length <length> based on <text>,
+        and cut is necessary.
+        """
+        if text is None:
+            return u''
+        dom = etree.HTML(text)
+        curlength = 0
+        add_ellipsis = False
+        for element in dom.iter():
+            if curlength >= length:
+                parent = element.getparent()
+                parent.remove(element)
+                if curlength == length and (element.text or element.tail):
+                    add_ellipsis = True
+            else:
+                if element.text is not None:
+                    element.text = cut(element.text, length - curlength)
+                    curlength += len(element.text)
+                if element.tail is not None:
+                    if curlength < length:
+                        element.tail = cut(element.tail, length - curlength)
+                        curlength += len(element.tail)
+                    elif curlength == length:
+                        element.tail = '...'
+                    else:
+                        element.tail = ''
+        text = etree.tounicode(dom[0])[6:-7] # remove wrapping <body></body>
+        if add_ellipsis:
+            return text + u'...'
+        return text
 
 def text_cut(text, nbwords=30, gotoperiod=True):
     """from the given plain text, return a text with at least <nbwords> words,
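
For reference, a minimal usage sketch of the two helpers this changeset touches. The `cubicweb.uilib` import path is an assumption for illustration, not something stated in the diff, and lxml must be installed, since the changeset drops the non-lxml (Google App Engine) fallback:

# Usage sketch only -- the import path below is assumed, not part of the diff.
from cubicweb.uilib import soup2xhtml, safe_cut

# Tidy an HTML soup fragment: tags outside ALLOWED_TAGS are dropped by
# CLEANER, stray </body>/</html> endings are removed, and the result comes
# back as a unicode snippet without the wrapping <div>.
snippet = soup2xhtml(u'<div>hello <blink>world</blink></div>', 'utf-8')

# Truncate an HTML fragment to roughly 10 characters of text content,
# keeping the markup well-formed and appending '...' when content is cut.
short = safe_cut(u'<p>a rather long paragraph of text</p>', 10)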