diff -r b8bd0ecced2e -r eeedb3575d25 uilib.py --- a/uilib.py Fri Nov 05 14:28:22 2010 +0100 +++ b/uilib.py Fri Oct 22 16:14:48 2010 +0200 @@ -109,12 +109,6 @@ return u'' return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text) -# fallback implementation, nicer one defined below if lxml is available -def soup2xhtml(data, encoding): - # normalize line break - # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1 - return u'\n'.join(data.splitlines()) - # fallback implementation, nicer one defined below if lxml> 2.0 is available def safe_cut(text, length): """returns a string of length based on , removing any html @@ -132,24 +126,29 @@ fallback_safe_cut = safe_cut REM_ROOT_HTML_TAGS = re.compile('', re.U) + try: - from lxml import etree -except (ImportError, AttributeError): - # gae environment: lxml not available - pass -else: + from lxml import etree, html + from lxml.html import clean, defs + + ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags | + defs.phrase_tags | defs.font_style_tags | + set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup')) + ) + + CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False, + style=True, safe_attrs_only=True, + add_nofollow=False, + ) def soup2xhtml(data, encoding): - """tidy (at least try) html soup and return the result - - Note: the function considers a string with no surrounding tag as valid - if
<div>`data`</div>
can be parsed by an XML parser + """tidy html soup by allowing some element tags and return the result """ # remove spurious and tags, then normalize line break # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1) data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines())) # XXX lxml 1.1 support still needed ? - xmltree = etree.HTML('
<div>%s</div>
' % data) + xmltree = etree.HTML(CLEANER.clean_html('
<div>%s</div>
' % data)) # NOTE: lxml 1.1 (etch platforms) doesn't recognize # the encoding=unicode parameter (lxml 2.0 does), this is # why we specify an encoding and re-decode to unicode later @@ -163,7 +162,29 @@ snippet = snippet[5:-6] return snippet - if hasattr(etree.HTML('
<div>test</div>
'), 'iter'): + # lxml.Cleaner envelops text elements by internal logic (not accessible) + # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1 + # TODO drop attributes in elements + # TODO add policy configuration (content only, embedded content, ...) + # XXX this is buggy for "
<p>text1</p><p>text2</p>
"... + # XXX drop these two snippets action and follow the lxml behaviour + # XXX (tests need to be updated) + # if snippet.startswith('
<div>') and snippet.endswith('</div>
'): + # snippet = snippet[5:-6] + # if snippet.startswith('
<p>') and snippet.endswith('</p>
'): + # snippet = snippet[3:-4] + return snippet.decode(encoding) + +except (ImportError, AttributeError): + # gae environment: lxml not available + # fallback implementation + def soup2xhtml(data, encoding): + # normalize line break + # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1 + return u'\n'.join(data.splitlines()) +else: + + if hasattr(etree.HTML('
<div>test</div>
'), 'iter'): # XXX still necessary? def safe_cut(text, length): """returns an html document of length based on ,