--- a/uilib.py Fri Nov 05 14:28:22 2010 +0100
+++ b/uilib.py Fri Oct 22 16:14:48 2010 +0200
@@ -109,12 +109,6 @@
return u''
return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text)
-# fallback implementation, nicer one defined below if lxml is available
-def soup2xhtml(data, encoding):
- # normalize line break
- # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
- return u'\n'.join(data.splitlines())
-
# fallback implementation, nicer one defined below if lxml> 2.0 is available
def safe_cut(text, length):
"""returns a string of length <length> based on <text>, removing any html
@@ -132,24 +126,29 @@
fallback_safe_cut = safe_cut
REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
+
try:
- from lxml import etree
-except (ImportError, AttributeError):
- # gae environment: lxml not available
- pass
-else:
+ from lxml import etree, html
+ from lxml.html import clean, defs
+
+ ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
+ defs.phrase_tags | defs.font_style_tags |
+ set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
+ )
+
+ CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
+ style=True, safe_attrs_only=True,
+ add_nofollow=False,
+ )
def soup2xhtml(data, encoding):
- """tidy (at least try) html soup and return the result
-
- Note: the function considers a string with no surrounding tag as valid
- if <div>`data`</div> can be parsed by an XML parser
+ """tidy html soup by allowing some element tags and return the result
"""
# remove spurious </body> and </html> tags, then normalize line break
# (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
# XXX lxml 1.1 support still needed ?
- xmltree = etree.HTML('<div>%s</div>' % data)
+ xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
# NOTE: lxml 1.1 (etch platforms) doesn't recognize
# the encoding=unicode parameter (lxml 2.0 does), this is
# why we specify an encoding and re-decode to unicode later
@@ -163,7 +162,29 @@
snippet = snippet[5:-6]
return snippet
- if hasattr(etree.HTML('<div>test</div>'), 'iter'):
+    # lxml.Cleaner wraps bare text in wrapper elements via internal logic (not externally accessible)
+ # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+ # TODO drop attributes in elements
+ # TODO add policy configuration (content only, embedded content, ...)
+ # XXX this is buggy for "<p>text1</p><p>text2</p>"...
+    # XXX drop these two snippet-trimming actions and follow the lxml behaviour
+ # XXX (tests need to be updated)
+ # if snippet.startswith('<div>') and snippet.endswith('</div>'):
+ # snippet = snippet[5:-6]
+ # if snippet.startswith('<p>') and snippet.endswith('</p>'):
+ # snippet = snippet[3:-4]
+ return snippet.decode(encoding)
+
+except (ImportError, AttributeError):
+ # gae environment: lxml not available
+ # fallback implementation
+ def soup2xhtml(data, encoding):
+ # normalize line break
+ # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+ return u'\n'.join(data.splitlines())
+else:
+
+ if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
def safe_cut(text, length):
"""returns an html document of length <length> based on <text>,