107 """replace <ref rql=''> links by <a href="...">""" |
107 """replace <ref rql=''> links by <a href="...">""" |
108 if not text: |
108 if not text: |
109 return u'' |
109 return u'' |
110 return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text) |
110 return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text) |
111 |
111 |
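# Illustration only (hypothetical RQL query, not part of this module): a text
# such as
#     see <ref rql='Any P WHERE P is Project'>the projects</ref>
# is rewritten by the function above into a link along the lines of
#     see <a href="...">the projects</a>
# with the actual href computed by _subst_rql from the given view.
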
# fallback implementation, nicer one defined below if lxml > 2.0 is available
def safe_cut(text, length):
    """returns a string of length <length> based on <text>, removing any html
    tags from given text if cut is necessary."""
    if text is None:
    return xml_escape(text_nohtml[:length] + u'...')

fallback_safe_cut = safe_cut
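
# Hypothetical illustration (the elided middle of safe_cut is assumed to strip
# markup before cutting, as its docstring says); exact escaping is xml_escape's:
#     safe_cut(u'<p>some long description</p>', 9)  ->  u'some long...'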

REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
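# For illustration, the regexp above only strips stray closing root tags, e.g.
#     REM_ROOT_HTML_TAGS.sub('', u'<p>hi</p></body></html>') == u'<p>hi</p>'
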
try:
    from lxml import etree, html
    from lxml.html import clean, defs

    ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
                    defs.phrase_tags | defs.font_style_tags |
                    set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
                    )

    CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
                            style=True, safe_attrs_only=True,
                            add_nofollow=False,
                            )

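    # Rough illustration of the cleaner's effect (output may vary with the lxml
    # version): disallowed tags and unsafe attributes are dropped, e.g.
    #     CLEANER.clean_html('<div><script>x()</script><p onclick="x()">hi</p></div>')
    # gives something like '<div><p>hi</p></div>'.
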
    def soup2xhtml(data, encoding):
        """tidy html soup by allowing some element tags and return the result

        Note: the function considers a string with no surrounding tag as valid
        if <div>`data`</div> can be parsed by an XML parser
        """
        # remove spurious </body> and </html> tags, then normalize line break
        # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
        data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
        xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
        # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
        # tried I got weird results (lxml 2.2.8)
        body = etree.tostring(xmltree[0], encoding=encoding)
        # remove <body> and </body> and decode to unicode
        snippet = body[6:-7].decode(encoding)
        # take care of bad xhtml (for instance starting with </div>) which
        # may mess with the <div> we added above. Only remove it if it's
        # still there...
        if snippet.startswith('<div>') and snippet.endswith('</div>'):
            snippet = snippet[5:-6]
        return snippet

        # lxml.Cleaner envelops text elements by internal logic (not accessible)
        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
        # TODO drop attributes in elements
        # TODO add policy configuration (content only, embedded content, ...)
        # XXX this is buggy for "<p>text1</p><p>text2</p>"...
        # XXX drop these two snippet-trimming actions and follow the lxml behaviour
        # XXX (tests need to be updated)
        # if snippet.startswith('<div>') and snippet.endswith('</div>'):
        #     snippet = snippet[5:-6]
        # if snippet.startswith('<p>') and snippet.endswith('</p>'):
        #     snippet = snippet[3:-4]
        # return snippet.decode(encoding)

except (ImportError, AttributeError):
    # gae environment: lxml not available
    # fallback implementation
    def soup2xhtml(data, encoding):
        # normalize line break
        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
        return u'\n'.join(data.splitlines())
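
    # Note: without lxml this fallback only normalizes line endings, e.g.
    # soup2xhtml(u'a\r\nb', 'utf-8') == u'a\nb'; no tag cleanup is attempted.
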
else:

    if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?

        def safe_cut(text, length):
            """returns an html document of length <length> based on <text>,
            cut if necessary.
            """