
fallback_safe_cut = safe_cut

REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)

try:
    from lxml import etree, html
    from lxml.html import clean, defs

    ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
                    defs.phrase_tags | defs.font_style_tags |
                    set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
                    )

    CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
                            style=True, safe_attrs_only=True,
                            add_nofollow=False,
                            )
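
    # Illustrative sketch only (not a recorded doctest; the exact
    # serialization varies with the lxml version): with safe_attrs_only=True
    # and the allow_tags whitelist above, the cleaner is expected to strip
    # unsafe attributes such as onclick and disallowed elements such as
    # <script>:
    # >>> CLEANER.clean_html(u'<div><p onclick="evil()">hi</p></div>')
    # u'<div><p>hi</p></div>'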

    def soup2xhtml(data, encoding):
        """tidy html soup by allowing some element tags and return the result
        """
        # remove spurious </body> and </html> tags, then normalize line breaks
        # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
        data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
        xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
        # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
        # tried I got weird results (lxml 2.2.8)
        body = etree.tostring(xmltree[0], encoding=encoding)
        # remove <body> and </body> and decode to unicode
        snippet = body[6:-7].decode(encoding)
        # take care of bad xhtml (for instance starting with </div>) which
        # may mess with the <div> we added above. Only remove it if it's
        # still there...
        if snippet.startswith('<div>') and snippet.endswith('</div>'):
            snippet = snippet[5:-6]
        return snippet

        # lxml.Cleaner envelops text elements by internal logic (not accessible)
        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
        # TODO drop attributes in elements
        # TODO add policy configuration (content only, embedded content, ...)
        # XXX this is buggy for "<p>text1</p><p>text2</p>"...
        # XXX drop these two snippet actions and follow the lxml behaviour
        # XXX (tests need to be updated)
        # if snippet.startswith('<div>') and snippet.endswith('</div>'):
        #     snippet = snippet[5:-6]
        # if snippet.startswith('<p>') and snippet.endswith('</p>'):
        #     snippet = snippet[3:-4]
        # return snippet.decode(encoding) # unreachable, and snippet is already unicode
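
    # Usage sketch (illustrative only, not a recorded doctest; the exact
    # markup returned depends on the lxml version): disallowed elements such
    # as <script> are dropped and the wrapping <div> added above is removed:
    # >>> soup2xhtml(u'<div><script>alert(1)</script><p>hello</p></div>', 'utf-8')
    # u'<div><p>hello</p></div>'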

except (ImportError, AttributeError):
    # gae environment: lxml not available
    # fallback implementation
    def soup2xhtml(data, encoding):
        # normalize line break
        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
        return u'\n'.join(data.splitlines())
else:

    if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
        # pylint: disable=E0102
        def safe_cut(text, length):
            """returns an html document of length <length> based on <text>,
            cut if necessary.
            """
            if text is None:
                return u''
            dom = etree.HTML(text)
            curlength = 0
            add_ellipsis = False
            for element in dom.iter():
                if curlength >= length:
                    parent = element.getparent()
                    parent.remove(element)
                    if curlength == length and (element.text or element.tail):
                        add_ellipsis = True
                else:
                    if element.text is not None:
                        element.text = cut(element.text, length - curlength)
                        curlength += len(element.text)
                    if element.tail is not None:
                        if curlength < length:
                            element.tail = cut(element.tail, length - curlength)
                            curlength += len(element.tail)
                        elif curlength == length:
                            element.tail = '...'
                        else:
                            element.tail = ''
            text = etree.tounicode(dom[0])[6:-7] # remove wrapping <body></body>
            if add_ellipsis:
                return text + u'...'
            return text
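
        # Usage sketch (illustrative only; this assumes the module-level
        # cut() helper, defined earlier in this file, truncates at a word
        # boundary and appends an ellipsis):
        # >>> safe_cut(u'<p>hello world</p>', 5)
        # u'<p>hello...</p>'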


def text_cut(text, nbwords=30, gotoperiod=True):
    """from the given plain text, return a text with at least <nbwords> words,
    trying to go to the end of the current sentence.
