uilib.py
changeset 6689:b00f31b3b045
parent    6688:51ddb4842c56
child     7058:ea22892e82d4

--- a/uilib.py (6666:55a94beb521d)
+++ b/uilib.py (6689:b00f31b3b045)
   107     """replace <ref rql=''> links by <a href="...">"""
   107     """replace <ref rql=''> links by <a href="...">"""
   108     if not text:
   108     if not text:
   109         return u''
   109         return u''
   110     return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text)
   110     return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text)
   111 
   111 
   112 # fallback implementation, nicer one defined below if lxml is available
       
   113 def soup2xhtml(data, encoding):
       
   114     # normalize line break
       
   115     # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
       
   116     return u'\n'.join(data.splitlines())
       
   117 
       
   118 # fallback implementation, nicer one defined below if lxml> 2.0 is available
   112 # fallback implementation, nicer one defined below if lxml> 2.0 is available
   119 def safe_cut(text, length):
   113 def safe_cut(text, length):
   120     """returns a string of length <length> based on <text>, removing any html
   114     """returns a string of length <length> based on <text>, removing any html
   121     tags from given text if cut is necessary."""
   115     tags from given text if cut is necessary."""
   122     if text is None:
   116     if text is None:
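
The fallback soup2xhtml deleted above is not lost: the next hunk re-adds it
inside the new except branch. All it does is normalize line breaks as
suggested by RFC 2616 sec. 3.7.1. A minimal check of that behaviour, with
the input string invented for illustration:

    # CRLF and bare CR both collapse to LF; splitlines() handles all three
    data = u'a\r\nb\rc\nd'
    assert u'\n'.join(data.splitlines()) == u'a\nb\nc\nd'
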
@@ -130,42 +124,67 @@
     return xml_escape(text_nohtml[:length] + u'...')
 
 fallback_safe_cut = safe_cut
 
 REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
+
 try:
-    from lxml import etree
-except (ImportError, AttributeError):
-    # gae environment: lxml not available
-    pass
-else:
+    from lxml import etree, html
+    from lxml.html import clean, defs
+
+    ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
+                    defs.phrase_tags | defs.font_style_tags |
+                    set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
+                    )
+
+    CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
+                            style=True, safe_attrs_only=True,
+                            add_nofollow=False,
+                            )
 
     def soup2xhtml(data, encoding):
-        """tidy (at least try) html soup and return the result
-
-        Note: the function considers a string with no surrounding tag as valid
-              if <div>`data`</div> can be parsed by an XML parser
+        """tidy html soup by allowing some element tags and return the result
         """
         # remove spurious </body> and </html> tags, then normalize line break
         # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
         data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
-        # XXX lxml 1.1 support still needed ?
-        xmltree = etree.HTML('<div>%s</div>' % data)
-        # NOTE: lxml 1.1 (etch platforms) doesn't recognize
-        #       the encoding=unicode parameter (lxml 2.0 does), this is
-        #       why we specify an encoding and re-decode to unicode later
+        xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
+        # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
+        # tried I got weird results (lxml 2.2.8)
         body = etree.tostring(xmltree[0], encoding=encoding)
         # remove <body> and </body> and decode to unicode
         snippet = body[6:-7].decode(encoding)
         # take care to bad xhtml (for instance starting with </div>) which
         # may mess with the <div> we added below. Only remove it if it's
         # still there...
         if snippet.startswith('<div>') and snippet.endswith('</div>'):
             snippet = snippet[5:-6]
         return snippet
 
-    if hasattr(etree.HTML('<div>test</div>'), 'iter'):
+        # lxml.Cleaner envelops text elements by internal logic (not accessible)
+        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+        # TODO drop attributes in elements
+        # TODO add policy configuration (content only, embedded content, ...)
+        # XXX this is buggy for "<p>text1</p><p>text2</p>"...
+        # XXX drop these two snippets action and follow the lxml behaviour
+        # XXX (tests need to be updated)
+        # if snippet.startswith('<div>') and snippet.endswith('</div>'):
+        #     snippet = snippet[5:-6]
+        # if snippet.startswith('<p>') and snippet.endswith('</p>'):
+        #     snippet = snippet[3:-4]
+        return snippet.decode(encoding)
+
+except (ImportError, AttributeError):
+    # gae environment: lxml not available
+    # fallback implementation
+    def soup2xhtml(data, encoding):
+        # normalize line break
+        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+        return u'\n'.join(data.splitlines())
+else:
+
+    if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
 
         def safe_cut(text, length):
             """returns an html document of length <length> based on <text>,
             and cut is necessary.
             """
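
The heart of this hunk is the switch from a bare etree.HTML() parse to
lxml.html.clean.Cleaner, which whitelists tags and strips unsafe markup
before the snippet is re-parsed. A minimal sketch of what this Cleaner
configuration does, assuming lxml >= 2.0 with lxml.html.clean available
(recent lxml releases moved that module to the separate lxml_html_clean
package); the input and printed output are illustrative, not taken from the
changeset's tests:

    from lxml.html import clean, defs

    ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
                    defs.phrase_tags | defs.font_style_tags |
                    set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup')))

    # remove_unknown_tags=False is required when allow_tags is given;
    # style=True drops <style> blocks, safe_attrs_only=True drops things
    # like onclick handlers
    cleaner = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
                            style=True, safe_attrs_only=True, add_nofollow=False)

    print(cleaner.clean_html('<div><script>alert(1)</script>'
                             '<p onclick="evil()">hello</p></div>'))
    # expected output along the lines of: <div><p>hello</p></div>
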
@@ -339,10 +358,20 @@
     return res
 
 # traceback formatting ########################################################
 
 import traceback
+
+def exc_message(ex, encoding):
+    try:
+        return unicode(ex)
+    except:
+        try:
+            return unicode(str(ex), encoding, 'replace')
+        except:
+            return unicode(repr(ex), encoding, 'replace')
+
 
 def rest_traceback(info, exception):
     """return a ReST formated traceback"""
     res = [u'Traceback\n---------\n::\n']
     for stackentry in traceback.extract_tb(info[2]):
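
The new exc_message helper guards against exceptions whose message holds
non-ASCII bytes: on Python 2 (which this code targets, given the unicode()
calls), unicode(ex) raises UnicodeDecodeError for such messages. A hedged
usage sketch, with the exception value and encoding invented for
illustration:

    def exc_message(ex, encoding):  # as added by this changeset
        try:
            return unicode(ex)
        except:
            try:
                return unicode(str(ex), encoding, 'replace')
            except:
                return unicode(repr(ex), encoding, 'replace')

    try:
        raise ValueError('caf\xe9')  # latin-1 bytes in the message
    except ValueError as ex:
        # unicode(ex) chokes on the \xe9 byte, so the str() branch is used
        assert exc_message(ex, 'latin-1') == u'caf\xe9'
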