107 """replace <ref rql=''> links by <a href="...">""" |
107 """replace <ref rql=''> links by <a href="...">""" |
108 if not text: |
108 if not text: |
109 return u'' |
109 return u'' |
110 return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text) |
110 return REF_PROG.sub(lambda obj, view=view:_subst_rql(view, obj), text) |
111 |
111 |
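# Illustration only (hypothetical RQL query, not part of this module): a text
# such as
#     see <ref rql='Any P WHERE P is Project'>the projects</ref>
# is rewritten by the function above into a link along the lines of
#     see <a href="...">the projects</a>
# with the actual href computed by _subst_rql from the given view.
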
# fallback implementation, nicer one defined below if lxml > 2.0 is available
def safe_cut(text, length):
    """returns a string of length <length> based on <text>, removing any html
    tags from given text if cut is necessary."""
    if text is None:
    return xml_escape(text_nohtml[:length] + u'...')

fallback_safe_cut = safe_cut
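
# Hypothetical illustration (the elided middle of safe_cut is assumed to strip
# markup before cutting, as its docstring says); exact escaping is xml_escape's:
#     safe_cut(u'<p>some long description</p>', 9)  ->  u'some long...'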

REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)
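# For illustration, the regexp above only strips stray closing root tags, e.g.
#     REM_ROOT_HTML_TAGS.sub('', u'<p>hi</p></body></html>') == u'<p>hi</p>'
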
try:
    from lxml import etree, html
    from lxml.html import clean, defs

    ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
                    defs.phrase_tags | defs.font_style_tags |
                    set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
                    )

    CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
                            style=True, safe_attrs_only=True,
                            add_nofollow=False,
                            )

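    # Rough illustration of the cleaner's effect (output may vary with the lxml
    # version): disallowed tags and unsafe attributes are dropped, e.g.
    #     CLEANER.clean_html('<div><script>x()</script><p onclick="x()">hi</p></div>')
    # gives something like '<div><p>hi</p></div>'.
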
    def soup2xhtml(data, encoding):
        """tidy html soup by allowing some element tags and return the result

        Note: the function considers a string with no surrounding tag as valid
        if <div>`data`</div> can be parsed by an XML parser
        """
        # remove spurious </body> and </html> tags, then normalize line break
        # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
        data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
        xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
        # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
        # tried I got weird results (lxml 2.2.8)
        body = etree.tostring(xmltree[0], encoding=encoding)
        # remove <body> and </body> and decode to unicode
        snippet = body[6:-7].decode(encoding)
        # take care of bad xhtml (for instance starting with </div>) which
        # may mess with the <div> we added above. Only remove it if it's
        # still there...
        if snippet.startswith('<div>') and snippet.endswith('</div>'):
            snippet = snippet[5:-6]
        return snippet

        # lxml.Cleaner envelops text elements by internal logic (not accessible)
        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
        # TODO drop attributes in elements
        # TODO add policy configuration (content only, embedded content, ...)
        # XXX this is buggy for "<p>text1</p><p>text2</p>"...
        # XXX drop these two snippet-trimming actions and follow the lxml behaviour
        # XXX (tests need to be updated)
        # if snippet.startswith('<div>') and snippet.endswith('</div>'):
        #     snippet = snippet[5:-6]
        # if snippet.startswith('<p>') and snippet.endswith('</p>'):
        #     snippet = snippet[3:-4]
        # return snippet.decode(encoding)

except (ImportError, AttributeError):
    # gae environment: lxml not available
    # fallback implementation
    def soup2xhtml(data, encoding):
        # normalize line break
        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
        return u'\n'.join(data.splitlines())
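
    # Note: without lxml this fallback only normalizes line endings, e.g.
    # soup2xhtml(u'a\r\nb', 'utf-8') == u'a\nb'; no tag cleanup is attempted.
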
else:

    if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?

        def safe_cut(text, length):
            """returns an html document of length <length> based on <text>,
            cut if necessary.
            """