
fallback_safe_cut = safe_cut

REM_ROOT_HTML_TAGS = re.compile('</(body|html)>', re.U)

try:
    from lxml import etree, html
    from lxml.html import clean, defs

    ALLOWED_TAGS = (defs.general_block_tags | defs.list_tags | defs.table_tags |
                    defs.phrase_tags | defs.font_style_tags |
                    set(('span', 'a', 'br', 'img', 'map', 'area', 'sub', 'sup'))
                    )

    CLEANER = clean.Cleaner(allow_tags=ALLOWED_TAGS, remove_unknown_tags=False,
                            style=True, safe_attrs_only=True,
                            add_nofollow=False,
                            )
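
    # Illustrative sketch only (not a recorded doctest; the exact
    # serialization varies with the lxml version): with safe_attrs_only=True
    # and the allow_tags whitelist above, the cleaner is expected to strip
    # unsafe attributes such as onclick and disallowed elements such as
    # <script>:
    # >>> CLEANER.clean_html(u'<div><p onclick="evil()">hi</p></div>')
    # u'<div><p>hi</p></div>'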

    def soup2xhtml(data, encoding):
        """tidy html soup by allowing some element tags and return the result
        """
        # remove spurious </body> and </html> tags, then normalize line breaks
        # (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1)
        data = REM_ROOT_HTML_TAGS.sub('', u'\n'.join(data.splitlines()))
        xmltree = etree.HTML(CLEANER.clean_html('<div>%s</div>' % data))
        # NOTE: lxml 2.0 does support encoding='unicode', but last time I (syt)
        # tried I got weird results (lxml 2.2.8)
        body = etree.tostring(xmltree[0], encoding=encoding)
        # remove <body> and </body> and decode to unicode
        snippet = body[6:-7].decode(encoding)
        # take care of bad xhtml (for instance starting with </div>) which
        # may mess with the <div> we added above. Only remove it if it's
        # still there...
        if snippet.startswith('<div>') and snippet.endswith('</div>'):
            snippet = snippet[5:-6]
        return snippet

        # lxml.Cleaner envelops text elements by internal logic (not accessible)
        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
        # TODO drop attributes in elements
        # TODO add policy configuration (content only, embedded content, ...)
        # XXX this is buggy for "<p>text1</p><p>text2</p>"...
        # XXX drop these two snippet actions and follow the lxml behaviour
        # XXX (tests need to be updated)
        # if snippet.startswith('<div>') and snippet.endswith('</div>'):
        #     snippet = snippet[5:-6]
        # if snippet.startswith('<p>') and snippet.endswith('</p>'):
        #     snippet = snippet[3:-4]
        # return snippet.decode(encoding) # unreachable, and snippet is already unicode
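
    # Usage sketch (illustrative only, not a recorded doctest; the exact
    # markup returned depends on the lxml version): disallowed elements such
    # as <script> are dropped and the wrapping <div> added above is removed:
    # >>> soup2xhtml(u'<div><script>alert(1)</script><p>hello</p></div>', 'utf-8')
    # u'<div><p>hello</p></div>'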

except (ImportError, AttributeError):
    # gae environment: lxml not available
    # fallback implementation
    def soup2xhtml(data, encoding):
        # normalize line break
        # see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
        return u'\n'.join(data.splitlines())
else:

    if hasattr(etree.HTML('<div>test</div>'), 'iter'): # XXX still necessary?
        # pylint: disable=E0102
        def safe_cut(text, length):
            """returns an html document of length <length> based on <text>,
            cut if necessary.
            """
            if text is None:
                return u''
            dom = etree.HTML(text)
            curlength = 0
            add_ellipsis = False
            for element in dom.iter():
                if curlength >= length:
                    parent = element.getparent()
                    parent.remove(element)
                    if curlength == length and (element.text or element.tail):
                        add_ellipsis = True
                else:
                    if element.text is not None:
                        element.text = cut(element.text, length - curlength)
                        curlength += len(element.text)
                    if element.tail is not None:
                        if curlength < length:
                            element.tail = cut(element.tail, length - curlength)
                            curlength += len(element.tail)
                        elif curlength == length:
                            element.tail = '...'
                        else:
                            element.tail = ''
            text = etree.tounicode(dom[0])[6:-7] # remove wrapping <body></body>
            if add_ellipsis:
                return text + u'...'
            return text
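
        # Usage sketch (illustrative only; this assumes the module-level
        # cut() helper, defined earlier in this file, truncates at a word
        # boundary and appends an ellipsis):
        # >>> safe_cut(u'<p>hello world</p>', 5)
        # u'<p>hello...</p>'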


def text_cut(text, nbwords=30, gotoperiod=True):
    """from the given plain text, return a text with at least <nbwords> words,
    trying to go to the end of the current sentence.
