cleanup/fix cut variants
author Sylvain Thenault <sylvain.thenault@logilab.fr>
Wed, 07 Jan 2009 18:30:00 +0100
changeset 350 f34ef2c64605
parent 348 ebe40a8c7cc9
child 357 e1ba696130da
cleanup/fix cut variants
common/test/unittest_uilib.py
common/uilib.py
--- a/common/test/unittest_uilib.py	Wed Jan 07 16:12:45 2009 +0100
+++ b/common/test/unittest_uilib.py	Wed Jan 07 18:30:00 2009 +0100
@@ -22,112 +22,54 @@
             got = uilib.remove_html_tags(text)
             self.assertEquals(got, expected)
        
-    def test_safe_cut(self):
-        """ tests uilib.safe_cut() behaviour with very long text"""
+    def test_fallback_safe_cut(self):
+        self.assertEquals(uilib.fallback_safe_cut(u'ab <a href="hello">cd</a>', 4), u'ab c...')
+        self.assertEquals(uilib.fallback_safe_cut(u'ab <a href="hello">cd</a>', 5), u'ab <a href="hello">cd</a>')
+        self.assertEquals(uilib.fallback_safe_cut(u'ab <a href="hello">&amp;d</a>', 4), u'ab &amp;...')
+        self.assertEquals(uilib.fallback_safe_cut(u'ab <a href="hello">&amp;d</a> ef', 5), u'ab &amp;d...')
+        self.assertEquals(uilib.fallback_safe_cut(u'&amp; <a href="hello">&amp;d</a> ef', 4), u'&amp; &amp;d...')
         
-        data = [
-            ('opkolk', '<div><p>opkolk</p></div>'),
-            ("""<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
- tempor incididunt <strong>ut</strong> labore et dolore magna aliqua. Ut enim ad minim veniam,
- quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
- consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
- cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
- proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
- Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
- tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
- quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
- consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
- cillum dolore eu fugiat nulla pariatur.</p> ""","""<div><p>Lorem ipsum dolor sit amet, consectetur</p></div>"""),
-            ("""<p>empor incididunt utlabore et dolore magna aliqua. Ut enim ad minim veniam,
-quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
-consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
-cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
-proident, sunt in culpa qui officia d</p>""","""<div><p>empor incididunt utlabore et dolore magna aliqua.</p></div>"""),
-            ("""empor <strong>incididunt</strong> utlabore et dolore magna aliqua. Ut enim ad minim veniam,
-quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
-consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
-cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
-proident, sunt in culpa qui officia""","""<div><p>empor <strong>incididunt</strong> utlabore et dolore magna aliqua.</p></div>"""),
-            ("""<p>Lorem <strong>ipsum</strong> dolor <it>sit</it> amet, <strong>consectetur</strong> adipisicing elit, sed do eiusmod
- tempor incididunt <strong>ut</strong> labore et dolore magna aliqua. Ut enim ad minim veniam,
- quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
- consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
- cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
- proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
- Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
- tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
- quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
- consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
- cillum dolore eu fugiat nulla pariatur.</p>""","""<div><p>Lorem <strong>ipsum</strong> dolor <it>sit</it> amet, <strong>consectetur</strong></p></div>"""),
-            ("""&iexcl;""",u"""<div><p>\xa1</p></div>"""),
-            ("""<strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong>""",
-             u"""<div><strong>\xa1 \xa1 \xa1 \xa1</strong></div>"""),
-            ("""<strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong><strong>&iexcl; &iexcl; &iexcl; &iexcl;</strong>""",
-             u"""<div><strong>\xa1 \xa1 \xa1 \xa1</strong><strong>\xa1 \xa1 \xa1 \xa1</strong><strong>\xa1 \xa1 \xa1 \xa1</strong><strong>\xa1 \xa1 \xa1 \xa1</strong><strong>\xa1 \xa1 \xa1 \xa1</strong><strong>\xa1 \xa1 \xa1 \xa1</strong><strong>\xa1 \xa1 \xa1 \xa1</strong><strong>\xa1 \xa1 \xa1 \xa1</strong></div>"""),
-                      
-                       
-            ]
-        for text, expected in data:
-            got = uilib.safe_cut(text, 30)
-            self.assertEquals(got, expected)
+    def test_lxml_safe_cut(self):
+        self.assertEquals(uilib.safe_cut(u'aaa<div>aaad</div> ef', 4), u'<p>aaa</p><div>a...</div>')
+        self.assertEquals(uilib.safe_cut(u'aaa<div>aaad</div> ef', 7), u'<p>aaa</p><div>aaad</div>...')
+        self.assertEquals(uilib.safe_cut(u'aaa<div>aaad</div>', 7), u'<p>aaa</p><div>aaad</div>')
+        # Missing ellipsis due to space management but we don't care
+        self.assertEquals(uilib.safe_cut(u'ab <a href="hello">&amp;d</a>', 4), u'<p>ab <a href="hello">&amp;...</a></p>')
 
     def test_cut(self):
         """tests uilib.cut() behaviour"""
         data = [
             ('hello', 'hello'),
-            ('hello world', 'hello...'),
-            ("hell<b>O'</b> world", "hell<..."),
+            ('hello world', 'hello wo...'),
+            ("hell<b>O'</b> world", "hell<b>O..."),
             ]
         for text, expected in data:
             got = uilib.cut(text, 8)
             self.assertEquals(got, expected)
 
-    def test_text_cut_no_text(self):
+    def test_text_cut(self):
         """tests uilib.text_cut() behaviour with no text"""
-        data = [('','')]
-        for text, expected in data:
-            got = uilib.text_cut(text, 8)
-            self.assertEquals(got, expected)
-
-    def test_text_cut_long_text(self):
-        """tests uilib.text_cut() behaviour with long text"""
-        data = [("""Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
+        data = [('',''),
+                ("""Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
 tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
 quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
 consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
-cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
-proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
-tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
-quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
-consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
-cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
-proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-""","""Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
-tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
-quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
-consequat.""")]
-        for text, expected in data:
-            got = uilib.text_cut(text, 30)
-            self.assertEquals(got, expected)
-
-    def  test_text_cut_no_point(self):
-        """tests uilib.text_cut() behaviour with no point"""
-        data = [("""Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
+cillum dolore eu fugiat nulla pariatur.""",
+                 "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod \
+tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, \
+quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo \
+consequat."),
+                ("""Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
 tempor incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam,
 quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
 consequat Duis aute irure dolor in reprehenderit in voluptate velit esse
 cillum dolore eu fugiat nulla pariatur Excepteur sint occaecat cupidatat non
 proident, sunt in culpa qui officia deserunt mollit anim id est laborum
-Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
-tempor incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam,
-quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
-consequat Duis aute irure dolor in reprehenderit in voluptate velit esse
-cillum dolore eu fugiat nulla pariatur Excepteur sint occaecat cupidatat non
-proident, sunt in culpa qui officia deserunt mollit anim id est laborum
-""","""Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
-tempor incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam,
-quis nostrud exercitation ullamco laboris nisi""")]
+""",
+                 "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod \
+tempor incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam, \
+quis nostrud exercitation ullamco laboris nisi"),
+                ]
         for text, expected in data:
             got = uilib.text_cut(text, 30)
             self.assertEquals(got, expected)
--- a/common/uilib.py	Wed Jan 07 16:12:45 2009 +0100
+++ b/common/uilib.py	Wed Jan 07 18:30:00 2009 +0100
@@ -15,7 +15,7 @@
 import re
 from urllib import quote as urlquote
 from cStringIO import StringIO
-from xml.parsers.expat import ExpatError
+from xml.sax.saxutils import unescape
 from copy import deepcopy
 
 import simplejson
@@ -23,6 +23,7 @@
 from mx.DateTime import DateTimeType, DateTimeDeltaType
 
 from logilab.common.textutils import unormalize
+from logilab.mtconverter import html_escape
 
 def ustrftime(date, fmt='%Y-%m-%d'):
     """like strftime, but returns a unicode string instead of an encoded
@@ -116,12 +117,15 @@
     tags from given text if cut is necessary."""
     if text is None:
         return u''
-    text_nohtml = remove_html_tags(text)
+    noenttext = unescape(text)
+    text_nohtml = remove_html_tags(noenttext)
     # try to keep html tags if text is short enough
     if len(text_nohtml) <= length:
         return text
     # else if un-tagged text is too long, cut it
-    return text_nohtml[:length-3] + u'...'
+    return html_escape(text_nohtml[:length] + u'...')
+
+fallback_safe_cut = safe_cut
 
 
 try:
@@ -152,40 +156,64 @@
             """
             if text is None:
                 return u''
-            textParse = etree.HTML(text)
-            compteur = 0
-
-            for element in textParse.iter():
-                if compteur > length:
+            dom = etree.HTML(text)
+            curlength = 0
+            add_ellipsis = False
+            for element in dom.iter():
+                if curlength >= length:
                     parent = element.getparent()
                     parent.remove(element)
+                    if curlength == length and (element.text or element.tail):
+                        add_ellipsis = True
                 else:
                     if element.text is not None:
-                        text_resum = text_cut_letters(element.text,length)
-                        len_text_resum = len(''.join(text_resum.split()))
-                        compteur = compteur + len_text_resum
-                        element.text = text_resum
-
+                        element.text = cut(element.text, length - curlength)
+                        curlength += len(element.text)
                     if element.tail is not None:
-                        if compteur < length:
-                            text_resum = text_cut_letters(element.tail,length)
-                            len_text_resum = len(''.join(text_resum.split()))
-                            compteur = compteur + len_text_resum
-                            element.tail = text_resum
+                        if curlength < length:
+                            element.tail = cut(element.tail, length - curlength)
+                            curlength += len(element.tail)
+                        elif curlength == length:
+                            element.tail = '...'
                         else:
                             element.tail = ''
+            text = etree.tounicode(dom[0])[6:-7] # remove wrapping <body></body>
+            if add_ellipsis:
+                return text + u'...'
+            return text
+        
+def text_cut(text, nbwords=30):
+    """from the given plain text, return a text with at least <nbwords> words,
+    trying to go to the end of the current sentence.
 
-            div = etree.HTML('<div></div>')[0][0]
-            listNode = textParse[0].getchildren()
-            for node in listNode:
-                div.append(deepcopy(node))
-            return etree.tounicode(div)
+    Note that spaces are normalized.
+    """
+    if text is None:
+        return u''
+    words = text.split()
+    text = ' '.join(words) # normalize spaces
+    minlength = len(' '.join(words[:nbwords]))
+    textlength = text.find('.', minlength) + 1
+    if textlength == 0: # no point found
+        textlength = minlength 
+    return text[:textlength]
+
+def cut(text, length):
+    """returns a string of a maximum length <length> based on <text>
+    (approximately, since if text has been cut, '...' is added to the end of the string,
+    resulting in a string of len <length> + 3)
+    """
+    if text is None:
+        return u''
+    if len(text) <= length:
+        return text
+    # else text is too long: truncate it and append an ellipsis
+    return text[:length] + u'...'
+
 
     
 # HTML generation helper functions ############################################
 
-from logilab.mtconverter import html_escape
-
 def tooltipize(text, tooltip, url=None):
     """make an HTML tooltip"""
     url = url or '#'
@@ -221,41 +249,6 @@
         params.append('true')
     return "javascript: replacePageChunk(%s);" % ', '.join(params)
 
-def text_cut(text, nbwords=30):
-    if text is None:
-        return u''
-    minlength = len(' '.join(text.split()[:nbwords]))
-    textlength = text.find('.', minlength) + 1
-    if textlength == 0: # no point found
-        textlength = minlength 
-    return text[:textlength]
-
-def text_cut_letters(text, nbletters):
-    if text is None:
-        return u''
-    if len(''.join(text.split())) <= nbletters:
-           return text
-    else:
-        text_nospace = ''.join(text.split())
-        textlength=text.find('.') + 1
-
-        if textlength==0:
-           textlength=text.find(' ', nbletters+5)
-           
-        return text[:textlength] 
-
-def cut(text, length):
-    """returns a string of length <length> based on <text>
-    post:
-      len(__return__) <= length
-    """
-    if text is None:
-        return u''
-    if len(text) <= length:
-        return text
-    # else if un-tagged text is too long, cut it
-    return text[:length-3] + u'...'
-
 
 from StringIO import StringIO