[dataimport] ucsvreader should skip empty lines unless specified otherwise. Closes #3035944 stable
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Wed, 24 Jul 2013 11:54:35 +0200
branch stable
changeset 9181 2eac0aa1d3f6
parent 9180 13ed6de41774
child 9182 75493f6ca586
[dataimport] ucsvreader should skip empty lines unless specified otherwise. Closes #3035944
dataimport.py
test/unittest_dataimport.py
--- a/dataimport.py	Mon Jul 22 12:08:37 2013 +0200
+++ b/dataimport.py	Wed Jul 24 11:54:35 2013 +0200
@@ -105,8 +105,8 @@
     return i+1
 
 def ucsvreader_pb(stream_or_path, encoding='utf-8', separator=',', quote='"',
-                  skipfirst=False, withpb=True):
-    """same as ucsvreader but a progress bar is displayed as we iter on rows"""
+                  skipfirst=False, withpb=True, skip_empty=True):
+    """same as :func:`ucsvreader` but a progress bar is displayed as we iter on rows"""
     if isinstance(stream_or_path, basestring):
         if not osp.exists(stream_or_path):
             raise Exception("file doesn't exists: %s" % stream_or_path)
@@ -118,23 +118,30 @@
         rowcount -= 1
     if withpb:
         pb = shellutils.ProgressBar(rowcount, 50)
-    for urow in ucsvreader(stream, encoding, separator, quote, skipfirst):
+    for urow in ucsvreader(stream, encoding, separator, quote,
+                           skipfirst=skipfirst, skip_empty=skip_empty):
         yield urow
         if withpb:
             pb.update()
     print ' %s rows imported' % rowcount
 
 def ucsvreader(stream, encoding='utf-8', separator=',', quote='"',
-               skipfirst=False, ignore_errors=False):
+               skipfirst=False, ignore_errors=False, skip_empty=True):
     """A csv reader that accepts files with any encoding and outputs unicode
     strings
+
+    if skip_empty (the default), lines without any values specified (only
+    separators) will be skipped. This is useful for Excel exports which may be
+    full of such lines.
     """
     it = iter(csv.reader(stream, delimiter=separator, quotechar=quote))
     if not ignore_errors:
         if skipfirst:
             it.next()
         for row in it:
-            yield [item.decode(encoding) for item in row]
+            decoded = [item.decode(encoding) for item in row]
+            if not skip_empty or any(decoded):
+                yield [item.decode(encoding) for item in row]
     else:
         # Skip first line
         try:
@@ -151,7 +158,10 @@
             # Error in CSV, ignore line and continue
             except csv.Error:
                 continue
-            yield [item.decode(encoding) for item in row]
+            decoded = [item.decode(encoding) for item in row]
+            if not skip_empty or any(decoded):
+                yield decoded
+
 
 def callfunc_every(func, number, iterable):
     """yield items of `iterable` one by one and call function `func`
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/unittest_dataimport.py	Wed Jul 24 11:54:35 2013 +0200
@@ -0,0 +1,26 @@
+from StringIO import StringIO
+from logilab.common.testlib import TestCase, unittest_main
+from cubicweb import dataimport
+class UcsvreaderTC(TestCase):
+
+    def test_empty_lines_skipped(self):
+        stream = StringIO('''a,b,c,d,
+1,2,3,4,
+,,,,
+,,,,
+''')
+        self.assertEqual([[u'a', u'b', u'c', u'd', u''],
+                          [u'1', u'2', u'3', u'4', u''],
+                          ],
+                         list(dataimport.ucsvreader(stream)))
+        stream.seek(0)
+        self.assertEqual([[u'a', u'b', u'c', u'd', u''],
+                          [u'1', u'2', u'3', u'4', u''],
+                          [u'', u'', u'', u'', u''],
+                          [u'', u'', u'', u'', u'']
+                          ],
+                         list(dataimport.ucsvreader(stream, skip_empty=False)))
+
+
+if __name__ == '__main__':
+    unittest_main()