[dataimport] ucsvreader should skip empty lines unless specified otherwise. Closes #3035944
--- a/dataimport.py Mon Jul 22 12:08:37 2013 +0200
+++ b/dataimport.py Wed Jul 24 11:54:35 2013 +0200
@@ -105,8 +105,8 @@
return i+1
def ucsvreader_pb(stream_or_path, encoding='utf-8', separator=',', quote='"',
- skipfirst=False, withpb=True):
- """same as ucsvreader but a progress bar is displayed as we iter on rows"""
+ skipfirst=False, withpb=True, skip_empty=True):
+ """same as :func:`ucsvreader` but a progress bar is displayed (when `withpb` is true) as we iterate over rows"""
if isinstance(stream_or_path, basestring):
if not osp.exists(stream_or_path):
raise Exception("file doesn't exists: %s" % stream_or_path)
@@ -118,23 +118,30 @@
rowcount -= 1
if withpb:
pb = shellutils.ProgressBar(rowcount, 50)
- for urow in ucsvreader(stream, encoding, separator, quote, skipfirst):
+ for urow in ucsvreader(stream, encoding, separator, quote,
+ skipfirst=skipfirst, skip_empty=skip_empty):
yield urow
if withpb:
pb.update()
print ' %s rows imported' % rowcount
def ucsvreader(stream, encoding='utf-8', separator=',', quote='"',
- skipfirst=False, ignore_errors=False):
+ skipfirst=False, ignore_errors=False, skip_empty=True):
"""A csv reader that accepts files with any encoding and outputs unicode
strings
+
+ if skip_empty (the default), lines without any values specified (only
+ separators) will be skipped. This is useful for Excel exports which may be
+ full of such lines.
"""
it = iter(csv.reader(stream, delimiter=separator, quotechar=quote))
if not ignore_errors:
if skipfirst:
it.next()
for row in it:
- yield [item.decode(encoding) for item in row]
+ decoded = [item.decode(encoding) for item in row]
+ if not skip_empty or any(decoded):  # any() is false when every field is an empty string
+ yield decoded  # reuse the row decoded above, as the ignore_errors branch does
else:
# Skip first line
try:
@@ -151,7 +158,10 @@
# Error in CSV, ignore line and continue
except csv.Error:
continue
- yield [item.decode(encoding) for item in row]
+ decoded = [item.decode(encoding) for item in row]
+ if not skip_empty or any(decoded):
+ yield decoded
+
def callfunc_every(func, number, iterable):
"""yield items of `iterable` one by one and call function `func`
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/unittest_dataimport.py Wed Jul 24 11:54:35 2013 +0200
@@ -0,0 +1,26 @@
+from StringIO import StringIO
+from logilab.common.testlib import TestCase, unittest_main
+from cubicweb import dataimport
+class UcsvreaderTC(TestCase):  # tests for dataimport.ucsvreader
+
+ def test_empty_lines_skipped(self):  # separator-only rows are dropped unless skip_empty=False
+ stream = StringIO('''a,b,c,d,
+1,2,3,4,
+,,,,
+,,,,
+''')
+ self.assertEqual([[u'a', u'b', u'c', u'd', u''],
+ [u'1', u'2', u'3', u'4', u''],
+ ],
+ list(dataimport.ucsvreader(stream)))  # default call: the two separator-only rows are absent
+ stream.seek(0)  # rewind so the same data can be read a second time
+ self.assertEqual([[u'a', u'b', u'c', u'd', u''],
+ [u'1', u'2', u'3', u'4', u''],
+ [u'', u'', u'', u'', u''],
+ [u'', u'', u'', u'', u'']
+ ],
+ list(dataimport.ucsvreader(stream, skip_empty=False)))  # explicit opt-out keeps the empty rows
+
+
+if __name__ == '__main__':
+ unittest_main()