# HG changeset patch
# User Sylvain Thénault
# Date 1374659675 -7200
# Node ID 2eac0aa1d3f6e1d9bfae656ba1345095bba03094
# Parent 13ed6de4177473b3ef8a99a1b94873729eaa8a1c
[dataimport] ucsvreader should skip empty lines unless specified otherwise. Closes #3035944

diff -r 13ed6de41774 -r 2eac0aa1d3f6 dataimport.py
--- a/dataimport.py	Mon Jul 22 12:08:37 2013 +0200
+++ b/dataimport.py	Wed Jul 24 11:54:35 2013 +0200
@@ -105,8 +105,8 @@
     return i+1
 
 def ucsvreader_pb(stream_or_path, encoding='utf-8', separator=',', quote='"',
-                  skipfirst=False, withpb=True):
-    """same as ucsvreader but a progress bar is displayed as we iter on rows"""
+                  skipfirst=False, withpb=True, skip_empty=True):
+    """same as :func:`ucsvreader` but a progress bar is displayed as we iter on rows"""
     if isinstance(stream_or_path, basestring):
         if not osp.exists(stream_or_path):
             raise Exception("file doesn't exists: %s" % stream_or_path)
@@ -118,23 +118,30 @@
         rowcount -= 1
     if withpb:
         pb = shellutils.ProgressBar(rowcount, 50)
-    for urow in ucsvreader(stream, encoding, separator, quote, skipfirst):
+    for urow in ucsvreader(stream, encoding, separator, quote,
+                           skipfirst=skipfirst, skip_empty=skip_empty):
         yield urow
         if withpb:
             pb.update()
     print ' %s rows imported' % rowcount
 
 def ucsvreader(stream, encoding='utf-8', separator=',', quote='"',
-               skipfirst=False, ignore_errors=False):
+               skipfirst=False, ignore_errors=False, skip_empty=True):
     """A csv reader that accepts files with any encoding and outputs unicode
     strings
+
+    if skip_empty (the default), lines without any values specified (only
+    separators) will be skipped. This is useful for Excel exports which may be
+    full of such lines.
     """
     it = iter(csv.reader(stream, delimiter=separator, quotechar=quote))
     if not ignore_errors:
         if skipfirst:
             it.next()
         for row in it:
-            yield [item.decode(encoding) for item in row]
+            decoded = [item.decode(encoding) for item in row]
+            if not skip_empty or any(decoded):
+                yield decoded
     else:
         # Skip first line
         try:
@@ -151,7 +158,10 @@
             # Error in CSV, ignore line and continue
             except csv.Error:
                 continue
-            yield [item.decode(encoding) for item in row]
+            decoded = [item.decode(encoding) for item in row]
+            if not skip_empty or any(decoded):
+                yield decoded
+
 
 def callfunc_every(func, number, iterable):
     """yield items of `iterable` one by one and call function `func`
diff -r 13ed6de41774 -r 2eac0aa1d3f6 test/unittest_dataimport.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/unittest_dataimport.py	Wed Jul 24 11:54:35 2013 +0200
@@ -0,0 +1,26 @@
+from StringIO import StringIO
+from logilab.common.testlib import TestCase, unittest_main
+from cubicweb import dataimport
+class UcsvreaderTC(TestCase):
+
+    def test_empty_lines_skipped(self):
+        stream = StringIO('''a,b,c,d,
+1,2,3,4,
+,,,,
+,,,,
+''')
+        self.assertEqual([[u'a', u'b', u'c', u'd', u''],
+                          [u'1', u'2', u'3', u'4', u''],
+                          ],
+                         list(dataimport.ucsvreader(stream)))
+        stream.seek(0)
+        self.assertEqual([[u'a', u'b', u'c', u'd', u''],
+                          [u'1', u'2', u'3', u'4', u''],
+                          [u'', u'', u'', u'', u''],
+                          [u'', u'', u'', u'', u'']
+                          ],
+                         list(dataimport.ucsvreader(stream, skip_empty=False)))
+
+
+if __name__ == '__main__':
+    unittest_main()