dataimport/csv.py
changeset 10513 7bec01a59f92
child 10589 7c23b7de2b8d
child 11404 98eebbe3de23
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dataimport/csv.py	Fri Jun 26 11:15:25 2015 +0200
@@ -0,0 +1,113 @@
+# copyright 2003-2015 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
+#
+# This file is part of CubicWeb.
+#
+# CubicWeb is free software: you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
+"""Functions to help importing CSV data"""
+
+from __future__ import absolute_import
+
+import csv as csvmod
+import warnings
+import os.path as osp
+
+from logilab.common import shellutils
+
+
+def count_lines(stream_or_filename):
+    if isinstance(stream_or_filename, basestring):
+        f = open(stream_or_filename)
+    else:
+        f = stream_or_filename
+        f.seek(0)
+    for i, line in enumerate(f):
+        pass
+    f.seek(0)
+    return i+1
+
+
+def ucsvreader_pb(stream_or_path, encoding='utf-8', delimiter=',', quotechar='"',
+                  skipfirst=False, withpb=True, skip_empty=True, separator=None,
+                  quote=None):
+    """same as :func:`ucsvreader` but a progress bar is displayed as we iter on rows"""
+    if separator is not None:
+        delimiter = separator
+        warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
+    if quote is not None:
+        quotechar = quote
+        warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
+    if isinstance(stream_or_path, basestring):
+        if not osp.exists(stream_or_path):
+            raise Exception("file doesn't exists: %s" % stream_or_path)
+        stream = open(stream_or_path)
+    else:
+        stream = stream_or_path
+    rowcount = count_lines(stream)
+    if skipfirst:
+        rowcount -= 1
+    if withpb:
+        pb = shellutils.ProgressBar(rowcount, 50)
+    for urow in ucsvreader(stream, encoding, delimiter, quotechar,
+                           skipfirst=skipfirst, skip_empty=skip_empty):
+        yield urow
+        if withpb:
+            pb.update()
+    print ' %s rows imported' % rowcount
+
+
+def ucsvreader(stream, encoding='utf-8', delimiter=',', quotechar='"',
+               skipfirst=False, ignore_errors=False, skip_empty=True,
+               separator=None, quote=None):
+    """A csv reader that accepts files with any encoding and outputs unicode
+    strings
+
+    if skip_empty (the default), lines without any values specified (only
+    separators) will be skipped. This is useful for Excel exports which may be
+    full of such lines.
+    """
+    if separator is not None:
+        delimiter = separator
+        warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
+    if quote is not None:
+        quotechar = quote
+        warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
+    it = iter(csvmod.reader(stream, delimiter=delimiter, quotechar=quotechar))
+    if not ignore_errors:
+        if skipfirst:
+            it.next()
+        for row in it:
+            decoded = [item.decode(encoding) for item in row]
+            if not skip_empty or any(decoded):
+                yield decoded
+    else:
+        if skipfirst:
+            try:
+                row = it.next()
+            except csvmod.Error:
+                pass
+        # Safe version, that can cope with error in CSV file
+        while True:
+            try:
+                row = it.next()
+            # End of CSV, break
+            except StopIteration:
+                break
+            # Error in CSV, ignore line and continue
+            except csvmod.Error:
+                continue
+            decoded = [item.decode(encoding) for item in row]
+            if not skip_empty or any(decoded):
+                yield decoded
+