cubicweb/dataimport/csv.py
changeset 11057 0b59724cb3f2
parent 10907 9ae707db5265
child 11313 682b15eb2dd2
equal deleted inserted replaced
11052:058bb3dc685f 11057:0b59724cb3f2
       
     1 # copyright 2003-2015 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
       
     2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
       
     3 #
       
     4 # This file is part of CubicWeb.
       
     5 #
       
     6 # CubicWeb is free software: you can redistribute it and/or modify it under the
       
     7 # terms of the GNU Lesser General Public License as published by the Free
       
     8 # Software Foundation, either version 2.1 of the License, or (at your option)
       
     9 # any later version.
       
    10 #
       
    11 # CubicWeb is distributed in the hope that it will be useful, but WITHOUT
       
    12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
       
    13 # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
       
    14 # details.
       
    15 #
       
    16 # You should have received a copy of the GNU Lesser General Public License along
       
    17 # with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
       
    18 """Functions to help importing CSV data"""
       
    19 from __future__ import absolute_import, print_function
       
    20 
       
    21 import codecs
       
    22 import csv as csvmod
       
    23 import warnings
       
    24 import os.path as osp
       
    25 
       
    26 from six import PY2, PY3, string_types
       
    27 
       
    28 from logilab.common import shellutils
       
    29 
       
    30 
       
    31 def count_lines(stream_or_filename):
       
    32     if isinstance(stream_or_filename, string_types):
       
    33         f = open(stream_or_filename)
       
    34     else:
       
    35         f = stream_or_filename
       
    36         f.seek(0)
       
    37     for i, line in enumerate(f):
       
    38         pass
       
    39     f.seek(0)
       
    40     return i+1
       
    41 
       
    42 
       
    43 def ucsvreader_pb(stream_or_path, encoding='utf-8', delimiter=',', quotechar='"',
       
    44                   skipfirst=False, withpb=True, skip_empty=True, separator=None,
       
    45                   quote=None):
       
    46     """same as :func:`ucsvreader` but a progress bar is displayed as we iter on rows"""
       
    47     if separator is not None:
       
    48         delimiter = separator
       
    49         warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
       
    50     if quote is not None:
       
    51         quotechar = quote
       
    52         warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
       
    53     if isinstance(stream_or_path, string_types):
       
    54         stream = open(stream_or_path, 'rb')
       
    55     else:
       
    56         stream = stream_or_path
       
    57     rowcount = count_lines(stream)
       
    58     if skipfirst:
       
    59         rowcount -= 1
       
    60     if withpb:
       
    61         pb = shellutils.ProgressBar(rowcount, 50)
       
    62     for urow in ucsvreader(stream, encoding, delimiter, quotechar,
       
    63                            skipfirst=skipfirst, skip_empty=skip_empty):
       
    64         yield urow
       
    65         if withpb:
       
    66             pb.update()
       
    67     print(' %s rows imported' % rowcount)
       
    68 
       
    69 
       
    70 def ucsvreader(stream, encoding='utf-8', delimiter=',', quotechar='"',
       
    71                skipfirst=False, ignore_errors=False, skip_empty=True,
       
    72                separator=None, quote=None):
       
    73     """A csv reader that accepts files with any encoding and outputs unicode
       
    74     strings
       
    75 
       
    76     if skip_empty (the default), lines without any values specified (only
       
    77     separators) will be skipped. This is useful for Excel exports which may be
       
    78     full of such lines.
       
    79     """
       
    80     if PY3:
       
    81         stream = codecs.getreader(encoding)(stream)
       
    82     if separator is not None:
       
    83         delimiter = separator
       
    84         warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
       
    85     if quote is not None:
       
    86         quotechar = quote
       
    87         warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
       
    88     it = iter(csvmod.reader(stream, delimiter=delimiter, quotechar=quotechar))
       
    89     if not ignore_errors:
       
    90         if skipfirst:
       
    91             next(it)
       
    92         for row in it:
       
    93             if PY2:
       
    94                 decoded = [item.decode(encoding) for item in row]
       
    95             else:
       
    96                 decoded = row
       
    97             if not skip_empty or any(decoded):
       
    98                 yield decoded
       
    99     else:
       
   100         if skipfirst:
       
   101             try:
       
   102                 row = next(it)
       
   103             except csvmod.Error:
       
   104                 pass
       
   105         # Safe version, that can cope with error in CSV file
       
   106         while True:
       
   107             try:
       
   108                 row = next(it)
       
   109             # End of CSV, break
       
   110             except StopIteration:
       
   111                 break
       
   112             # Error in CSV, ignore line and continue
       
   113             except csvmod.Error:
       
   114                 continue
       
   115             if PY2:
       
   116                 decoded = [item.decode(encoding) for item in row]
       
   117             else:
       
   118                 decoded = row
       
   119             if not skip_empty or any(decoded):
       
   120                 yield decoded