dataimport/csv.py
changeset 10513 7bec01a59f92
child 10589 7c23b7de2b8d
child 11404 98eebbe3de23
equal deleted inserted replaced
10512:99bdd4bddd77 10513:7bec01a59f92
       
     1 # copyright 2003-2015 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
       
     2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
       
     3 #
       
     4 # This file is part of CubicWeb.
       
     5 #
       
     6 # CubicWeb is free software: you can redistribute it and/or modify it under the
       
     7 # terms of the GNU Lesser General Public License as published by the Free
       
     8 # Software Foundation, either version 2.1 of the License, or (at your option)
       
     9 # any later version.
       
    10 #
       
    11 # CubicWeb is distributed in the hope that it will be useful, but WITHOUT
       
    12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
       
    13 # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
       
    14 # details.
       
    15 #
       
    16 # You should have received a copy of the GNU Lesser General Public License along
       
    17 # with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
       
    18 """Functions to help importing CSV data"""
       
    19 
       
    20 from __future__ import absolute_import
       
    21 
       
    22 import csv as csvmod
       
    23 import warnings
       
    24 import os.path as osp
       
    25 
       
    26 from logilab.common import shellutils
       
    27 
       
    28 
       
    29 def count_lines(stream_or_filename):
       
    30     if isinstance(stream_or_filename, basestring):
       
    31         f = open(stream_or_filename)
       
    32     else:
       
    33         f = stream_or_filename
       
    34         f.seek(0)
       
    35     for i, line in enumerate(f):
       
    36         pass
       
    37     f.seek(0)
       
    38     return i+1
       
    39 
       
    40 
       
    41 def ucsvreader_pb(stream_or_path, encoding='utf-8', delimiter=',', quotechar='"',
       
    42                   skipfirst=False, withpb=True, skip_empty=True, separator=None,
       
    43                   quote=None):
       
    44     """same as :func:`ucsvreader` but a progress bar is displayed as we iter on rows"""
       
    45     if separator is not None:
       
    46         delimiter = separator
       
    47         warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
       
    48     if quote is not None:
       
    49         quotechar = quote
       
    50         warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
       
    51     if isinstance(stream_or_path, basestring):
       
    52         if not osp.exists(stream_or_path):
       
    53             raise Exception("file doesn't exists: %s" % stream_or_path)
       
    54         stream = open(stream_or_path)
       
    55     else:
       
    56         stream = stream_or_path
       
    57     rowcount = count_lines(stream)
       
    58     if skipfirst:
       
    59         rowcount -= 1
       
    60     if withpb:
       
    61         pb = shellutils.ProgressBar(rowcount, 50)
       
    62     for urow in ucsvreader(stream, encoding, delimiter, quotechar,
       
    63                            skipfirst=skipfirst, skip_empty=skip_empty):
       
    64         yield urow
       
    65         if withpb:
       
    66             pb.update()
       
    67     print ' %s rows imported' % rowcount
       
    68 
       
    69 
       
    70 def ucsvreader(stream, encoding='utf-8', delimiter=',', quotechar='"',
       
    71                skipfirst=False, ignore_errors=False, skip_empty=True,
       
    72                separator=None, quote=None):
       
    73     """A csv reader that accepts files with any encoding and outputs unicode
       
    74     strings
       
    75 
       
    76     if skip_empty (the default), lines without any values specified (only
       
    77     separators) will be skipped. This is useful for Excel exports which may be
       
    78     full of such lines.
       
    79     """
       
    80     if separator is not None:
       
    81         delimiter = separator
       
    82         warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
       
    83     if quote is not None:
       
    84         quotechar = quote
       
    85         warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
       
    86     it = iter(csvmod.reader(stream, delimiter=delimiter, quotechar=quotechar))
       
    87     if not ignore_errors:
       
    88         if skipfirst:
       
    89             it.next()
       
    90         for row in it:
       
    91             decoded = [item.decode(encoding) for item in row]
       
    92             if not skip_empty or any(decoded):
       
    93                 yield decoded
       
    94     else:
       
    95         if skipfirst:
       
    96             try:
       
    97                 row = it.next()
       
    98             except csvmod.Error:
       
    99                 pass
       
   100         # Safe version, that can cope with error in CSV file
       
   101         while True:
       
   102             try:
       
   103                 row = it.next()
       
   104             # End of CSV, break
       
   105             except StopIteration:
       
   106                 break
       
   107             # Error in CSV, ignore line and continue
       
   108             except csvmod.Error:
       
   109                 continue
       
   110             decoded = [item.decode(encoding) for item in row]
       
   111             if not skip_empty or any(decoded):
       
   112                 yield decoded
       
   113