dataimport/csv.py
author Julien Cristau <julien.cristau@logilab.fr>
Mon, 09 Nov 2015 15:22:43 +0100
changeset 10869 575982c948a9
parent 10807 bb0c7dbd1fe7
child 10907 9ae707db5265
permissions -rw-r--r--
[dataimport] remove drop_index parameter from massive store "drop_index=False" also implied not dropping any constraints, in particular foreign keys, which meant any attempt to import entities would fail, because we only add metadata (the entities table) after the entity's insertion.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     1
# copyright 2003-2015 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     2
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     3
#
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     4
# This file is part of CubicWeb.
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     5
#
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     6
# CubicWeb is free software: you can redistribute it and/or modify it under the
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     7
# terms of the GNU Lesser General Public License as published by the Free
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     8
# Software Foundation, either version 2.1 of the License, or (at your option)
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     9
# any later version.
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    10
#
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    11
# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    12
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    13
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    14
# details.
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    15
#
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    16
# You should have received a copy of the GNU Lesser General Public License along
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    17
# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    18
"""Functions to help importing CSV data"""
10589
7c23b7de2b8d [py3k] print function
Samuel Trégouët <samuel.tregouet@logilab.fr>
parents: 10513
diff changeset
    19
from __future__ import absolute_import, print_function
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    20
10807
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
    21
import codecs
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    22
import csv as csvmod
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    23
import warnings
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    24
import os.path as osp
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    25
10807
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
    26
from six import PY2, PY3, string_types
10612
84468b90e9c1 [py3k] basestring → six.string_types
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10589
diff changeset
    27
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    28
from logilab.common import shellutils
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    29
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    30
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    31
def count_lines(stream_or_filename):
10612
84468b90e9c1 [py3k] basestring → six.string_types
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10589
diff changeset
    32
    if isinstance(stream_or_filename, string_types):
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    33
        f = open(stream_or_filename)
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    34
    else:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    35
        f = stream_or_filename
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    36
        f.seek(0)
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    37
    for i, line in enumerate(f):
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    38
        pass
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    39
    f.seek(0)
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    40
    return i+1
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    41
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    42
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    43
def ucsvreader_pb(stream_or_path, encoding='utf-8', delimiter=',', quotechar='"',
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    44
                  skipfirst=False, withpb=True, skip_empty=True, separator=None,
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    45
                  quote=None):
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    46
    """same as :func:`ucsvreader` but a progress bar is displayed as we iter on rows"""
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    47
    if separator is not None:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    48
        delimiter = separator
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    49
        warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    50
    if quote is not None:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    51
        quotechar = quote
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    52
        warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
10612
84468b90e9c1 [py3k] basestring → six.string_types
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10589
diff changeset
    53
    if isinstance(stream_or_path, string_types):
10807
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
    54
        stream = open(stream_or_path, 'rb')
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    55
    else:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    56
        stream = stream_or_path
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    57
    rowcount = count_lines(stream)
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    58
    if skipfirst:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    59
        rowcount -= 1
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    60
    if withpb:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    61
        pb = shellutils.ProgressBar(rowcount, 50)
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    62
    for urow in ucsvreader(stream, encoding, delimiter, quotechar,
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    63
                           skipfirst=skipfirst, skip_empty=skip_empty):
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    64
        yield urow
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    65
        if withpb:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    66
            pb.update()
10589
7c23b7de2b8d [py3k] print function
Samuel Trégouët <samuel.tregouet@logilab.fr>
parents: 10513
diff changeset
    67
    print(' %s rows imported' % rowcount)
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    68
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    69
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    70
def ucsvreader(stream, encoding='utf-8', delimiter=',', quotechar='"',
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    71
               skipfirst=False, ignore_errors=False, skip_empty=True,
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    72
               separator=None, quote=None):
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    73
    """A csv reader that accepts files with any encoding and outputs unicode
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    74
    strings
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    75
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    76
    if skip_empty (the default), lines without any values specified (only
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    77
    separators) will be skipped. This is useful for Excel exports which may be
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    78
    full of such lines.
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    79
    """
10807
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
    80
    if PY3:
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
    81
        stream = codecs.getreader(encoding)(stream)
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    82
    if separator is not None:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    83
        delimiter = separator
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    84
        warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    85
    if quote is not None:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    86
        quotechar = quote
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    87
        warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    88
    it = iter(csvmod.reader(stream, delimiter=delimiter, quotechar=quotechar))
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    89
    if not ignore_errors:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    90
        if skipfirst:
10669
155c29e0ed1c [py3k] use next builtin instead of next method
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10612
diff changeset
    91
            next(it)
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    92
        for row in it:
10807
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
    93
            if PY2:
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
    94
                decoded = [item.decode(encoding) for item in row]
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
    95
            else:
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
    96
                decoded = row
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    97
            if not skip_empty or any(decoded):
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    98
                yield decoded
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    99
    else:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   100
        if skipfirst:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   101
            try:
10669
155c29e0ed1c [py3k] use next builtin instead of next method
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10612
diff changeset
   102
                row = next(it)
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   103
            except csvmod.Error:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   104
                pass
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   105
        # Safe version, that can cope with error in CSV file
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   106
        while True:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   107
            try:
10669
155c29e0ed1c [py3k] use next builtin instead of next method
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10612
diff changeset
   108
                row = next(it)
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   109
            # End of CSV, break
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   110
            except StopIteration:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   111
                break
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   112
            # Error in CSV, ignore line and continue
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   113
            except csvmod.Error:
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   114
                continue
10807
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
   115
            if PY2:
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
   116
                decoded = [item.decode(encoding) for item in row]
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
   117
            else:
bb0c7dbd1fe7 [dataimport] fix ucsvreader for python3
Julien Cristau <julien.cristau@logilab.fr>
parents: 10669
diff changeset
   118
                decoded = row
10513
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   119
            if not skip_empty or any(decoded):
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   120
                yield decoded
7bec01a59f92 [dataimport] dispatch and deprecate old code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   121