1 # copyright 2003-2015 LOGILAB S.A. (Paris, FRANCE), all rights reserved. |
|
2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr |
|
3 # |
|
4 # This file is part of CubicWeb. |
|
5 # |
|
6 # CubicWeb is free software: you can redistribute it and/or modify it under the |
|
7 # terms of the GNU Lesser General Public License as published by the Free |
|
8 # Software Foundation, either version 2.1 of the License, or (at your option) |
|
9 # any later version. |
|
10 # |
|
11 # CubicWeb is distributed in the hope that it will be useful, but WITHOUT |
|
12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
|
13 # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more |
|
14 # details. |
|
15 # |
|
16 # You should have received a copy of the GNU Lesser General Public License along |
|
17 # with CubicWeb. If not, see <http://www.gnu.org/licenses/>. |
|
18 """Functions to help importing CSV data""" |
|
19 from __future__ import absolute_import, print_function |
|
20 |
|
21 import codecs |
|
22 import csv as csvmod |
|
23 import warnings |
|
24 import os.path as osp |
|
25 |
|
26 from six import PY2, PY3, string_types |
|
27 |
|
28 from logilab.common import shellutils |
|
29 |
|
30 |
|
def count_lines(stream_or_filename):
    """Return the number of lines found in a file or seekable stream.

    :param stream_or_filename: either a file name (any ``string_types``
      instance), which is opened in text mode for the duration of the count,
      or an already opened stream supporting ``seek(0)`` and line iteration.
    :return: the number of lines, 0 for an empty input.

    When a stream is given it is rewound before counting and rewound again
    afterwards, so the caller can read it from the start.
    """
    if isinstance(stream_or_filename, string_types):
        # We own this file handle: make sure it is closed once counted
        # instead of leaking it until garbage collection.
        with open(stream_or_filename) as f:
            return sum(1 for _ in f)
    f = stream_or_filename
    f.seek(0)
    # Counting with sum() yields 0 on empty input, where an enumerate()-based
    # loop would leave its index variable unbound.
    count = sum(1 for _ in f)
    f.seek(0)
    return count
|
41 |
|
42 |
|
def ucsvreader_pb(stream_or_path, encoding='utf-8', delimiter=',', quotechar='"',
                  skipfirst=False, withpb=True, skip_empty=True, separator=None,
                  quote=None):
    """Same as :func:`ucsvreader` but a progress bar is displayed as we iter on rows.

    :param stream_or_path: a file name (opened here in binary mode and closed
      when the generator finishes or is closed) or an already opened stream,
      which is left open for the caller.
    :param encoding: encoding forwarded to :func:`ucsvreader`.
    :param delimiter: field delimiter forwarded to :func:`ucsvreader`.
    :param quotechar: quote character forwarded to :func:`ucsvreader`.
    :param skipfirst: if true, the first row is skipped and not counted in
      the total.
    :param withpb: if true, a ``shellutils.ProgressBar`` is created and
      updated after each yielded row.
    :param skip_empty: forwarded to :func:`ucsvreader`.
    :param separator: deprecated alias for ``delimiter``.
    :param quote: deprecated alias for ``quotechar``.

    The total is the number of physical lines reported by
    :func:`count_lines`, not the number of rows actually yielded.
    """
    if separator is not None:
        delimiter = separator
        warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
    if quote is not None:
        quotechar = quote
        warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
    # Remember whether we own the file handle so that only handles opened
    # here get closed; caller-provided streams are left untouched.
    opened_here = isinstance(stream_or_path, string_types)
    if opened_here:
        stream = open(stream_or_path, 'rb')
    else:
        stream = stream_or_path
    try:
        rowcount = count_lines(stream)
        # Do not let the header adjustment push the total below zero.
        if skipfirst and rowcount > 0:
            rowcount -= 1
        if withpb:
            pb = shellutils.ProgressBar(rowcount, 50)
        for urow in ucsvreader(stream, encoding, delimiter, quotechar,
                               skipfirst=skipfirst, skip_empty=skip_empty):
            yield urow
            if withpb:
                pb.update()
        print(' %s rows imported' % rowcount)
    finally:
        # Runs on exhaustion, on error and when the generator is closed early.
        if opened_here:
            stream.close()
|
68 |
|
69 |
|
def ucsvreader(stream, encoding='utf-8', delimiter=',', quotechar='"',
               skipfirst=False, ignore_errors=False, skip_empty=True,
               separator=None, quote=None):
    """A csv reader that accepts files with any encoding and outputs unicode
    strings

    :param stream: stream of encoded data; under Python 3 it is wrapped in a
      ``codecs`` reader for `encoding`, under Python 2 each cell is decoded
      after parsing.
    :param encoding: encoding used to decode the data.
    :param delimiter: field delimiter given to the ``csv`` reader.
    :param quotechar: quote character given to the ``csv`` reader.
    :param skipfirst: if true, the first row (typically a header) is dropped.
    :param ignore_errors: if true, rows triggering a ``csv.Error`` are
      silently dropped instead of aborting the iteration.
    :param skip_empty: see below.
    :param separator: deprecated alias for ``delimiter``.
    :param quote: deprecated alias for ``quotechar``.

    if skip_empty (the default), lines without any values specified (only
    separators) will be skipped. This is useful for Excel exports which may be
    full of such lines.
    """
    if PY3:
        stream = codecs.getreader(encoding)(stream)
    if separator is not None:
        delimiter = separator
        warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
    if quote is not None:
        quotechar = quote
        warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
    it = iter(csvmod.reader(stream, delimiter=delimiter, quotechar=quotechar))
    if skipfirst:
        try:
            next(it)
        except StopIteration:
            # Empty input: end the generator cleanly. Letting StopIteration
            # escape would become a RuntimeError under PEP 479.
            return
        except csvmod.Error:
            if not ignore_errors:
                raise
    while True:
        try:
            row = next(it)
        except StopIteration:
            # End of CSV
            break
        except csvmod.Error:
            if not ignore_errors:
                raise
            # Error in CSV, ignore line and continue
            continue
        if PY2:
            row = [item.decode(encoding) for item in row]
        if not skip_empty or any(row):
            yield row
|