|
1 # copyright 2003-2015 LOGILAB S.A. (Paris, FRANCE), all rights reserved. |
|
2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr |
|
3 # |
|
4 # This file is part of CubicWeb. |
|
5 # |
|
6 # CubicWeb is free software: you can redistribute it and/or modify it under the |
|
7 # terms of the GNU Lesser General Public License as published by the Free |
|
8 # Software Foundation, either version 2.1 of the License, or (at your option) |
|
9 # any later version. |
|
10 # |
|
11 # CubicWeb is distributed in the hope that it will be useful, but WITHOUT |
|
12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
|
13 # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more |
|
14 # details. |
|
15 # |
|
16 # You should have received a copy of the GNU Lesser General Public License along |
|
17 # with CubicWeb. If not, see <http://www.gnu.org/licenses/>. |
|
18 """Functions to help importing CSV data""" |
|
19 |
|
20 from __future__ import absolute_import |
|
21 |
|
22 import csv as csvmod |
|
23 import warnings |
|
24 import os.path as osp |
|
25 |
|
26 from logilab.common import shellutils |
|
27 |
|
28 |
|
def count_lines(stream_or_filename):
    """Return the number of lines in a file given by path or as an open stream.

    :param stream_or_filename: either a path (string) or a seekable,
      line-iterable stream.
    :return: the number of lines found; 0 for empty input.

    When a stream is given it is rewound to its start before counting and
    again afterwards, so the caller can read it from the beginning. When a
    path is given the file is opened here and closed before returning.
    """
    # `basestring` only exists on Python 2; fall back to `str` elsewhere so
    # the path/stream detection keeps working on both major versions.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(stream_or_filename, string_types):
        # We own this file handle: make sure it does not leak.
        with open(stream_or_filename) as f:
            return sum(1 for _ in f)
    f = stream_or_filename
    f.seek(0)
    # Summing over a generator yields 0 for empty input, where an
    # enumerate-based loop would leave its counter unbound.
    count = sum(1 for _ in f)
    f.seek(0)
    return count
|
39 |
|
40 |
|
def ucsvreader_pb(stream_or_path, encoding='utf-8', delimiter=',', quotechar='"',
                  skipfirst=False, withpb=True, skip_empty=True, separator=None,
                  quote=None):
    """Same as :func:`ucsvreader` but a progress bar is displayed as we iterate
    on rows.

    :param stream_or_path: path of the CSV file, or an already opened stream.
    :param encoding: forwarded to :func:`ucsvreader`.
    :param delimiter: forwarded to :func:`ucsvreader`.
    :param quotechar: forwarded to :func:`ucsvreader`.
    :param skipfirst: forwarded to :func:`ucsvreader`; also removes one line
      from the displayed total.
    :param withpb: when true, a `shellutils.ProgressBar` is created and updated
      after each yielded row.
    :param skip_empty: forwarded to :func:`ucsvreader`.
    :param separator: deprecated alias of `delimiter`.
    :param quote: deprecated alias of `quotechar`.
    :raise Exception: if `stream_or_path` is a path that does not exist.

    This is a generator: nothing happens before the first iteration. A file
    opened here from a path is closed once iteration ends or the generator is
    closed; a stream supplied by the caller is left open.
    """
    if separator is not None:
        delimiter = separator
        warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
    if quote is not None:
        quotechar = quote
        warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
    # `basestring` only exists on Python 2; fall back to `str` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str
    opened_here = isinstance(stream_or_path, string_types)
    if opened_here:
        if not osp.exists(stream_or_path):
            raise Exception("file doesn't exists: %s" % stream_or_path)
        stream = open(stream_or_path)
    else:
        stream = stream_or_path
    try:
        # Total is the number of physical lines (minus the header when
        # skipped), not the number of rows that will actually be yielded.
        rowcount = count_lines(stream)
        if skipfirst:
            rowcount -= 1
        if withpb:
            pb = shellutils.ProgressBar(rowcount, 50)
        for urow in ucsvreader(stream, encoding, delimiter, quotechar,
                               skipfirst=skipfirst, skip_empty=skip_empty):
            yield urow
            if withpb:
                pb.update()
        # Single-argument call form prints the same text on Python 2 and 3.
        print(' %s rows imported' % rowcount)
    finally:
        # Only close what we opened ourselves.
        if opened_here:
            stream.close()
|
68 |
|
69 |
|
def ucsvreader(stream, encoding='utf-8', delimiter=',', quotechar='"',
               skipfirst=False, ignore_errors=False, skip_empty=True,
               separator=None, quote=None):
    """A csv reader that accepts files with any encoding and outputs unicode
    strings.

    :param stream: iterable of lines handed to `csv.reader`.
    :param encoding: encoding used to decode byte-string cells.
    :param delimiter: field delimiter given to `csv.reader`.
    :param quotechar: quote character given to `csv.reader`.
    :param skipfirst: if true, the first row (header) is discarded.
    :param ignore_errors: if true, rows on which `csv.reader` raises
      `csv.Error` are silently skipped instead of aborting the iteration.
    :param skip_empty: if true (the default), lines without any values
      specified (only separators) are skipped. This is useful for Excel
      exports which may be full of such lines.
    :param separator: deprecated alias of `delimiter`.
    :param quote: deprecated alias of `quotechar`.
    :return: a generator yielding one list of unicode strings per row.
    """
    if separator is not None:
        delimiter = separator
        warnings.warn("[3.20] 'separator' kwarg is deprecated, use 'delimiter' instead")
    if quote is not None:
        quotechar = quote
        warnings.warn("[3.20] 'quote' kwarg is deprecated, use 'quotechar' instead")
    it = iter(csvmod.reader(stream, delimiter=delimiter, quotechar=quotechar))
    if skipfirst:
        try:
            next(it)
        except StopIteration:
            # Empty input: nothing to skip and nothing to yield. Caught
            # explicitly so it never escapes the generator body.
            return
        except csvmod.Error:
            # A broken header is only tolerated in error-ignoring mode.
            if not ignore_errors:
                raise
    while True:
        try:
            row = next(it)
        except StopIteration:
            # End of CSV
            return
        except csvmod.Error:
            # Error in CSV: drop the line and continue, or propagate
            if ignore_errors:
                continue
            raise
        # Byte strings are decoded; cells that are already text (as produced
        # from text streams) are passed through unchanged.
        decoded = [item.decode(encoding) if isinstance(item, bytes) else item
                   for item in row]
        if not skip_empty or any(decoded):
            yield decoded
|
113 |