devtools/dataimport.py
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Fri, 18 Dec 2009 14:28:29 +0100
changeset 4140 46ddd27a4ca4
parent 4136 47060a66c97f
child 4152 30fd1229137d
permissions -rw-r--r--
tweaks output
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     1
# -*- coding: utf-8 -*-
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     2
"""This module provides tools to import tabular data.
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     3
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     4
:organization: Logilab
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     5
:copyright: 2001-2009 LOGILAB S.A. (Paris, FRANCE), license is LGPL v2.
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     6
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     7
:license: GNU Lesser General Public License, v2.1 - http://www.gnu.org/licenses
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     8
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     9
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    10
Example of use (run this with `cubicweb-ctl shell instance import-script.py`):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    11
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    12
.. sourcecode:: python
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    13
3318
5b47b9f09bca documentation : fixed docstring
Alexandre Fayolle <alexandre.fayolle@logilab.fr>
parents: 3029
diff changeset
    14
  from cubicweb.devtools.dataimport import *
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    15
  # define data generators
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    16
  GENERATORS = []
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    17
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    18
  USERS = [('Prenom', 'firstname', ()),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    19
           ('Nom', 'surname', ()),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    20
           ('Identifiant', 'login', ()),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    21
           ]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    22
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    23
  def gen_users(ctl):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    24
      for row in ctl.get_data('utilisateurs'):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    25
          entity = mk_entity(row, USERS)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    26
          entity['upassword'] = u'motdepasse'
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    27
          ctl.check('login', entity['login'], None)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    28
          ctl.store.add('CWUser', entity)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    29
          email = {'address': row['email']}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    30
          ctl.store.add('EmailAddress', email)
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
    31
          ctl.store.relate(entity['eid'], 'use_email', email['eid'])
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
    32
          ctl.store.rql('SET U in_group G WHERE G name "users", U eid %(x)s', {'x':entity['eid']})
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    33
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    34
  CHK = [('login', check_doubles, 'Utilisateurs Login',
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    35
          'Deux utilisateurs ne devraient pas avoir le même login.'),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    36
         ]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    37
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    38
  GENERATORS.append( (gen_users, CHK) )
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    39
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    40
  # create controller
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    41
  ctl = CWImportController(RQLObjectStore())
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    42
  ctl.askerror = True
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    43
  ctl.generators = GENERATORS
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    44
  ctl.store._checkpoint = checkpoint
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    45
  ctl.store._rql = rql
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    46
  ctl.data['utilisateurs'] = lazytable(ucsvreader(open('users.csv')))
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    47
  # run
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    48
  ctl.run()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    49
  sys.exit(0)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    50
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    51
"""
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    52
__docformat__ = "restructuredtext en"
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    53
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    54
import sys, csv, traceback
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    55
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    56
from logilab.common import shellutils
4136
47060a66c97f dataimport refactoring / improvments, keeping bw compat (for now)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 3486
diff changeset
    57
from logilab.common.deprecation import deprecated
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    58
4136
47060a66c97f dataimport refactoring / improvments, keeping bw compat (for now)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 3486
diff changeset
    59
def ucsvreader_pb(filepath, encoding='utf-8', separator=',', quote='"',
                  skipfirst=False, withpb=True):
    """same as ucsvreader but a progress bar is displayed as we iter on rows

    :param filepath: path of the csv file to read
    :param encoding: encoding used to decode each cell
    :param separator: csv field delimiter
    :param quote: csv quote character
    :param skipfirst: when true, the first row is skipped and not counted
    :param withpb: when true, a progress bar is updated after each row

    Yields each row as a list of decoded strings, then prints the number of
    rows once the file has been exhausted.
    """
    # Count the lines in-process instead of spawning `wc -l <filepath>`: a
    # shell command built by string interpolation breaks (or runs something
    # else) when the path holds spaces or shell metacharacters, and `wc` is
    # not available on every platform.
    stream = open(filepath)
    try:
        rowcount = sum(1 for _ in stream)
    finally:
        stream.close()
    if skipfirst:
        # never report a negative count for an empty file
        rowcount = max(rowcount - 1, 0)
    pb = None
    if withpb:
        pb = shellutils.ProgressBar(rowcount, 50)
    stream = open(filepath)
    try:
        for urow in ucsvreader(stream, encoding, separator, quote, skipfirst):
            yield urow
            if pb is not None:
                pb.update()
    finally:
        # release the file handle even when the consumer stops early
        stream.close()
    print(' %s rows imported' % rowcount)
47060a66c97f dataimport refactoring / improvments, keeping bw compat (for now)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 3486
diff changeset
    72
47060a66c97f dataimport refactoring / improvments, keeping bw compat (for now)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 3486
diff changeset
    73
def ucsvreader(stream, encoding='utf-8', separator=',', quote='"',
               skipfirst=False):
    """Iterate over the rows of a csv stream, decoding every cell.

    `stream` is handed to `csv.reader` with `separator` as delimiter and
    `quote` as quote character. When `skipfirst` is true the first row is
    consumed and dropped. Each yielded row is a list whose items were
    decoded using `encoding`.
    """
    rows = iter(csv.reader(stream, delimiter=separator, quotechar=quote))
    if skipfirst:
        rows.next()
    for cells in rows:
        decoded = []
        for cell in cells:
            decoded.append(cell.decode(encoding))
        yield decoded
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    83
4136
47060a66c97f dataimport refactoring / improvments, keeping bw compat (for now)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 3486
diff changeset
    84
# Backward-compatible alias for ucsvreader, wrapped with the `deprecated`
# decorator and the message 'use ucsvreader instead'.
utf8csvreader = deprecated('use ucsvreader instead')(ucsvreader)
47060a66c97f dataimport refactoring / improvments, keeping bw compat (for now)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 3486
diff changeset
    85
47060a66c97f dataimport refactoring / improvments, keeping bw compat (for now)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 3486
diff changeset
    86
def commit_every(nbit, store, it):
    """Yield the items of `it`, checkpointing `store` every `nbit` items.

    :param nbit: number of items between two calls to `store.checkpoint()`
    :param store: object exposing a `checkpoint()` method
    :param it: iterable whose items are yielded unchanged

    `store.checkpoint()` is called once `nbit` items have been handed out,
    just before the next item is yielded. No checkpoint is done before the
    first item nor after the last one.
    """
    for i, x in enumerate(it):
        # `i % nbit` alone is true for every index that is *not* a multiple
        # of nbit, which checkpointed on almost every item.
        if i and i % nbit == 0:
            store.checkpoint()
        yield x
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    91
def lazytable(reader):
    """The first row is taken to be the header of the table and
    used to output a dict for each row of data.

    `reader` may be any iterable of rows (an iterator, a generator or a
    plain list). Nothing is yielded when it is empty.

    >>> data = lazytable(ucsvreader(open(filename)))
    """
    rows = iter(reader)
    try:
        header = next(rows)
    except StopIteration:
        # no header row: end the generator explicitly rather than letting
        # StopIteration escape from the generator body
        return
    for row in rows:
        yield dict(zip(header, row))
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   100
3029
bc573d5fb5b7 F [devtools] by default dataimport prints message on stdout
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 3003
diff changeset
   101
def tell(msg):
    """Print `msg` on the standard output."""
    # Call form: same output as the `print msg` statement under Python 2,
    # and still valid syntax where `print` is a function.
    print(msg)
bc573d5fb5b7 F [devtools] by default dataimport prints message on stdout
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 3003
diff changeset
   103
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   104
# base sanitizing functions #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   105
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   106
def capitalize_if_unicase(txt):
    """Capitalize `txt` when it is entirely upper case or entirely lower
    case; text mixing both cases is returned unchanged.
    """
    single_case = txt.isupper() or txt.islower()
    if not single_case:
        return txt
    return txt.capitalize()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   110
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   111
def no_space(txt):
    """Return `txt` with every space character removed."""
    space = ' '
    return txt.replace(space, '')
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   113
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   114
def no_uspace(txt):
    """Return `txt` with every no-break space (U+00A0) removed."""
    nbsp = u'\xa0'
    return txt.replace(nbsp, '')
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   116
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   117
def no_dash(txt):
    """Return `txt` with every '-' character removed."""
    dash = '-'
    return txt.replace(dash, '')
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   119
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   120
def alldigits(txt):
    """Return `txt` when it only contains digits, else an empty unicode
    string.
    """
    return txt if txt.isdigit() else u''
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   125
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   126
def strip(txt):
    """Return `txt` without its leading and trailing whitespace."""
    stripped = txt.strip()
    return stripped
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   128
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   129
# base checks #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   130
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   131
def check_doubles(buckets):
    """Return a list of (key, count) pairs for every key of `buckets`
    whose bucket holds more than one item.
    """
    doubles = []
    for key, value in buckets.items():
        count = len(value)
        if count > 1:
            doubles.append((key, count))
    return doubles
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   134
4136
47060a66c97f dataimport refactoring / improvments, keeping bw compat (for now)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 3486
diff changeset
   135
def check_doubles_not_none(buckets):
    """Return a list of (key, count) pairs for every key of `buckets`
    whose bucket holds more than one item, leaving out the None key.
    """
    doubles = []
    for key, value in buckets.items():
        if key is None:
            continue
        count = len(value)
        if count > 1:
            doubles.append((key, count))
    return doubles
47060a66c97f dataimport refactoring / improvments, keeping bw compat (for now)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 3486
diff changeset
   138
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   139
# make entity helper #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   140
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   141
def mk_entity(row, map):
    """Build a dict from `row` following the `map` description.

    `map` is a sequence of (source key, destination key, functions)
    triples: the value found in `row` under the source key goes through
    each function in turn and the result is stored under the destination
    key.

    >>> row = {'myname': u'dupont'}
    >>> map = [('myname', u'name', (capitalize_if_unicase,))]
    >>> mk_entity(row, map)
    {'name': u'Dupont'}
    """
    entity = {}
    for source_key, target_key, sanitizers in map:
        value = row[source_key]
        for sanitize in sanitizers:
            value = sanitize(value)
        entity[target_key] = value
    return entity
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   155
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   156
# object stores
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   157
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   158
class ObjectStore(object):
    """Store objects in memory for faster testing. Will not
    enforce the constraints of the schema and hence will miss
    some problems.

    >>> store = ObjectStore()
    >>> user = {'login': 'johndoe'}
    >>> store.add('CWUser', user)
    >>> group = {'name': 'unknown'}
    >>> store.add('CWGroup', group)
    >>> store.relate(user['eid'], 'in_group', group['eid'])
    """

    def __init__(self):
        # every added item, in insertion order; an eid is an index in it
        self.items = []
        # eid -> item
        self.eids = {}
        # entity type -> list of eids of that type
        self.types = {}
        # set of (eid_from, rtype, eid_to) triples
        self.relations = set()
        # index name -> {key: [eids]}, filled by build_index
        self.indexes = {}
        self._rql = None
        self._checkpoint = None

    def _put(self, type, item):
        """Append `item` to the store and return its position, used as eid."""
        self.items.append(item)
        return len(self.items) - 1

    def add(self, type, item):
        """Register `item` (a dict) as an entity of type `type`.

        The eid returned by `_put` is stored in `item['eid']`.
        """
        # `type` is the entity type name here and hides the builtin, so the
        # class has to be read from the item itself to build the message.
        assert isinstance(item, dict), 'item is not a dict but a %s' % item.__class__
        eid = item['eid'] = self._put(type, item)
        self.eids[eid] = item
        self.types.setdefault(type, []).append(eid)

    def relate(self, eid_from, rtype, eid_to):
        """Record the relation `rtype` from `eid_from` to `eid_to`.

        Both eids must be positions of items already held by the store.
        """
        nb_items = len(self.items)
        # valid eids are 0 .. nb_items - 1 for both ends of the relation
        eids_valid = (0 <= eid_from < nb_items and 0 <= eid_to < nb_items)
        assert eids_valid, 'eid error %s %s' % (eid_from, eid_to)
        self.relations.add( (eid_from, rtype, eid_to) )

    def build_index(self, name, type, func):
        """Index the entities of type `type` under `name`, grouping their
        eids by the key `func(item)`.

        An entity type without any entity gives an empty index.
        """
        index = {}
        for eid in self.types.get(type, ()):
            index.setdefault(func(self.eids[eid]), []).append(eid)
        self.indexes[name] = index

    def get_many(self, name, key):
        """Return the list of eids found under `key` in index `name`
        (empty when the key is unknown).
        """
        return self.indexes[name].get(key, [])

    def get_one(self, name, key):
        """Return the single eid found under `key` in index `name`."""
        eids = self.indexes[name].get(key, [])
        assert len(eids) == 1, 'expected a single one got %i' % len(eids)
        return eids[0]

    def find(self, type, key, value):
        """Yield the items of type `type` whose `key` entry equals `value`.

        Nothing is yielded for an entity type without any entity.
        """
        for idx in self.types.get(type, ()):
            item = self.items[idx]
            if item[key] == value:
                yield item

    def checkpoint(self):
        """Do nothing: the in-memory store has nothing to commit."""
        pass
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   217
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   218
class RQLObjectStore(ObjectStore):
    """ObjectStore that works with an actual RQL repository.

    Queries are sent through `self.session`, and `self.checkpoint` is bound
    to a commit callable, so that entities and relations are written to the
    repository while the bookkeeping inherited from `ObjectStore` (`eids`,
    `types`, `relations`) is still kept up to date.
    """
    # Backward-compatibility hook: when this is not None, `rql` delegates to
    # it instead of going through `self.session`.
    _rql = None # bw compat

    def __init__(self, session=None, checkpoint=None):
        """Bind the store to `session` and choose the checkpoint callable.

        :param session: either an object exposing `set_pool` (used as is), or
          an object without it, which is treated as a connection:
          `session.request()` then becomes the working session. May be None,
          in which case `self.session` is left unset (bw compat with code
          providing `_rql`).
        :param checkpoint: callable used as `self.checkpoint`; defaults to the
          `commit` method of the connection or of the session.
        """
        ObjectStore.__init__(self)
        if session is not None:
            if not hasattr(session, 'set_pool'):
                # connection
                cnx = session
                session = session.request()
                # `rql` and `create_entity` call `set_pool()` unconditionally:
                # give the request object a do-nothing implementation
                session.set_pool = lambda : None
                checkpoint = checkpoint or cnx.commit
            self.session = session
            self.checkpoint = checkpoint or session.commit
        elif checkpoint is not None:
            self.checkpoint = checkpoint

    def rql(self, *args):
        """Execute a query, forwarding `args` unchanged, and return its result.

        Goes through the `_rql` hook when one is set, else through
        `self.session.execute` after calling `self.session.set_pool()`.
        """
        if self._rql is not None:
            return self._rql(*args)
        self.session.set_pool()
        return self.session.execute(*args)

    def create_entity(self, etype, *args, **kwargs):
        """Create an entity through the session and register it in the store.

        :param etype: entity type; forwarded as first argument of
          `session.create_entity` and used as key in `self.types`. It is an
          explicit parameter so that it may also be given by keyword: reading
          it from `args[0]` raised IndexError *after* the entity had been
          created, leaving the store's bookkeeping out of sync.
        :return: the object returned by `session.create_entity`
        """
        self.session.set_pool()
        entity = self.session.create_entity(etype, *args, **kwargs)
        self.eids[entity.eid] = entity
        self.types.setdefault(etype, []).append(entity.eid)
        return entity

    def _put(self, type, item):
        """Run an INSERT query for `item` and return the first cell of the result.

        Every key of `item` becomes an `X <key> %(<key>)s` clause and `item`
        itself is given as the query arguments.
        NOTE(review): the returned cell is presumably the new entity's eid --
        confirm against the repository API.
        """
        query = ('INSERT %s X: ' % type) + ', '.join(['X %s %%(%s)s' % (key,key) for key in item])
        return self.rql(query, item)[0][0]

    def relate(self, eid_from, rtype, eid_to):
        """Set relation `rtype` from `eid_from` to `eid_to` and record it.

        The eids are converted with `int()` for the query only; the triple is
        added to `self.relations` with the values exactly as received.
        """
        # NOTE(review): the trailing ('x', 'y') is forwarded as is to
        # `session.execute`; presumably the eid cache keys -- confirm
        self.rql('SET X %s Y WHERE X eid %%(x)s, Y eid %%(y)s' % rtype,
                  {'x': int(eid_from), 'y': int(eid_to)}, ('x', 'y'))
        self.relations.add( (eid_from, rtype, eid_to) )
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   257
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   258
# import controller #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   259
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   260
class CWImportController(object):
    """Controller of the data import process.

    >>> ctl = CWImportController(store)
    >>> ctl.generators = list_of_data_generators
    >>> ctl.data = dict_of_data_tables
    >>> ctl.run()

    `generators` is an iterable of `(func, checks)` pairs: `func` is called
    with the controller as only argument, `checks` is an iterable of
    `(key, check_func, title, help)` tuples evaluated once `func` is done.
    After `run()`, `errors` maps an error title to a `(help, details)` pair.
    """

    def __init__(self, store):
        """Set up a controller writing to `store`, with nothing to import yet."""
        self.store = store
        self.generators = None
        self.data = {}
        self.errors = None
        # when true, `run` offers to display the collected errors
        self.askerror = False
        # output function used by `self.tell`; `tell` is a module-level function
        self._tell = tell
        # `run` resets this before each generator; it is created here as well
        # so that `check` / `check_map` do not raise AttributeError when they
        # are used before `run`
        self._checks = {}

    def check(self, type, key, value):
        """Record `value` under `self._checks[type][key]` for later checking."""
        self._checks.setdefault(type, {}).setdefault(key, []).append(value)

    def check_map(self, entity, key, map, default):
        """Replace `entity[key]` by its image in `map`, or by `default`.

        When the current value is not a key of `map`, it is recorded with
        `self.check(key, <value>, None)` and `entity[key]` is set to
        `default`. A KeyError is raised if `entity` has no `key` entry.
        """
        value = entity[key]
        try:
            entity[key] = map[value]
        except KeyError:
            self.check(key, value, None)
            entity[key] = default

    def run(self):
        """Call every generator, run its checks and checkpoint the store.

        An exception raised by a generator is printed and stored in
        `self.errors`, then the import goes on: the generator's checks are
        still evaluated and the store is still checkpointed. A summary is
        emitted at the end, plus the error count if any; the errors are
        displayed when `self.askerror` is set and the user confirms.
        """
        self.errors = {}
        for func, checks in self.generators:
            self._checks = {}
            # drops the first 4 characters of the function name
            # (presumably a `gen_` prefix -- confirm with the import scripts)
            func_name = func.__name__[4:]
            self.tell('Importing %s' % func_name)
            try:
                func(self)
            except Exception:
                # not a bare `except:`, so that KeyboardInterrupt / SystemExit
                # abort the import instead of being recorded as an error
                trace = traceback.format_exc()
                # parenthesised single argument: behaves the same whether
                # `print` is a statement or a function
                print(trace)
                # use a list to avoid counting a <nb lines> errors instead of one
                self.errors[func_name] = ('Erreur lors de la transformation',
                                          [trace.splitlines()])
            for key, check_func, title, help_text in checks:
                buckets = self._checks.get(key)
                if buckets:
                    err = check_func(buckets)
                    if err:
                        self.errors[title] = (help_text, err)
            self.store.checkpoint()
        self.tell('\nImport completed: %i entities (%i types), %i relations'
                  % (len(self.store.eids), len(self.store.types),
                     len(self.store.relations)))
        nberrors = sum(len(err[1]) for err in self.errors.values())
        if nberrors:
            print('%s errors' % nberrors)
        if self.errors and self.askerror and confirm('Display errors?'):
            import pprint
            pprint.pprint(self.errors)

    def get_data(self, key):
        """Return `self.data[key]`, or None when there is no such entry."""
        return self.data.get(key)

    def index(self, name, key, value):
        """Append `value` to the list stored at `self.store.indexes[name][key]`."""
        self.store.indexes.setdefault(name, {}).setdefault(key, []).append(value)

    def tell(self, msg):
        """Emit `msg` through the configured `self._tell` callable."""
        self._tell(msg)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   329
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   330
def confirm(question):
    """Ask `question` with the choices Y / n / abort, 'Y' being the default.

    Return True when the answer is 'Y' and False when it is 'n'; choosing
    'abort' terminates the process through `sys.exit(1)`.
    """
    choices = ('Y', 'n', 'abort')
    reply = shellutils.ASK.ask(question, choices, 'Y')
    if reply != 'abort':
        return reply == 'Y'
    sys.exit(1)