devtools/dataimport.py
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Fri, 18 Sep 2009 11:06:42 +0200
branch3.5
changeset 3314 cfa77453e742
parent 3029 bc573d5fb5b7
child 3318 5b47b9f09bca
permissions -rw-r--r--
Added tag cubicweb-debian-version-3.5.1-1 for changeset f476cecd4690
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     1
# -*- coding: utf-8 -*-
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     2
"""This module provides tools to import tabular data.
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     3
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     4
:organization: Logilab
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     5
:copyright: 2001-2009 LOGILAB S.A. (Paris, FRANCE), license is LGPL v2.
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     6
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     7
:license: GNU Lesser General Public License, v2.1 - http://www.gnu.org/licenses
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     8
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     9
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    10
Example of use (run this with `cubicweb-ctl shell instance import-script.py`):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    11
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    12
.. sourcecode:: python
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    13
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    14
  # define data generators
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    15
  GENERATORS = []
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    16
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    17
  USERS = [('Prenom', 'firstname', ()),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    18
           ('Nom', 'surname', ()),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    19
           ('Identifiant', 'login', ()),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    20
           ]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    21
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    22
  def gen_users(ctl):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    23
      for row in ctl.get_data('utilisateurs'):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    24
          entity = mk_entity(row, USERS)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    25
          entity['upassword'] = u'motdepasse'
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    26
          ctl.check('login', entity['login'], None)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    27
          ctl.store.add('CWUser', entity)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    28
          email = {'address': row['email']}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    29
          ctl.store.add('EmailAddress', email)
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
    30
          ctl.store.relate(entity['eid'], 'use_email', email['eid'])
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
    31
          ctl.store.rql('SET U in_group G WHERE G name "users", U eid %(x)s', {'x':entity['eid']})
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    32
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    33
  CHK = [('login', check_doubles, 'Utilisateurs Login',
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    34
          'Deux utilisateurs ne devraient pas avoir le même login.'),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    35
         ]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    36
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    37
  GENERATORS.append( (gen_users, CHK) )
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    38
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    39
  # create controller
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    40
  ctl = CWImportController(RQLObjectStore())
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    41
  ctl.askerror = True
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    42
  ctl.generators = GENERATORS
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    43
  ctl.store._checkpoint = checkpoint
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    44
  ctl.store._rql = rql
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    45
  ctl.data['utilisateurs'] = lazytable(utf8csvreader(open('users.csv')))
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    46
  # run
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    47
  ctl.run()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    48
  sys.exit(0)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    49
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    50
"""
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    51
__docformat__ = "restructuredtext en"
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    52
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    53
import sys, csv, traceback
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    54
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    55
from logilab.common import shellutils
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    56
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    57
def utf8csvreader(file, encoding='utf-8', separator=',', quote='"'):
    """A csv reader that accepts files with any encoding and outputs
    unicode strings.

    :param file: iterable of CSV lines (typically an open file object),
      handed unchanged to `csv.reader`
    :param encoding: encoding used to decode the cells read as byte strings
    :param separator: field delimiter given to `csv.reader`
    :param quote: quote character given to `csv.reader`

    Yield one list of cells per row. Cells returned by the csv module as
    byte strings are decoded using `encoding`; cells that are already text
    are yielded untouched.
    """
    for row in csv.reader(file, delimiter=separator, quotechar=quote):
        # Only byte strings need decoding: text has no .decode() on python 3
        # and goes through an implicit ascii encoding on python 2, which
        # fails as soon as the cell holds a non-ascii character.
        yield [item.decode(encoding) if isinstance(item, bytes) else item
               for item in row]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    62
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    63
def lazytable(reader):
    """The first row is taken to be the header of the table and
    used to output a dict for each row of data.

    :param reader: iterable of rows, each row being a sequence of cells

    Yield one dict per data row, mapping each header cell to the cell found
    at the same position in the row (pairs are built with `zip`, hence cells
    without a counterpart are dropped). Nothing is yielded when `reader` is
    empty.

    >>> data = lazytable(utf8csvreader(open(filename)))
    """
    rows = iter(reader)
    try:
        header = next(rows)
    except StopIteration:
        # Empty table: stop here instead of letting StopIteration escape
        # from the generator body (a RuntimeError since PEP 479).
        return
    for row in rows:
        yield dict(zip(header, row))
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    72
3029
bc573d5fb5b7 F [devtools] by default dataimport prints message on stdout
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 3003
diff changeset
    73
def tell(msg):
    """Print `msg` on the standard output."""
    # Parenthesized form: valid on python 3 and, for a single argument,
    # strictly equivalent to the python 2 print statement.
    print(msg)
bc573d5fb5b7 F [devtools] by default dataimport prints message on stdout
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 3003
diff changeset
    75
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    76
# base sanitizing functions #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    77
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    78
def capitalize_if_unicase(txt):
    """Capitalize `txt` when it is written in a single case.

    A text that is entirely upper case or entirely lower case is returned
    capitalized; a mixed case text is returned unchanged.
    """
    single_case = txt.isupper() or txt.islower()
    return txt.capitalize() if single_case else txt
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    82
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    83
def no_space(txt):
    """Return `txt` with every space character (' ') removed."""
    return ''.join(txt.split(' '))
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    85
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    86
def no_uspace(txt):
    """Return `txt` with every no-break space (U+00A0) removed."""
    no_break_space = u'\xa0'
    return txt.replace(no_break_space, '')
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    88
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    89
def no_dash(txt):
    """Return `txt` with every dash ('-') removed."""
    dash = '-'
    return txt.replace(dash, '')
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    91
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    92
def alldigits(txt):
    """Return `txt` if it only holds digits, else an empty unicode string."""
    return txt if txt.isdigit() else u''
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    97
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    98
def strip(txt):
    """Return `txt` without its leading and trailing whitespace."""
    stripped = txt.strip()
    return stripped
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   100
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   101
# base checks #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   102
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   103
def check_doubles(buckets):
    """Report the buckets holding more than one item.

    `buckets` maps keys to sized containers; the result is a list of
    ``(key, number of items)`` tuples, one per bucket with several items.
    """
    doubles = []
    for key, bucket in buckets.items():
        size = len(bucket)
        if size > 1:
            doubles.append((key, size))
    return doubles
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   106
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   107
# make entity helper #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   108
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   109
def mk_entity(row, map):
    """Build an entity dict out of `row`, following `map`.

    `map` is a sequence of ``(source key, destination key, functions)``
    triples: the value found in `row` under the source key goes through
    each function in turn and the result is stored in the returned dict
    under the destination key.

    >>> row = {'myname': u'dupont'}
    >>> map = [('myname', u'name', (capitalize_if_unicase,))]
    >>> mk_entity(row, map)
    {'name': u'Dupont'}
    """
    entity = {}
    for source_key, target_key, sanitizers in map:
        value = row[source_key]
        for sanitize in sanitizers:
            value = sanitize(value)
        entity[target_key] = value
    return entity
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   123
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   124
# object stores
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   125
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   126
class ObjectStore(object):
    """Store objects in memory for faster testing. Will not
    enforce the constraints of the schema and hence will miss
    some problems.

    >>> store = ObjectStore()
    >>> user = {'login': 'johndoe'}
    >>> store.add('CWUser', user)
    >>> group = {'name': 'unknown'}
    >>> store.add('CWGroup', group)
    >>> store.relate(user['eid'], 'in_group', group['eid'])
    """

    def __init__(self):
        # every item handled by the default `_put`, in insertion order
        self.items = []
        # eid -> item
        self.eids = {}
        # entity type -> list of the eids added under that type
        self.types = {}
        # (eid_from, rtype, eid_to) triples
        self.relations = set()
        # index name -> {key: [eids]}, filled by `build_index`
        self.indexes = {}
        # optional callables plugged by the user of the store, see
        # `rql` and `checkpoint`
        self._rql = None
        self._checkpoint = None

    def _put(self, type, item):
        """Record `item` and return its eid (here its position in
        `self.items`).
        """
        self.items.append(item)
        return len(self.items) - 1

    def add(self, type, item):
        """Add the `item` dict to the store as an entity of type `type`.

        The eid attributed to the entity is written in ``item['eid']``.
        """
        assert isinstance(item, dict), item
        eid = item['eid'] = self._put(type, item)
        self.eids[eid] = item
        self.types.setdefault(type, []).append(eid)

    def relate(self, eid_from, rtype, eid_to):
        """Record the relation `rtype` from `eid_from` to `eid_to`.

        Both eids must have been attributed by `add`, else an
        AssertionError is raised.
        """
        # Check against the known eids rather than len(self.items): the
        # latter accepted eid_to == len(self.items) and negative values,
        # none of which designate a stored entity.
        eids_valid = (eid_from in self.eids and eid_to in self.eids)
        assert eids_valid, 'eid error %s %s' % (eid_from, eid_to)
        self.relations.add( (eid_from, rtype, eid_to) )

    def build_index(self, name, type, func):
        """Build the index `name` on the entities of type `type`.

        `func` is called with each item and returns the key under which the
        eid of the item is indexed. Raise KeyError if no entity of that type
        has been added.
        """
        index = {}
        for eid in self.types[type]:
            index.setdefault(func(self.eids[eid]), []).append(eid)
        self.indexes[name] = index

    def get_many(self, name, key):
        """Return the list of eids found under `key` in the index `name`
        (empty list if there is none).
        """
        return self.indexes[name].get(key, [])

    def get_one(self, name, key):
        """Return the only eid found under `key` in the index `name`.

        Raise AssertionError unless exactly one eid is indexed under `key`.
        """
        eids = self.indexes[name].get(key, [])
        assert len(eids) == 1, \
            'expected one eid for key %r in index %r, found %s' % (
                key, name, len(eids))
        return eids[0]

    def find(self, type, key, value):
        """Yield the items of type `type` whose `key` entry equals `value`.

        Raise KeyError if no entity of that type has been added or if an
        item has no `key` entry.
        """
        for eid in self.types[type]:
            # Go through self.eids like build_index does: eids are positions
            # in self.items only with the default _put implementation.
            item = self.eids[eid]
            if item[key] == value:
                yield item

    def rql(self, query, args):
        """Run `query` with `args` through the `_rql` callable and return
        its result; do nothing and return None when none has been set.
        """
        if self._rql:
            return self._rql(query, args)

    def checkpoint(self):
        """Call the `_checkpoint` callable, if one has been set."""
        if self._checkpoint:
            self._checkpoint()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   190
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   191
class RQLObjectStore(ObjectStore):
    """ObjectStore that works with an actual RQL repository."""

    def _put(self, type, item):
        """Insert `item` as a `type` entity through an RQL INSERT query and
        return the first cell of the first row of the query result.
        """
        # one "X <attr> %(<attr>)s" assignment per key of the item, the
        # item itself being given as the query arguments
        assignments = ['X %s %%(%s)s' % (attr, attr) for attr in item]
        query = ('INSERT %s X: ' % type) + ', '.join(assignments)
        rset = self.rql(query, item)
        return rset[0][0]

    def relate(self, eid_from, rtype, eid_to):
        """Set the relation `rtype` from `eid_from` to `eid_to` through an
        RQL SET query, then record it in `self.relations`.
        """
        query = 'SET X %s Y WHERE X eid %%(from)s, Y eid %%(to)s' % rtype
        args = {'from': int(eid_from), 'to': int(eid_to)}
        self.rql(query, args)
        relation = (eid_from, rtype, eid_to)
        self.relations.add(relation)
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   202
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   203
# import controller #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   204
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   205
class CWImportController(object):
    """Controller of the data import process.

    >>> ctl = CWImportController(store)
    >>> ctl.generators = list_of_data_generators
    >>> ctl.data = dict_of_data_tables
    >>> ctl.run()

    `generators` is an iterable of ``(func, checks)`` pairs: ``func`` is
    called with the controller as its only argument, and ``checks`` is an
    iterable of ``(key, check_func, title, help)`` tuples evaluated once
    ``func`` has returned (see `run`).
    """

    def __init__(self, store):
        self.store = store
        self.generators = None
        self.data = {}
        # filled by run(): maps an error title to a (help, details) pair
        self.errors = None
        # when true, run() offers to pretty-print the collected errors
        self.askerror = False
        # callable used by tell(); `tell` is resolved in the module namespace
        self._tell = tell
        # values recorded by check(); reset by run() before each generator.
        # Created here as well so that check() can be called before run().
        self._checks = {}

    def check(self, type, key, value):
        """Record `value` under ``self._checks[type][key]`` for later checks."""
        self._checks.setdefault(type, {}).setdefault(key, []).append(value)

    def check_map(self, entity, key, map, default):
        """Replace ``entity[key]`` by its image in `map`.

        When the lookup raises KeyError, the unmapped value is recorded
        through `check` (under type `key`) and ``entity[key]`` is set to
        `default` instead.
        """
        try:
            entity[key] = map[entity[key]]
        except KeyError:
            self.check(key, entity[key], None)
            entity[key] = default

    def run(self):
        """Run every generator, evaluate its checks and report the outcome.

        An exception raised by a generator is printed, stored in
        ``self.errors`` under the generator name and does not stop the
        import. KeyboardInterrupt and SystemExit are propagated, so that an
        abort requested from within a generator really stops the process.
        The store is checkpointed after each generator.
        """
        self.errors = {}
        for func, checks in self.generators:
            self._checks = {}
            # drop the first four characters of the function name
            # (presumably a 'gen_' prefix -- confirm with the import scripts)
            func_name = func.__name__[4:]
            self.tell('Importation de %s' % func_name)
            try:
                func(self)
            except (KeyboardInterrupt, SystemExit):
                # never turn an abort request into a mere import error
                raise
            except Exception:
                trace = traceback.format_exc()
                print(trace)
                self.errors[func_name] = ('Erreur lors de la transformation',
                                          trace.splitlines())
            for key, check_func, title, help_text in checks:
                buckets = self._checks.get(key)
                if buckets:
                    err = check_func(buckets)
                    if err:
                        self.errors[title] = (help_text, err)
            self.store.checkpoint()
        errors = sum(len(err[1]) for err in self.errors.values())
        self.tell('Importation terminée. (%i objets, %i types, %i relations et %i erreurs).'
                  % (len(self.store.eids), len(self.store.types),
                     len(self.store.relations), errors))
        if self.errors and self.askerror and confirm('Afficher les erreurs ?'):
            import pprint
            pprint.pprint(self.errors)

    def get_data(self, key):
        """Return ``self.data[key]``, or None when `key` is unknown."""
        return self.data.get(key)

    def index(self, name, key, value):
        """Append `value` to ``self.store.indexes[name][key]``."""
        self.store.indexes.setdefault(name, {}).setdefault(key, []).append(value)

    def tell(self, msg):
        """Hand `msg` over to the ``self._tell`` callable."""
        self._tell(msg)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   271
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   272
def confirm(question):
    """Ask `question` with the choices Y / n / abort and return True on 'Y'.

    The answer is obtained from ``shellutils.ASK.ask``, with 'Y' passed as
    its third argument (presumably the default choice). Answering 'abort'
    terminates the process through ``sys.exit(1)``.
    """
    choices = ('Y', 'n', 'abort')
    reply = shellutils.ASK.ask(question, choices, 'Y')
    if reply != 'abort':
        return reply == 'Y'
    sys.exit(1)