devtools/dataimport.py
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Wed, 09 Dec 2009 12:22:21 +0100
changeset 4072 ead446e70c28
parent 3486 ea6bf6f9ba0c
child 4136 47060a66c97f
child 4212 ab6573088b4a
permissions -rw-r--r--
some api update
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     1
# -*- coding: utf-8 -*-
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     2
"""This module provides tools to import tabular data.
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     3
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     4
:organization: Logilab
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     5
:copyright: 2001-2009 LOGILAB S.A. (Paris, FRANCE), license is LGPL v2.
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     6
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     7
:license: GNU Lesser General Public License, v2.1 - http://www.gnu.org/licenses
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     8
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
     9
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    10
Example of use (run this with `cubicweb-ctl shell instance import-script.py`):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    11
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    12
.. sourcecode:: python
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    13
3318
5b47b9f09bca documentation : fixed docstring
Alexandre Fayolle <alexandre.fayolle@logilab.fr>
parents: 3029
diff changeset
    14
  from cubicweb.devtools.dataimport import *
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    15
  # define data generators
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    16
  GENERATORS = []
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    17
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    18
  USERS = [('Prenom', 'firstname', ()),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    19
           ('Nom', 'surname', ()),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    20
           ('Identifiant', 'login', ()),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    21
           ]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    22
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    23
  def gen_users(ctl):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    24
      for row in ctl.get_data('utilisateurs'):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    25
          entity = mk_entity(row, USERS)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    26
          entity['upassword'] = u'motdepasse'
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    27
          ctl.check('login', entity['login'], None)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    28
          ctl.store.add('CWUser', entity)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    29
          email = {'address': row['email']}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    30
          ctl.store.add('EmailAddress', email)
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
    31
          ctl.store.relate(entity['eid'], 'use_email', email['eid'])
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
    32
          ctl.store.rql('SET U in_group G WHERE G name "users", U eid %(x)s', {'x':entity['eid']})
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    33
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    34
  CHK = [('login', check_doubles, 'Utilisateurs Login',
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    35
          'Deux utilisateurs ne devraient pas avoir le même login.'),
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    36
         ]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    37
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    38
  GENERATORS.append( (gen_users, CHK) )
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    39
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    40
  # create controller
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    41
  ctl = CWImportController(RQLObjectStore())
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    42
  ctl.askerror = True
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    43
  ctl.generators = GENERATORS
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    44
  ctl.store._checkpoint = checkpoint
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    45
  ctl.store._rql = rql
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    46
  ctl.data['utilisateurs'] = lazytable(utf8csvreader(open('users.csv')))
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    47
  # run
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    48
  ctl.run()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    49
  sys.exit(0)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    50
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    51
"""
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    52
__docformat__ = "restructuredtext en"
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    53
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    54
import sys, csv, traceback
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    55
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    56
from logilab.common import shellutils
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    57
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    58
def utf8csvreader(file, encoding='utf-8', separator=',', quote='"'):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    59
    """A csv reader that accepts files with any encoding and outputs
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    60
    unicode strings."""
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    61
    for row in csv.reader(file, delimiter=separator, quotechar=quote):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    62
        yield [item.decode(encoding) for item in row]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    63
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    64
def lazytable(reader):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    65
    """The first row is taken to be the header of the table and
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    66
    used to output a dict for each row of data.
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    67
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    68
    >>> data = lazytable(utf8csvreader(open(filename)))
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    69
    """
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    70
    header = reader.next()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    71
    for row in reader:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    72
        yield dict(zip(header, row))
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    73
3029
bc573d5fb5b7 F [devtools] by default dataimport prints message on stdout
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 3003
diff changeset
    74
def tell(msg):
bc573d5fb5b7 F [devtools] by default dataimport prints message on stdout
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 3003
diff changeset
    75
    print msg
bc573d5fb5b7 F [devtools] by default dataimport prints message on stdout
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 3003
diff changeset
    76
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    77
# base sanitizing functions #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    78
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    79
def capitalize_if_unicase(txt):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    80
    if txt.isupper() or txt.islower():
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    81
        return txt.capitalize()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    82
    return txt
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    83
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    84
def no_space(txt):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    85
    return txt.replace(' ','')
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    86
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    87
def no_uspace(txt):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    88
    return txt.replace(u'\xa0','')
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    89
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    90
def no_dash(txt):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    91
    return txt.replace('-','')
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    92
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    93
def alldigits(txt):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    94
    if txt.isdigit():
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    95
        return txt
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    96
    else:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    97
        return u''
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    98
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
    99
def strip(txt):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   100
    return txt.strip()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   101
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   102
# base checks #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   103
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   104
def check_doubles(buckets):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   105
    """Extract the keys that have more than one item in their bucket."""
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   106
    return [(key, len(value)) for key,value in buckets.items() if len(value) > 1]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   107
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   108
# make entity helper #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   109
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   110
def mk_entity(row, map):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   111
    """Return a dict made from sanitized mapped values.
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   112
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   113
    >>> row = {'myname': u'dupont'}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   114
    >>> map = [('myname', u'name', (capitalize_if_unicase,))]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   115
    >>> mk_entity(row, map)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   116
    {'name': u'Dupont'}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   117
    """
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   118
    res = {}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   119
    for src, dest, funcs in map:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   120
        res[dest] = row[src]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   121
        for func in funcs:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   122
            res[dest] = func(res[dest])
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   123
    return res
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   124
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   125
# object stores
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   126
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   127
class ObjectStore(object):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   128
    """Store objects in memory for faster testing. Will not
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   129
    enforce the constraints of the schema and hence will miss
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   130
    some problems.
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   131
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   132
    >>> store = ObjectStore()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   133
    >>> user = {'login': 'johndoe'}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   134
    >>> store.add('CWUser', user)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   135
    >>> group = {'name': 'unknown'}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   136
    >>> store.add('CWUser', group)
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   137
    >>> store.relate(user['eid'], 'in_group', group['eid'])
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   138
    """
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   139
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   140
    def __init__(self):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   141
        self.items = []
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   142
        self.eids = {}
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   143
        self.types = {}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   144
        self.relations = set()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   145
        self.indexes = {}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   146
        self._rql = None
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   147
        self._checkpoint = None
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   148
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   149
    def _put(self, type, item):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   150
        self.items.append(item)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   151
        return len(self.items) - 1
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   152
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   153
    def add(self, type, item):
3486
ea6bf6f9ba0c [cwctl] improve dialog messages
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 3318
diff changeset
   154
        assert isinstance(item, dict), 'item is not a dict but a %s' % type(item)
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   155
        eid = item['eid'] = self._put(type, item)
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   156
        self.eids[eid] = item
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   157
        self.types.setdefault(type, []).append(eid)
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   158
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   159
    def relate(self, eid_from, rtype, eid_to):
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   160
        eids_valid = (eid_from < len(self.items) and eid_to <= len(self.items))
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   161
        assert eids_valid, 'eid error %s %s' % (eid_from, eid_to)
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   162
        self.relations.add( (eid_from, rtype, eid_to) )
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   163
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   164
    def build_index(self, name, type, func):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   165
        index = {}
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   166
        for eid in self.types[type]:
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   167
            index.setdefault(func(self.eids[eid]), []).append(eid)
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   168
        self.indexes[name] = index
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   169
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   170
    def get_many(self, name, key):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   171
        return self.indexes[name].get(key, [])
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   172
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   173
    def get_one(self, name, key):
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   174
        eids = self.indexes[name].get(key, [])
3486
ea6bf6f9ba0c [cwctl] improve dialog messages
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 3318
diff changeset
   175
        assert len(eids) == 1, 'expected a single one got %i' % len(eids)
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   176
        return eids[0]
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   177
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   178
    def find(self, type, key, value):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   179
        for idx in self.types[type]:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   180
            item = self.items[idx]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   181
            if item[key] == value:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   182
                yield item
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   183
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   184
    def rql(self, query, args):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   185
        if self._rql:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   186
            return self._rql(query, args)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   187
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   188
    def checkpoint(self):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   189
        if self._checkpoint:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   190
            self._checkpoint()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   191
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   192
class RQLObjectStore(ObjectStore):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   193
    """ObjectStore that works with an actual RQL repository."""
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   194
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   195
    def _put(self, type, item):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   196
        query = ('INSERT %s X: ' % type) + ', '.join(['X %s %%(%s)s' % (key,key) for key in item])
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   197
        return self.rql(query, item)[0][0]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   198
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   199
    def relate(self, eid_from, rtype, eid_to):
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   200
        query = 'SET X %s Y WHERE X eid %%(from)s, Y eid %%(to)s' % rtype
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   201
        self.rql(query, {'from': int(eid_from), 'to': int(eid_to)})
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   202
        self.relations.add( (eid_from, rtype, eid_to) )
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   203
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   204
# import controller #####
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   205
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   206
class CWImportController(object):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   207
    """Controller of the data import process.
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   208
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   209
    >>> ctl = CWImportController(store)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   210
    >>> ctl.generators = list_of_data_generators
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   211
    >>> ctl.data = dict_of_data_tables
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   212
    >>> ctl.run()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   213
    """
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   214
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   215
    def __init__(self, store):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   216
        self.store = store
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   217
        self.generators = None
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   218
        self.data = {}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   219
        self.errors = None
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   220
        self.askerror = False
3029
bc573d5fb5b7 F [devtools] by default dataimport prints message on stdout
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 3003
diff changeset
   221
        self._tell = tell
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   222
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   223
    def check(self, type, key, value):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   224
        self._checks.setdefault(type, {}).setdefault(key, []).append(value)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   225
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   226
    def check_map(self, entity, key, map, default):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   227
        try:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   228
            entity[key] = map[entity[key]]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   229
        except KeyError:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   230
            self.check(key, entity[key], None)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   231
            entity[key] = default
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   232
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   233
    def run(self):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   234
        self.errors = {}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   235
        for func, checks in self.generators:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   236
            self._checks = {}
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   237
            func_name = func.__name__[4:]
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   238
            question = 'Importation de %s' % func_name
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   239
            self.tell(question)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   240
            try:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   241
                func(self)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   242
            except:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   243
                import StringIO
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   244
                tmp = StringIO.StringIO()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   245
                traceback.print_exc(file=tmp)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   246
                print tmp.getvalue()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   247
                self.errors[func_name] = ('Erreur lors de la transformation',
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   248
                                          tmp.getvalue().splitlines())
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   249
            for key, func, title, help in checks:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   250
                buckets = self._checks.get(key)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   251
                if buckets:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   252
                    err = func(buckets)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   253
                    if err:
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   254
                        self.errors[title] = (help, err)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   255
            self.store.checkpoint()
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   256
        errors = sum(len(err[1]) for err in self.errors.values())
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   257
        self.tell('Importation terminée. (%i objets, %i types, %i relations et %i erreurs).'
3003
2944ee420dca R [dataimport] rename uid to eid
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents: 2974
diff changeset
   258
                  % (len(self.store.eids), len(self.store.types),
2974
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   259
                     len(self.store.relations), errors))
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   260
        if self.errors and self.askerror and confirm('Afficher les erreurs ?'):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   261
            import pprint
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   262
            pprint.pprint(self.errors)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   263
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   264
    def get_data(self, key):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   265
        return self.data.get(key)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   266
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   267
    def index(self, name, key, value):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   268
        self.store.indexes.setdefault(name, {}).setdefault(key, []).append(value)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   269
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   270
    def tell(self, msg):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   271
        self._tell(msg)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   272
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   273
def confirm(question):
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   274
    """A confirm function that asks for yes/no/abort and exits on abort."""
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   275
    answer = shellutils.ASK.ask(question, ('Y','n','abort'), 'Y')
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   276
    if answer == 'abort':
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   277
        sys.exit(1)
3dfe497e5afa F tools to import data
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
diff changeset
   278
    return answer == 'Y'