devtools/dataimport.py
changeset 4527 67ab70e98488
parent 4252 6c4f109c2b03
child 4613 141a4f613f8a
equal deleted inserted replaced
4526:75dff36ff7a3 4527:67ab70e98488
    37 
    37 
    38   GENERATORS.append( (gen_users, CHK) )
    38   GENERATORS.append( (gen_users, CHK) )
    39 
    39 
    40   # create controller
    40   # create controller
    41   ctl = CWImportController(RQLObjectStore())
    41   ctl = CWImportController(RQLObjectStore())
    42   ctl.askerror = True
    42   ctl.askerror = 1
    43   ctl.generators = GENERATORS
    43   ctl.generators = GENERATORS
    44   ctl.store._checkpoint = checkpoint
    44   ctl.store._checkpoint = checkpoint
    45   ctl.store._rql = rql
    45   ctl.store._rql = rql
    46   ctl.data['utilisateurs'] = lazytable(utf8csvreader(open('users.csv')))
    46   ctl.data['utilisateurs'] = lazytable(utf8csvreader(open('users.csv')))
    47   # run
    47   # run
    48   ctl.run()
    48   ctl.run()
    49   sys.exit(0)
    49   sys.exit(0)
    50 
    50 
       
    51 
       
    52 .. BUG fichier à une colonne pose un problème de parsing
       
    53 .. TODO rollback()
    51 """
    54 """
    52 __docformat__ = "restructuredtext en"
    55 __docformat__ = "restructuredtext en"
    53 
    56 
    54 import sys
    57 import sys
    55 import csv
    58 import csv
   108         yield dict(zip(header, row))
   111         yield dict(zip(header, row))
   109 
   112 
   110 def mk_entity(row, map):
   113 def mk_entity(row, map):
   111     """Return a dict made from sanitized mapped values.
   114     """Return a dict made from sanitized mapped values.
   112 
   115 
       
   116     ValidationError can be raised on unexpected values found in checkers
       
   117 
   113     >>> row = {'myname': u'dupont'}
   118     >>> row = {'myname': u'dupont'}
   114     >>> map = [('myname', u'name', (capitalize_if_unicase,))]
   119     >>> map = [('myname', u'name', (capitalize_if_unicase,))]
   115     >>> mk_entity(row, map)
   120     >>> mk_entity(row, map)
   116     {'name': u'Dupont'}
   121     {'name': u'Dupont'}
       
   122     >>> row = {'myname': u'dupont', 'optname': u''}
       
   123     >>> map = [('myname', u'name', (capitalize_if_unicase,)),
       
   124     ...        ('optname', u'MARKER', (optional,))]
       
   125     >>> mk_entity(row, map)
       
   126     {'name': u'Dupont'}
   117     """
   127     """
   118     res = {}
   128     res = {}
       
   129     assert isinstance(row, dict)
       
   130     assert isinstance(map, list)
   119     for src, dest, funcs in map:
   131     for src, dest, funcs in map:
       
   132         assert not (required in funcs and optional in funcs), "optional and required checks are exclusive"
   120         res[dest] = row[src]
   133         res[dest] = row[src]
   121         for func in funcs:
   134         try:
   122             res[dest] = func(res[dest])
   135             for func in funcs:
       
   136                 res[dest] = func(res[dest])
       
   137             if res[dest] is None or res[dest]==False:
       
   138                 raise AssertionError('undetermined value')
       
   139         except AssertionError, err:
       
   140             if optional in funcs:
       
   141                 # Forget this field if exception is coming from optional function
       
   142                del res[dest]
       
   143             else:
       
   144                raise AssertionError('error with "%s" field: %s' % (src, err))
   123     return res
   145     return res
   124 
   146 
   125 
   147 
   126 # user interactions ############################################################
   148 # user interactions ############################################################
   127 
   149 
   161 def capitalize_if_unicase(txt):
   183 def capitalize_if_unicase(txt):
   162     if txt.isupper() or txt.islower():
   184     if txt.isupper() or txt.islower():
   163         return txt.capitalize()
   185         return txt.capitalize()
   164     return txt
   186     return txt
   165 
   187 
       
   188 def uppercase(txt):
       
   189     return txt.upper()
       
   190 
       
   191 def lowercase(txt):
       
   192     return txt.lower()
       
   193 
   166 def no_space(txt):
   194 def no_space(txt):
   167     return txt.replace(' ','')
   195     return txt.replace(' ','')
   168 
   196 
   169 def no_uspace(txt):
   197 def no_uspace(txt):
   170     return txt.replace(u'\xa0','')
   198     return txt.replace(u'\xa0','')
   171 
   199 
   172 def no_dash(txt):
   200 def no_dash(txt):
   173     return txt.replace('-','')
   201     return txt.replace('-','')
   174 
   202 
       
   203 def decimal(value):
       
   204     """cast to float but with comma replacement
       
   205 
       
   206     We take care of some locale format as replacing ',' by '.'"""
       
   207     value = value.replace(',', '.')
       
   208     try:
       
   209         return float(value)
       
   210     except Exception, err:
       
   211         raise AssertionError(err)
       
   212 
       
   213 def integer(value):
       
   214     try:
       
   215         return int(value)
       
   216     except Exception, err:
       
   217         raise AssertionError(err)
       
   218 
       
   219 def strip(txt):
       
   220     return txt.strip()
       
   221 
       
   222 def yesno(value):
       
   223     return value.lower()[0] in 'yo1'
       
   224 
       
   225 def isalpha(value):
       
   226     if value.isalpha():
       
   227         return value
       
   228     raise AssertionError("not all characters in the string alphabetic")
       
   229 
       
   230 def optional(value):
       
    231     """validation error will not be raised if you add this checker in chain"""
       
   232     return value
       
   233 
       
   234 def required(value):
       
    235     """raise AssertionError if value is empty
       
   236 
       
   237     This check should be often found in last position in the chain.
       
   238     """
       
   239     if bool(value):
       
   240         return value
       
   241     raise AssertionError("required")
       
   242 
       
   243 @deprecated('use required(value)')
       
   244 def nonempty(value):
       
   245     return required(value)
       
   246 
       
   247 @deprecated('use integer(value)')
   175 def alldigits(txt):
   248 def alldigits(txt):
   176     if txt.isdigit():
   249     if txt.isdigit():
   177         return txt
   250         return txt
   178     else:
   251     else:
   179         return u''
   252         return u''
   180 
   253 
   181 def strip(txt):
       
   182     return txt.strip()
       
   183 
       
   184 
   254 
   185 # base integrity checking functions ############################################
   255 # base integrity checking functions ############################################
   186 
   256 
   187 def check_doubles(buckets):
   257 def check_doubles(buckets):
   188     """Extract the keys that have more than one item in their bucket."""
   258     """Extract the keys that have more than one item in their bucket."""
   194 
   264 
   195 
   265 
   196 # object stores #################################################################
   266 # object stores #################################################################
   197 
   267 
   198 class ObjectStore(object):
   268 class ObjectStore(object):
   199     """Store objects in memory for faster testing. Will not
   269     """Store objects in memory for *faster* validation (development mode)
   200     enforce the constraints of the schema and hence will miss
   270 
   201     some problems.
   271     But it will not enforce the constraints of the schema and hence will miss some problems
   202 
   272 
   203     >>> store = ObjectStore()
   273     >>> store = ObjectStore()
   204     >>> user = {'login': 'johndoe'}
   274     >>> user = {'login': 'johndoe'}
   205     >>> store.add('CWUser', user)
   275     >>> store.add('CWUser', user)
   206     >>> group = {'name': 'unknown'}
   276     >>> group = {'name': 'unknown'}
   207     >>> store.add('CWUser', group)
   277     >>> store.add('CWUser', group)
   208     >>> store.relate(user['eid'], 'in_group', group['eid'])
   278     >>> store.relate(user['eid'], 'in_group', group['eid'])
   209     """
   279     """
   210 
       
   211     def __init__(self):
   280     def __init__(self):
   212         self.items = []
   281         self.items = []
   213         self.eids = {}
   282         self.eids = {}
   214         self.types = {}
   283         self.types = {}
   215         self.relations = set()
   284         self.relations = set()
   226         eid = item['eid'] = self._put(type, item)
   295         eid = item['eid'] = self._put(type, item)
   227         self.eids[eid] = item
   296         self.eids[eid] = item
   228         self.types.setdefault(type, []).append(eid)
   297         self.types.setdefault(type, []).append(eid)
   229 
   298 
   230     def relate(self, eid_from, rtype, eid_to):
   299     def relate(self, eid_from, rtype, eid_to):
   231         eids_valid = (eid_from < len(self.items) and eid_to <= len(self.items))
   300         """Add new relation (reverse type support is available)
   232         assert eids_valid, 'eid error %s %s' % (eid_from, eid_to)
   301 
   233         self.relations.add( (eid_from, rtype, eid_to) )
    302         >>> eid_from, eid_to = 1, 2
   234 
   303         >>> self.relate(eid_from, 'in_group', eid_to)
   235     def build_index(self, name, type, func):
   304         1, 'in_group', 2
       
   305         >>> self.relate(eid_from, 'reverse_in_group', eid_to)
       
   306         2, 'in_group', 1
       
   307         """
       
   308         if rtype.startswith('reverse_'):
       
   309             eid_from, eid_to = eid_to, eid_from
       
   310             rtype = rtype[8:]
       
   311         relation = eid_from, rtype, eid_to
       
   312         self.relations.add(relation)
       
   313         return relation
       
   314 
       
   315     def build_index(self, name, type, func=None):
   236         index = {}
   316         index = {}
       
   317         if func is None or not callable(func):
       
   318             func = lambda x: x['eid']
   237         for eid in self.types[type]:
   319         for eid in self.types[type]:
   238             index.setdefault(func(self.eids[eid]), []).append(eid)
   320             index.setdefault(func(self.eids[eid]), []).append(eid)
       
   321         assert index, "new index '%s' cannot be empty" % name
   239         self.indexes[name] = index
   322         self.indexes[name] = index
   240 
   323 
       
   324     def build_rqlindex(self, name, type, key, rql, rql_params=False, func=None):
       
   325         """build an index by rql query
       
   326 
       
   327         rql should return eid in first column
       
    328         ctl.store.build_rqlindex('index_name', 'users', 'login', 'Any U WHERE U is CWUser')
       
   329         """
       
   330         rset = self.rql(rql, rql_params or {})
       
   331         for entity in rset.entities():
       
   332             getattr(entity, key) # autopopulate entity with key attribute
       
   333             self.eids[entity.eid] = dict(entity)
       
   334             if entity.eid not in self.types.setdefault(type, []):
       
   335                 self.types[type].append(entity.eid)
       
   336         assert self.types[type], "new index type '%s' cannot be empty (0 record found)" % type
       
   337 
       
   338         # Build index with specified key
       
   339         func = lambda x: x[key]
       
   340         self.build_index(name, type, func)
       
   341 
       
   342     @deprecated('get_many() deprecated. Use fetch() instead')
   241     def get_many(self, name, key):
   343     def get_many(self, name, key):
   242         return self.indexes[name].get(key, [])
   344         return self.fetch(name, key, unique=False)
   243 
   345 
       
   346     @deprecated('get_one() deprecated. Use fetch(..., unique=True) instead')
   244     def get_one(self, name, key):
   347     def get_one(self, name, key):
       
   348         return self.fetch(name, key, unique=True)
       
   349 
       
   350     def fetch(self, name, key, unique=False, decorator=None):
       
   351         """
       
   352             decorator is a callable method or an iterator of callable methods (usually a lambda function)
       
   353             decorator=lambda x: x[:1] (first value is returned)
       
   354 
       
   355             We can use validation check function available in _entity
       
   356         """
   245         eids = self.indexes[name].get(key, [])
   357         eids = self.indexes[name].get(key, [])
   246         assert len(eids) == 1, 'expected a single one got %i' % len(eids)
   358         if decorator is not None:
   247         return eids[0]
   359             if not hasattr(decorator, '__iter__'):
       
   360                 decorator = (decorator,)
       
   361             for f in decorator:
       
   362                 eids = f(eids)
       
   363         if unique:
       
   364             assert len(eids) == 1, u'expected a single one value for key "%s" in index "%s". Got %i' % (key, name, len(eids))
       
   365             eids = eids[0] # FIXME maybe it's better to keep an iterator here ?
       
   366         return eids
   248 
   367 
   249     def find(self, type, key, value):
   368     def find(self, type, key, value):
   250         for idx in self.types[type]:
   369         for idx in self.types[type]:
   251             item = self.items[idx]
   370             item = self.items[idx]
   252             if item[key] == value:
   371             if item[key] == value:
   253                 yield item
   372                 yield item
   254 
   373 
       
   374     def rql(self, *args):
       
   375         if self._rql is not None:
       
   376             return self._rql(*args)
       
   377 
   255     def checkpoint(self):
   378     def checkpoint(self):
   256         pass
   379         pass
   257 
   380 
   258 
   381 
   259 class RQLObjectStore(ObjectStore):
   382 class RQLObjectStore(ObjectStore):
   260     """ObjectStore that works with an actual RQL repository."""
   383     """ObjectStore that works with an actual RQL repository (production mode)"""
   261     _rql = None # bw compat
   384     _rql = None # bw compat
   262 
   385 
   263     def __init__(self, session=None, checkpoint=None):
   386     def __init__(self, session=None, checkpoint=None):
   264         ObjectStore.__init__(self)
   387         ObjectStore.__init__(self)
   265         if session is not None:
   388         if session is not None:
   290     def _put(self, type, item):
   413     def _put(self, type, item):
   291         query = ('INSERT %s X: ' % type) + ', '.join(['X %s %%(%s)s' % (key,key) for key in item])
   414         query = ('INSERT %s X: ' % type) + ', '.join(['X %s %%(%s)s' % (key,key) for key in item])
   292         return self.rql(query, item)[0][0]
   415         return self.rql(query, item)[0][0]
   293 
   416 
   294     def relate(self, eid_from, rtype, eid_to):
   417     def relate(self, eid_from, rtype, eid_to):
       
   418         # if reverse relation is found, eids are exchanged
       
   419         eid_from, rtype, eid_to = super(RQLObjectStore, self).relate(eid_from, rtype, eid_to)
   295         self.rql('SET X %s Y WHERE X eid %%(x)s, Y eid %%(y)s' % rtype,
   420         self.rql('SET X %s Y WHERE X eid %%(x)s, Y eid %%(y)s' % rtype,
   296                   {'x': int(eid_from), 'y': int(eid_to)}, ('x', 'y'))
   421                   {'x': int(eid_from), 'y': int(eid_to)}, ('x', 'y'))
   297         self.relations.add( (eid_from, rtype, eid_to) )
       
   298 
   422 
   299 
   423 
   300 # the import controller ########################################################
   424 # the import controller ########################################################
   301 
   425 
   302 class CWImportController(object):
   426 class CWImportController(object):
   306     >>> ctl.generators = list_of_data_generators
   430     >>> ctl.generators = list_of_data_generators
   307     >>> ctl.data = dict_of_data_tables
   431     >>> ctl.data = dict_of_data_tables
   308     >>> ctl.run()
   432     >>> ctl.run()
   309     """
   433     """
   310 
   434 
   311     def __init__(self, store, askerror=False, catcherrors=None, tell=tell,
   435     def __init__(self, store, askerror=0, catcherrors=None, tell=tell,
   312                  commitevery=50):
   436                  commitevery=50):
   313         self.store = store
   437         self.store = store
   314         self.generators = None
   438         self.generators = None
   315         self.data = {}
   439         self.data = {}
   316         self.errors = None
   440         self.errors = None
   348     def run(self):
   472     def run(self):
   349         self.errors = {}
   473         self.errors = {}
   350         for func, checks in self.generators:
   474         for func, checks in self.generators:
   351             self._checks = {}
   475             self._checks = {}
   352             func_name = func.__name__[4:]  # XXX
   476             func_name = func.__name__[4:]  # XXX
   353             self.tell('Importing %s' % func_name)
   477             self.tell("Import '%s'..." % func_name)
   354             try:
   478             try:
   355                 func(self)
   479                 func(self)
   356             except:
   480             except:
   357                 if self.catcherrors:
   481                 if self.catcherrors:
   358                     self.record_error(func_name, 'While calling %s' % func.__name__)
   482                     self.record_error(func_name, 'While calling %s' % func.__name__)
   363                 if buckets:
   487                 if buckets:
   364                     err = func(buckets)
   488                     err = func(buckets)
   365                     if err:
   489                     if err:
   366                         self.errors[title] = (help, err)
   490                         self.errors[title] = (help, err)
   367         self.store.checkpoint()
   491         self.store.checkpoint()
   368         self.tell('\nImport completed: %i entities (%i types), %i relations'
   492         nberrors = sum(len(err[1]) for err in self.errors.values())
       
   493         self.tell('\nImport completed: %i entities, %i types, %i relations and %i errors'
   369                   % (len(self.store.eids), len(self.store.types),
   494                   % (len(self.store.eids), len(self.store.types),
   370                      len(self.store.relations)))
   495                      len(self.store.relations), nberrors))
   371         nberrors = sum(len(err[1]) for err in self.errors.values())
   496         if self.errors:
   372         if nberrors:
   497             if self.askerror==2 or (self.askerror and confirm('Display errors ?')):
   373             print '%s errors' % nberrors
   498                 from pprint import pformat
   374         if self.errors and self.askerror and confirm('Display errors?'):
   499                 for errkey, error in self.errors.items():
   375             import pprint
   500                     self.tell("\n%s (%s): %d\n" % (error[0], errkey, len(error[1])))
   376             pprint.pprint(self.errors)
   501                     self.tell(pformat(sorted(error[1])))
   377 
   502 
   378     def get_data(self, key):
   503     def get_data(self, key):
   379         return self.data.get(key)
   504         return self.data.get(key)
   380 
   505 
   381     def index(self, name, key, value):
   506     def index(self, name, key, value, unique=False):
       
   507         """create a new index
       
   508 
       
    509         If unique is set to True, only the first occurrence will be kept, not the following ones
       
   510         """
       
   511         if unique:
       
   512             try:
       
   513                 if value in self.store.indexes[name][key]:
       
   514                     return
       
   515             except KeyError:
       
    516                 # we're sure that one is the first occurrence; so continue...
       
   517                 pass
   382         self.store.indexes.setdefault(name, {}).setdefault(key, []).append(value)
   518         self.store.indexes.setdefault(name, {}).setdefault(key, []).append(value)
   383 
   519 
   384     def tell(self, msg):
   520     def tell(self, msg):
   385         self._tell(msg)
   521         self._tell(msg)
   386 
   522