dataimport.py
changeset 4912 9767cc516b4f
parent 4847 9466604ef448
child 4913 083b4d454192
equal deleted inserted replaced
4908:b3ad329cbe17 4912:9767cc516b4f
    36          ]
    36          ]
    37 
    37 
    38   GENERATORS.append( (gen_users, CHK) )
    38   GENERATORS.append( (gen_users, CHK) )
    39 
    39 
    40   # create controller
    40   # create controller
    41   ctl = CWImportController(RQLObjectStore())
    41   ctl = CWImportController(RQLObjectStore(cnx))
    42   ctl.askerror = 1
    42   ctl.askerror = 1
    43   ctl.generators = GENERATORS
    43   ctl.generators = GENERATORS
    44   ctl.store._checkpoint = checkpoint
       
    45   ctl.store._rql = rql
       
    46   ctl.data['utilisateurs'] = lazytable(utf8csvreader(open('users.csv')))
    44   ctl.data['utilisateurs'] = lazytable(utf8csvreader(open('users.csv')))
    47   # run
    45   # run
    48   ctl.run()
    46   ctl.run()
    49   sys.exit(0)
    47 
    50 
    48 .. BUG file with one column are not parsable
    51 
    49 .. TODO rollback() invocation is not possible yet
    52 .. BUG fichier à une colonne pose un problème de parsing
       
    53 .. TODO rollback()
       
    54 """
    50 """
    55 __docformat__ = "restructuredtext en"
    51 __docformat__ = "restructuredtext en"
    56 
    52 
    57 import sys
    53 import sys
    58 import csv
    54 import csv
    96 
    92 
    97 def commit_every(nbit, store, it):
    93 def commit_every(nbit, store, it):
    98     for i, x in enumerate(it):
    94     for i, x in enumerate(it):
    99         yield x
    95         yield x
   100         if nbit is not None and i % nbit:
    96         if nbit is not None and i % nbit:
   101             store.checkpoint()
    97             store.commit()
   102     if nbit is not None:
    98     if nbit is not None:
   103         store.checkpoint()
    99         store.commit()
   104 
   100 
   105 def lazytable(reader):
   101 def lazytable(reader):
   106     """The first row is taken to be the header of the table and
   102     """The first row is taken to be the header of the table and
   107     used to output a dict for each row of data.
   103     used to output a dict for each row of data.
   108 
   104 
   113         yield dict(zip(header, row))
   109         yield dict(zip(header, row))
   114 
   110 
   115 def mk_entity(row, map):
   111 def mk_entity(row, map):
   116     """Return a dict made from sanitized mapped values.
   112     """Return a dict made from sanitized mapped values.
   117 
   113 
   118     ValidationError can be raised on unexpected values found in checkers
   114     ValueError can be raised on unexpected values found in checkers
   119 
   115 
   120     >>> row = {'myname': u'dupont'}
   116     >>> row = {'myname': u'dupont'}
   121     >>> map = [('myname', u'name', (capitalize_if_unicase,))]
   117     >>> map = [('myname', u'name', (call_transform_method('title'),))]
   122     >>> mk_entity(row, map)
   118     >>> mk_entity(row, map)
   123     {'name': u'Dupont'}
   119     {'name': u'Dupont'}
   124     >>> row = {'myname': u'dupont', 'optname': u''}
   120     >>> row = {'myname': u'dupont', 'optname': u''}
   125     >>> map = [('myname', u'name', (capitalize_if_unicase,)),
   121     >>> map = [('myname', u'name', (call_transform_method('title'),)),
   126     ...        ('optname', u'MARKER', (optional,))]
   122     ...        ('optname', u'MARKER', (optional,))]
   127     >>> mk_entity(row, map)
   123     >>> mk_entity(row, map)
   128     {'name': u'Dupont'}
   124     {'name': u'Dupont', 'optname': None}
   129     """
   125     """
   130     res = {}
   126     res = {}
   131     assert isinstance(row, dict)
   127     assert isinstance(row, dict)
   132     assert isinstance(map, list)
   128     assert isinstance(map, list)
   133     for src, dest, funcs in map:
   129     for src, dest, funcs in map:
   134         assert not (required in funcs and optional in funcs), \
       
   135                "optional and required checks are exclusive"
       
   136         res[dest] = row[src]
   130         res[dest] = row[src]
   137         try:
   131         try:
   138             for func in funcs:
   132             for func in funcs:
   139                 res[dest] = func(res[dest])
   133                 res[dest] = func(res[dest])
   140                 if res[dest] is None:
   134                 if res[dest] is None:
   178 
   172 
   179 
   173 
   180 # base sanitizing/coercing functions ###########################################
   174 # base sanitizing/coercing functions ###########################################
   181 
   175 
   182 def optional(value):
   176 def optional(value):
   183     """validation error will not been raised if you add this checker in chain"""
   177     """checker to filter optional field
       
   178 
       
   179     If value is undefined (ex: empty string), return None that will
       
   180     break the checkers validation chain
       
   181 
       
   182     General use is to add 'optional' check in first condition to avoid
       
   183     ValueError by further checkers
       
   184 
       
   185     >>> MAPPER = [(u'value', 'value', (optional, int))]
       
   186     >>> row = {'value': u'XXX'}
       
   187     >>> mk_entity(row, MAPPER)
       
   188     {'value': None}
       
   189     >>> row = {'value': u'100'}
       
   190     >>> mk_entity(row, MAPPER)
       
   191     {'value': 100}
       
   192     """
   184     if value:
   193     if value:
   185         return value
   194         return value
   186     return None
   195     return None
   187 
   196 
   188 def required(value):
   197 def required(value):
   252         self.eids = {}
   261         self.eids = {}
   253         self.types = {}
   262         self.types = {}
   254         self.relations = set()
   263         self.relations = set()
   255         self.indexes = {}
   264         self.indexes = {}
   256         self._rql = None
   265         self._rql = None
   257         self._checkpoint = None
   266         self._commit = None
   258 
   267 
   259     def _put(self, type, item):
   268     def _put(self, type, item):
   260         self.items.append(item)
   269         self.items.append(item)
   261         return len(self.items) - 1
   270         return len(self.items) - 1
   262 
   271 
   265         eid = item['eid'] = self._put(type, item)
   274         eid = item['eid'] = self._put(type, item)
   266         self.eids[eid] = item
   275         self.eids[eid] = item
   267         self.types.setdefault(type, []).append(eid)
   276         self.types.setdefault(type, []).append(eid)
   268 
   277 
   269     def relate(self, eid_from, rtype, eid_to, inlined=False):
   278     def relate(self, eid_from, rtype, eid_to, inlined=False):
   270         """Add new relation (reverse type support is available)
   279         """Add new relation"""
   271 
       
   272         >>> 1,2 = eid_from, eid_to
       
   273         >>> self.relate(eid_from, 'in_group', eid_to)
       
   274         1, 'in_group', 2
       
   275         >>> self.relate(eid_from, 'reverse_in_group', eid_to)
       
   276         2, 'in_group', 1
       
   277         """
       
   278         if rtype.startswith('reverse_'):
       
   279             eid_from, eid_to = eid_to, eid_from
       
   280             rtype = rtype[8:]
       
   281         relation = eid_from, rtype, eid_to
   280         relation = eid_from, rtype, eid_to
   282         self.relations.add(relation)
   281         self.relations.add(relation)
   283         return relation
   282         return relation
   284 
   283 
   285     def build_index(self, name, type, func=None):
   284     def commit(self):
       
   285         """this commit method do nothing by default
       
   286 
       
   287         This is voluntary to use the frequent autocommit feature in CubicWeb
       
   288         when you are using hooks or another
       
   289 
       
   290         If you want override commit method, please set it by the
       
   291         constructor
       
   292         """
       
   293         pass
       
   294 
       
   295     def rql(self, *args):
       
   296         if self._rql is not None:
       
   297             return self._rql(*args)
       
   298 
       
   299     @property
       
   300     def nb_inserted_entities(self):
       
   301         return len(self.eids)
       
   302     @property
       
   303     def nb_inserted_types(self):
       
   304         return len(self.types)
       
   305     @property
       
   306     def nb_inserted_relations(self):
       
   307         return len(self.relations)
       
   308 
       
   309     @deprecated("[3.7] index support will disappear")
       
   310     def build_index(self, name, type, func=None, can_be_empty=False):
       
   311         """build internal index for further search"""
   286         index = {}
   312         index = {}
   287         if func is None or not callable(func):
   313         if func is None or not callable(func):
   288             func = lambda x: x['eid']
   314             func = lambda x: x['eid']
   289         for eid in self.types[type]:
   315         for eid in self.types[type]:
   290             index.setdefault(func(self.eids[eid]), []).append(eid)
   316             index.setdefault(func(self.eids[eid]), []).append(eid)
   291         assert index, "new index '%s' cannot be empty" % name
   317         if not can_be_empty:
       
   318             assert index, "new index '%s' cannot be empty" % name
   292         self.indexes[name] = index
   319         self.indexes[name] = index
   293 
   320 
   294     def build_rqlindex(self, name, type, key, rql, rql_params=False, func=None):
   321     @deprecated("[3.7] index support will disappear")
       
   322     def build_rqlindex(self, name, type, key, rql, rql_params=False,
       
   323                        func=None, can_be_empty=False):
   295         """build an index by rql query
   324         """build an index by rql query
   296 
   325 
   297         rql should return eid in first column
   326         rql should return eid in first column
   298         ctl.store.build_index('index_name', 'users', 'login', 'Any U WHERE U is CWUser')
   327         ctl.store.build_index('index_name', 'users', 'login', 'Any U WHERE U is CWUser')
   299         """
   328         """
       
   329         self.types[type] = []
   300         rset = self.rql(rql, rql_params or {})
   330         rset = self.rql(rql, rql_params or {})
       
   331         if not can_be_empty:
       
   332             assert rset, "new index type '%s' cannot be empty (0 record found)" % type
   301         for entity in rset.entities():
   333         for entity in rset.entities():
   302             getattr(entity, key) # autopopulate entity with key attribute
   334             getattr(entity, key) # autopopulate entity with key attribute
   303             self.eids[entity.eid] = dict(entity)
   335             self.eids[entity.eid] = dict(entity)
   304             if entity.eid not in self.types.setdefault(type, []):
   336             if entity.eid not in self.types[type]:
   305                 self.types[type].append(entity.eid)
   337                 self.types[type].append(entity.eid)
   306         assert self.types[type], "new index type '%s' cannot be empty (0 record found)" % type
       
   307 
   338 
   308         # Build index with specified key
   339         # Build index with specified key
   309         func = lambda x: x[key]
   340         func = lambda x: x[key]
   310         self.build_index(name, type, func)
   341         self.build_index(name, type, func, can_be_empty=can_be_empty)
   311 
   342 
       
   343     @deprecated("[3.7] index support will disappear")
   312     def fetch(self, name, key, unique=False, decorator=None):
   344     def fetch(self, name, key, unique=False, decorator=None):
   313         """
   345         """index fetcher method
   314             decorator is a callable method or an iterator of callable methods (usually a lambda function)
   346 
   315             decorator=lambda x: x[:1] (first value is returned)
   347         decorator is a callable method or an iterator of callable methods (usually a lambda function)
   316 
   348         decorator=lambda x: x[:1] (first value is returned)
   317             We can use validation check function available in _entity
   349         decorator=lambda x: x.lower (lowercased value is returned)
       
   350 
       
   351         decorator is handy when you want to improve index keys but without
       
   352         changing the original field
       
   353 
       
   354         Same check functions can be reused here.
   318         """
   355         """
   319         eids = self.indexes[name].get(key, [])
   356         eids = self.indexes[name].get(key, [])
   320         if decorator is not None:
   357         if decorator is not None:
   321             if not hasattr(decorator, '__iter__'):
   358             if not hasattr(decorator, '__iter__'):
   322                 decorator = (decorator,)
   359                 decorator = (decorator,)
   323             for f in decorator:
   360             for f in decorator:
   324                 eids = f(eids)
   361                 eids = f(eids)
   325         if unique:
   362         if unique:
   326             assert len(eids) == 1, u'expected a single one value for key "%s" in index "%s". Got %i' % (key, name, len(eids))
   363             assert len(eids) == 1, u'expected a single one value for key "%s" in index "%s". Got %i' % (key, name, len(eids))
   327             eids = eids[0] # FIXME maybe it's better to keep an iterator here ?
   364             eids = eids[0]
   328         return eids
   365         return eids
   329 
   366 
       
   367     @deprecated("[3.7] index support will disappear")
   330     def find(self, type, key, value):
   368     def find(self, type, key, value):
   331         for idx in self.types[type]:
   369         for idx in self.types[type]:
   332             item = self.items[idx]
   370             item = self.items[idx]
   333             if item[key] == value:
   371             if item[key] == value:
   334                 yield item
   372                 yield item
   335 
   373 
   336     def rql(self, *args):
   374     @deprecated("[3.7] checkpoint() deprecated. use commit() instead")
   337         if self._rql is not None:
       
   338             return self._rql(*args)
       
   339 
       
   340     def checkpoint(self):
   375     def checkpoint(self):
   341         pass
   376         self.commit()
   342 
       
   343     @property
       
   344     def nb_inserted_entities(self):
       
   345         return len(self.eids)
       
   346     @property
       
   347     def nb_inserted_types(self):
       
   348         return len(self.types)
       
   349     @property
       
   350     def nb_inserted_relations(self):
       
   351         return len(self.relations)
       
   352 
       
   353     @deprecated('[3.6] get_many() deprecated. Use fetch() instead')
       
   354     def get_many(self, name, key):
       
   355         return self.fetch(name, key, unique=False)
       
   356 
       
   357     @deprecated('[3.6] get_one() deprecated. Use fetch(..., unique=True) instead')
       
   358     def get_one(self, name, key):
       
   359         return self.fetch(name, key, unique=True)
       
   360 
   377 
   361 
   378 
   362 class RQLObjectStore(ObjectStore):
   379 class RQLObjectStore(ObjectStore):
   363     """ObjectStore that works with an actual RQL repository (production mode)"""
   380     """ObjectStore that works with an actual RQL repository (production mode)"""
   364     _rql = None # bw compat
   381     _rql = None # bw compat
   365 
   382 
   366     def __init__(self, session=None, checkpoint=None):
   383     def __init__(self, session=None, commit=None):
   367         ObjectStore.__init__(self)
   384         ObjectStore.__init__(self)
   368         if session is not None:
   385         if session is not None:
   369             if not hasattr(session, 'set_pool'):
   386             if not hasattr(session, 'set_pool'):
   370                 # connection
   387                 # connection
   371                 cnx = session
   388                 cnx = session
   372                 session = session.request()
   389                 session = session.request()
   373                 session.set_pool = lambda : None
   390                 session.set_pool = lambda : None
   374                 checkpoint = checkpoint or cnx.commit
   391                 commit = commit or cnx.commit
   375             else:
   392             else:
   376                 session.set_pool()
   393                 session.set_pool()
   377             self.session = session
   394             self.session = session
   378             self._checkpoint = checkpoint or session.commit
   395             self._commit = commit or session.commit
   379         elif checkpoint is not None:
   396         elif commit is not None:
   380             self._checkpoint = checkpoint
   397             self._commit = commit
   381             # XXX .session
   398             # XXX .session
   382 
   399 
       
   400     @deprecated("[3.7] checkpoint() deprecated. use commit() instead")
   383     def checkpoint(self):
   401     def checkpoint(self):
   384         self._checkpoint()
   402         self.commit()
       
   403 
       
   404     def commit(self):
       
   405         self._commit()
   385         self.session.set_pool()
   406         self.session.set_pool()
   386 
   407 
   387     def rql(self, *args):
   408     def rql(self, *args):
   388         if self._rql is not None:
   409         if self._rql is not None:
   389             return self._rql(*args)
   410             return self._rql(*args)
   399         query = ('INSERT %s X: ' % type) + ', '.join('X %s %%(%s)s' % (k, k)
   420         query = ('INSERT %s X: ' % type) + ', '.join('X %s %%(%s)s' % (k, k)
   400                                                      for k in item)
   421                                                      for k in item)
   401         return self.rql(query, item)[0][0]
   422         return self.rql(query, item)[0][0]
   402 
   423 
   403     def relate(self, eid_from, rtype, eid_to, inlined=False):
   424     def relate(self, eid_from, rtype, eid_to, inlined=False):
   404         # if reverse relation is found, eids are exchanged
       
   405         eid_from, rtype, eid_to = super(RQLObjectStore, self).relate(
   425         eid_from, rtype, eid_to = super(RQLObjectStore, self).relate(
   406             eid_from, rtype, eid_to)
   426             eid_from, rtype, eid_to)
   407         self.rql('SET X %s Y WHERE X eid %%(x)s, Y eid %%(y)s' % rtype,
   427         self.rql('SET X %s Y WHERE X eid %%(x)s, Y eid %%(y)s' % rtype,
   408                   {'x': int(eid_from), 'y': int(eid_to)}, ('x', 'y'))
   428                   {'x': int(eid_from), 'y': int(eid_to)}, ('x', 'y'))
   409 
   429 
   458 
   478 
   459     def run(self):
   479     def run(self):
   460         self.errors = {}
   480         self.errors = {}
   461         for func, checks in self.generators:
   481         for func, checks in self.generators:
   462             self._checks = {}
   482             self._checks = {}
   463             func_name = func.__name__[4:]  # XXX
   483             func_name = func.__name__
   464             self.tell("Import '%s'..." % func_name)
   484             self.tell("Run import function '%s'..." % func_name)
   465             try:
   485             try:
   466                 func(self)
   486                 func(self)
   467             except:
   487             except:
   468                 if self.catcherrors:
   488                 if self.catcherrors:
   469                     self.record_error(func_name, 'While calling %s' % func.__name__)
   489                     self.record_error(func_name, 'While calling %s' % func.__name__)
   470                 else:
   490                 else:
       
   491                     self._print_stats()
   471                     raise
   492                     raise
   472             for key, func, title, help in checks:
   493             for key, func, title, help in checks:
   473                 buckets = self._checks.get(key)
   494                 buckets = self._checks.get(key)
   474                 if buckets:
   495                 if buckets:
   475                     err = func(buckets)
   496                     err = func(buckets)
   476                     if err:
   497                     if err:
   477                         self.errors[title] = (help, err)
   498                         self.errors[title] = (help, err)
   478         self.store.checkpoint()
   499         self.store.commit()
   479         nberrors = sum(len(err[1]) for err in self.errors.values())
   500         self._print_stats()
   480         self.tell('\nImport completed: %i entities, %i types, %i relations and %i errors'
       
   481                   % (self.store.nb_inserted_entities,
       
   482                      self.store.nb_inserted_types,
       
   483                      self.store.nb_inserted_relations,
       
   484                      nberrors))
       
   485         if self.errors:
   501         if self.errors:
   486             if self.askerror == 2 or (self.askerror and confirm('Display errors ?')):
   502             if self.askerror == 2 or (self.askerror and confirm('Display errors ?')):
   487                 from pprint import pformat
   503                 from pprint import pformat
   488                 for errkey, error in self.errors.items():
   504                 for errkey, error in self.errors.items():
   489                     self.tell("\n%s (%s): %d\n" % (error[0], errkey, len(error[1])))
   505                     self.tell("\n%s (%s): %d\n" % (error[0], errkey, len(error[1])))
   490                     self.tell(pformat(sorted(error[1])))
   506                     self.tell(pformat(sorted(error[1])))
       
   507 
       
   508     def _print_stats(self):
       
   509         nberrors = sum(len(err[1]) for err in self.errors.values())
       
   510         self.tell('\nImport statistics: %i entities, %i types, %i relations and %i errors'
       
   511                   % (self.store.nb_inserted_entities,
       
   512                      self.store.nb_inserted_types,
       
   513                      self.store.nb_inserted_relations,
       
   514                      nberrors))
   491 
   515 
   492     def get_data(self, key):
   516     def get_data(self, key):
   493         return self.data.get(key)
   517         return self.data.get(key)
   494 
   518 
   495     def index(self, name, key, value, unique=False):
   519     def index(self, name, key, value, unique=False):
   665         from cubicweb.hooks.metadata import eschema_eid
   689         from cubicweb.hooks.metadata import eschema_eid
   666         eids = []
   690         eids = []
   667         for eschema in entity.e_schema.ancestors() + [entity.e_schema]:
   691         for eschema in entity.e_schema.ancestors() + [entity.e_schema]:
   668             eids.append(eschema_eid(self.session, eschema))
   692             eids.append(eschema_eid(self.session, eschema))
   669         return eids
   693         return eids
   670 
       
   671 
       
   672 ################################################################################
       
   673 
       
   674 utf8csvreader = deprecated('[3.6] use ucsvreader instead')(ucsvreader)
       
   675 
       
   676 @deprecated('[3.6] use required')
       
   677 def nonempty(value):
       
   678     return required(value)
       
   679 
       
   680 @deprecated("[3.6] use call_check_method('isdigit')")
       
   681 def alldigits(txt):
       
   682     if txt.isdigit():
       
   683         return txt
       
   684     else:
       
   685         return u''
       
   686 
       
   687 @deprecated("[3.7] too specific, will move away, copy me")
       
   688 def capitalize_if_unicase(txt):
       
   689     if txt.isupper() or txt.islower():
       
   690         return txt.capitalize()
       
   691     return txt
       
   692 
       
   693 @deprecated("[3.7] too specific, will move away, copy me")
       
   694 def yesno(value):
       
   695     """simple heuristic that returns boolean value
       
   696 
       
   697     >>> yesno("Yes")
       
   698     True
       
   699     >>> yesno("oui")
       
   700     True
       
   701     >>> yesno("1")
       
   702     True
       
   703     >>> yesno("11")
       
   704     True
       
   705     >>> yesno("")
       
   706     False
       
   707     >>> yesno("Non")
       
   708     False
       
   709     >>> yesno("blablabla")
       
   710     False
       
   711     """
       
   712     if value:
       
   713         return value.lower()[0] in 'yo1'
       
   714     return False
       
   715 
       
   716 @deprecated("[3.7] use call_check_method('isalpha')")
       
   717 def isalpha(value):
       
   718     if value.isalpha():
       
   719         return value
       
   720     raise ValueError("not all characters in the string alphabetic")
       
   721 
       
   722 @deprecated("[3.7] use call_transform_method('upper')")
       
   723 def uppercase(txt):
       
   724     return txt.upper()
       
   725 
       
   726 @deprecated("[3.7] use call_transform_method('lower')")
       
   727 def lowercase(txt):
       
   728     return txt.lower()
       
   729 
       
   730 @deprecated("[3.7] use call_transform_method('replace', ' ', '')")
       
   731 def no_space(txt):
       
   732     return txt.replace(' ','')
       
   733 
       
   734 @deprecated("[3.7] use call_transform_method('replace', u'\xa0', '')")
       
   735 def no_uspace(txt):
       
   736     return txt.replace(u'\xa0','')
       
   737 
       
   738 @deprecated("[3.7] use call_transform_method('replace', '-', '')")
       
   739 def no_dash(txt):
       
   740     return txt.replace('-','')
       
   741 
       
   742 @deprecated("[3.7] use call_transform_method('strip')")
       
   743 def strip(txt):
       
   744     return txt.strip()
       
   745 
       
   746 @deprecated("[3.7] use call_transform_method('replace', ',', '.'), float")
       
   747 def decimal(value):
       
   748     return comma_float(value)
       
   749 
       
   750 @deprecated('[3.7] use int builtin')
       
   751 def integer(value):
       
   752     return int(value)