le patch massiveimport a été importé stable
authorSylvain Thénault <sylvain.thenault@logilab.fr>
Fri, 05 Mar 2010 18:07:39 +0100
branchstable
changeset 4818 9f9bfbcdecfd
parent 4816 c02583cb80a9
child 4822 ededce6779b5
le patch massiveimport a été importé
devtools/dataimport.py
server/sources/native.py
--- a/devtools/dataimport.py	Fri Mar 05 17:24:01 2010 +0100
+++ b/devtools/dataimport.py	Fri Mar 05 18:07:39 2010 +0100
@@ -59,10 +59,14 @@
 import traceback
 import os.path as osp
 from StringIO import StringIO
+from copy import copy
 
 from logilab.common import shellutils
+from logilab.common.date import strptime
+from logilab.common.decorators import cached
 from logilab.common.deprecation import deprecated
 
+
 def ucsvreader_pb(filepath, encoding='utf-8', separator=',', quote='"',
                   skipfirst=False, withpb=True):
     """same as ucsvreader but a progress bar is displayed as we iter on rows"""
@@ -90,8 +94,6 @@
     for row in it:
         yield [item.decode(encoding) for item in row]
 
-utf8csvreader = deprecated('use ucsvreader instead')(ucsvreader)
-
 def commit_every(nbit, store, it):
     for i, x in enumerate(it):
         yield x
@@ -129,19 +131,16 @@
     assert isinstance(row, dict)
     assert isinstance(map, list)
     for src, dest, funcs in map:
-        assert not (required in funcs and optional in funcs), "optional and required checks are exclusive"
+        assert not (required in funcs and optional in funcs), \
+               "optional and required checks are exclusive"
         res[dest] = row[src]
         try:
             for func in funcs:
                 res[dest] = func(res[dest])
-            if res[dest] is None:
-                raise AssertionError('undetermined value')
-        except AssertionError, err:
-            if optional in funcs:
-                # Forget this field if exception is coming from optional function
-               del res[dest]
-            else:
-               raise AssertionError('error with "%s" field: %s' % (src, err))
+                if res[dest] is None:
+                    break
+        except ValueError, err:
+            raise ValueError('error with %r field: %s' % (src, err))
     return res
 
 
@@ -178,98 +177,49 @@
                 return True # silent
 
 
-# base sanitizing functions ####################################################
-
-def capitalize_if_unicase(txt):
-    if txt.isupper() or txt.islower():
-        return txt.capitalize()
-    return txt
-
-def uppercase(txt):
-    return txt.upper()
-
-def lowercase(txt):
-    return txt.lower()
-
-def no_space(txt):
-    return txt.replace(' ','')
-
-def no_uspace(txt):
-    return txt.replace(u'\xa0','')
-
-def no_dash(txt):
-    return txt.replace('-','')
-
-def decimal(value):
-    """cast to float but with comma replacement
-
-    We take care of some locale format as replacing ',' by '.'"""
-    value = value.replace(',', '.')
-    try:
-        return float(value)
-    except Exception, err:
-        raise AssertionError(err)
-
-def integer(value):
-    try:
-        return int(value)
-    except Exception, err:
-        raise AssertionError(err)
-
-def strip(txt):
-    return txt.strip()
-
-def yesno(value):
-    """simple heuristic that returns boolean value
-
-    >>> yesno("Yes")
-    True
-    >>> yesno("oui")
-    True
-    >>> yesno("1")
-    True
-    >>> yesno("11")
-    True
-    >>> yesno("")
-    False
-    >>> yesno("Non")
-    False
-    >>> yesno("blablabla")
-    False
-    """
-    if value:
-        return value.lower()[0] in 'yo1'
-    return False
-
-def isalpha(value):
-    if value.isalpha():
-        return value
-    raise AssertionError("not all characters in the string alphabetic")
+# base sanitizing/coercing functions ###########################################
 
 def optional(value):
     """validation error will not been raised if you add this checker in chain"""
-    return value
+    if value:
+        return value
+    return None
 
 def required(value):
-    """raise AssertionError is value is empty
+    """raise ValueError is value is empty
 
     This check should be often found in last position in the chain.
     """
-    if bool(value):
+    if value:
         return value
-    raise AssertionError("required")
+    raise ValueError("required")
 
-@deprecated('use required(value)')
-def nonempty(value):
-    return required(value)
+def todatetime(format='%d/%m/%Y'):
+    """return a transformation function to turn string input value into a
+    `datetime.datetime` instance, using given format.
+
+    Follow it by `todate` or `totime` functions from `logilab.common.date` if
+    you want a `date`/`time` instance instead of `datetime`.
+    """
+    def coerce(value):
+        return strptime(value, format)
+    return coerce
 
-@deprecated('use integer(value)')
-def alldigits(txt):
-    if txt.isdigit():
-        return txt
-    else:
-        return u''
+def call_transform_method(methodname, *args, **kwargs):
+    """return value returned by calling the given method on input"""
+    def coerce(value):
+        return getattr(value, methodname)(*args, **kwargs)
+    return coerce
 
+def call_check_method(methodname, *args, **kwargs):
+    """check value returned by calling the given method on input is true,
+    else raise ValueError
+    """
+    def check(value):
+        if getattr(value, methodname)(*args, **kwargs):
+            return value
+        raise ValueError('%s not verified on %r' % (methodname, value))
+    return check
 
 # base integrity checking functions ############################################
 
@@ -316,7 +266,7 @@
         self.eids[eid] = item
         self.types.setdefault(type, []).append(eid)
 
-    def relate(self, eid_from, rtype, eid_to):
+    def relate(self, eid_from, rtype, eid_to, inlined=False):
         """Add new relation (reverse type support is available)
 
         >>> 1,2 = eid_from, eid_to
@@ -359,14 +309,6 @@
         func = lambda x: x[key]
         self.build_index(name, type, func)
 
-    @deprecated('get_many() deprecated. Use fetch() instead')
-    def get_many(self, name, key):
-        return self.fetch(name, key, unique=False)
-
-    @deprecated('get_one() deprecated. Use fetch(..., unique=True) instead')
-    def get_one(self, name, key):
-        return self.fetch(name, key, unique=True)
-
     def fetch(self, name, key, unique=False, decorator=None):
         """
             decorator is a callable method or an iterator of callable methods (usually a lambda function)
@@ -398,6 +340,24 @@
     def checkpoint(self):
         pass
 
+    @property
+    def nb_inserted_entities(self):
+        return len(self.eids)
+    @property
+    def nb_inserted_types(self):
+        return len(self.types)
+    @property
+    def nb_inserted_relations(self):
+        return len(self.relations)
+
+    @deprecated('[3.6] get_many() deprecated. Use fetch() instead')
+    def get_many(self, name, key):
+        return self.fetch(name, key, unique=False)
+
+    @deprecated('[3.6] get_one() deprecated. Use fetch(..., unique=True) instead')
+    def get_one(self, name, key):
+        return self.fetch(name, key, unique=True)
+
 
 class RQLObjectStore(ObjectStore):
     """ObjectStore that works with an actual RQL repository (production mode)"""
@@ -412,19 +372,24 @@
                 session = session.request()
                 session.set_pool = lambda : None
                 checkpoint = checkpoint or cnx.commit
+            else:
+                session.set_pool()
             self.session = session
-            self.checkpoint = checkpoint or session.commit
+            self._checkpoint = checkpoint or session.commit
         elif checkpoint is not None:
-            self.checkpoint = checkpoint
+            self._checkpoint = checkpoint
+            # XXX .session
+
+    def checkpoint(self):
+        self._checkpoint()
+        self.session.set_pool()
 
     def rql(self, *args):
         if self._rql is not None:
             return self._rql(*args)
-        self.session.set_pool()
         return self.session.execute(*args)
 
     def create_entity(self, *args, **kwargs):
-        self.session.set_pool()
         entity = self.session.create_entity(*args, **kwargs)
         self.eids[entity.eid] = entity
         self.types.setdefault(args[0], []).append(entity.eid)
@@ -435,7 +400,7 @@
                                                      for k in item)
         return self.rql(query, item)[0][0]
 
-    def relate(self, eid_from, rtype, eid_to):
+    def relate(self, eid_from, rtype, eid_to, inlined=False):
         # if reverse relation is found, eids are exchanged
         eid_from, rtype, eid_to = super(RQLObjectStore, self).relate(
             eid_from, rtype, eid_to)
@@ -513,8 +478,10 @@
         self.store.checkpoint()
         nberrors = sum(len(err[1]) for err in self.errors.values())
         self.tell('\nImport completed: %i entities, %i types, %i relations and %i errors'
-                  % (len(self.store.eids), len(self.store.types),
-                     len(self.store.relations), nberrors))
+                  % (self.store.nb_inserted_entities,
+                     self.store.nb_inserted_types,
+                     self.store.nb_inserted_relations,
+                     nberrors))
         if self.errors:
             if self.askerror == 2 or (self.askerror and confirm('Display errors ?')):
                 from pprint import pformat
@@ -545,3 +512,241 @@
     def iter_and_commit(self, datakey):
         """iter rows, triggering commit every self.commitevery iterations"""
         return commit_every(self.commitevery, self.store, self.get_data(datakey))
+
+
+
+from datetime import datetime
+from cubicweb.schema import META_RTYPES, VIRTUAL_RTYPES
+
+
+class NoHookRQLObjectStore(RQLObjectStore):
+    """ObjectStore that works with an actual RQL repository (production mode)"""
+    _rql = None # bw compat
+
+    def __init__(self, session, metagen=None, baseurl=None):
+        super(NoHookRQLObjectStore, self).__init__(session)
+        self.source = session.repo.system_source
+        self.rschema = session.repo.schema.rschema
+        self.add_relation = self.source.add_relation
+        if metagen is None:
+            metagen = MetaGenerator(session, baseurl)
+        self.metagen = metagen
+        self._nb_inserted_entities = 0
+        self._nb_inserted_types = 0
+        self._nb_inserted_relations = 0
+        self.rql = session.unsafe_execute
+
+    def create_entity(self, etype, **kwargs):
+        for k, v in kwargs.iteritems():
+            kwargs[k] = getattr(v, 'eid', v)
+        entity, rels = self.metagen.base_etype_dicts(etype)
+        entity = copy(entity)
+        entity._related_cache = {}
+        self.metagen.init_entity(entity)
+        entity.update(kwargs)
+        session = self.session
+        self.source.add_entity(session, entity)
+        self.source.add_info(session, entity, self.source, complete=False)
+        for rtype, targeteids in rels.iteritems():
+            # targeteids may be a single eid or a list of eids
+            inlined = self.rschema(rtype).inlined
+            try:
+                for targeteid in targeteids:
+                    self.add_relation(session, entity.eid, rtype, targeteid,
+                                      inlined)
+            except TypeError:
+                self.add_relation(session, entity.eid, rtype, targeteids,
+                                  inlined)
+        self._nb_inserted_entities += 1
+        return entity
+
+    def relate(self, eid_from, rtype, eid_to):
+        assert not rtype.startswith('reverse_')
+        self.add_relation(self.session, eid_from, rtype, eid_to,
+                          self.rschema(rtype).inlined)
+        self._nb_inserted_relations += 1
+
+    @property
+    def nb_inserted_entities(self):
+        return self._nb_inserted_entities
+    @property
+    def nb_inserted_types(self):
+        return self._nb_inserted_types
+    @property
+    def nb_inserted_relations(self):
+        return self._nb_inserted_relations
+
+    def _put(self, type, item):
+        raise RuntimeError('use create entity')
+
+
+class MetaGenerator(object):
+    def __init__(self, session, baseurl=None):
+        self.session = session
+        self.source = session.repo.system_source
+        self.time = datetime.now()
+        if baseurl is None:
+            config = session.vreg.config
+            baseurl = config['base-url'] or config.default_base_url()
+        if not baseurl[-1] == '/':
+            baseurl += '/'
+        self.baseurl =  baseurl
+        # attributes/relations shared by all entities of the same type
+        self.etype_attrs = []
+        self.etype_rels = []
+        # attributes/relations specific to each entity
+        self.entity_attrs = ['eid', 'cwuri']
+        #self.entity_rels = [] XXX not handled (YAGNI?)
+        schema = session.vreg.schema
+        rschema = schema.rschema
+        for rtype in META_RTYPES:
+            if rtype in ('eid', 'cwuri') or rtype in VIRTUAL_RTYPES:
+                continue
+            if rschema(rtype).final:
+                self.etype_attrs.append(rtype)
+            else:
+                self.etype_rels.append(rtype)
+        if not schema._eid_index:
+            # test schema loaded from the fs
+            self.gen_is = self.test_gen_is
+            self.gen_is_instance_of = self.test_gen_is_instanceof
+
+    @cached
+    def base_etype_dicts(self, etype):
+        entity = self.session.vreg['etypes'].etype_class(etype)(self.session)
+        # entity are "surface" copied, avoid shared dict between copies
+        del entity.cw_extra_kwargs
+        for attr in self.etype_attrs:
+            entity[attr] = self.generate(entity, attr)
+        rels = {}
+        for rel in self.etype_rels:
+            rels[rel] = self.generate(entity, rel)
+        return entity, rels
+
+    def init_entity(self, entity):
+        for attr in self.entity_attrs:
+            entity[attr] = self.generate(entity, attr)
+        entity.eid = entity['eid']
+
+    def generate(self, entity, rtype):
+        return getattr(self, 'gen_%s' % rtype)(entity)
+
+    def gen_eid(self, entity):
+        return self.source.create_eid(self.session)
+
+    def gen_cwuri(self, entity):
+        return u'%seid/%s' % (self.baseurl, entity['eid'])
+
+    def gen_creation_date(self, entity):
+        return self.time
+    def gen_modification_date(self, entity):
+        return self.time
+
+    def gen_is(self, entity):
+        return entity.e_schema.eid
+    def gen_is_instance_of(self, entity):
+        eids = []
+        for etype in entity.e_schema.ancestors() + [entity.e_schema]:
+            eids.append(entity.e_schema.eid)
+        return eids
+
+    def gen_created_by(self, entity):
+        return self.session.user.eid
+    def gen_owned_by(self, entity):
+        return self.session.user.eid
+
+    # implementations of gen_is / gen_is_instance_of to use during test where
+    # schema has been loaded from the fs (hence entity type schema eids are not
+    # known)
+    def test_gen_is(self, entity):
+        from cubicweb.hooks.metadata import eschema_eid
+        return eschema_eid(self.session, entity.e_schema)
+    def test_gen_is_instanceof(self, entity):
+        from cubicweb.hooks.metadata import eschema_eid
+        eids = []
+        for eschema in entity.e_schema.ancestors() + [entity.e_schema]:
+            eids.append(eschema_eid(self.session, eschema))
+        return eids
+
+
+################################################################################
+
+utf8csvreader = deprecated('[3.6] use ucsvreader instead')(ucsvreader)
+
+@deprecated('[3.6] use required')
+def nonempty(value):
+    return required(value)
+
+@deprecated("[3.6] use call_check_method('isdigit')")
+def alldigits(txt):
+    if txt.isdigit():
+        return txt
+    else:
+        return u''
+
+@deprecated("[3.7] too specific, will move away, copy me")
+def capitalize_if_unicase(txt):
+    if txt.isupper() or txt.islower():
+        return txt.capitalize()
+    return txt
+
+@deprecated("[3.7] too specific, will move away, copy me")
+def yesno(value):
+    """simple heuristic that returns boolean value
+
+    >>> yesno("Yes")
+    True
+    >>> yesno("oui")
+    True
+    >>> yesno("1")
+    True
+    >>> yesno("11")
+    True
+    >>> yesno("")
+    False
+    >>> yesno("Non")
+    False
+    >>> yesno("blablabla")
+    False
+    """
+    if value:
+        return value.lower()[0] in 'yo1'
+    return False
+
+@deprecated("[3.7] use call_check_method('isalpha')")
+def isalpha(value):
+    if value.isalpha():
+        return value
+    raise ValueError("not all characters in the string alphabetic")
+
+@deprecated("[3.7] use call_transform_method('upper')")
+def uppercase(txt):
+    return txt.upper()
+
+@deprecated("[3.7] use call_transform_method('lower')")
+def lowercase(txt):
+    return txt.lower()
+
+@deprecated("[3.7] use call_transform_method('replace', ' ', '')")
+def no_space(txt):
+    return txt.replace(' ','')
+
+@deprecated("[3.7] use call_transform_method('replace', u'\xa0', '')")
+def no_uspace(txt):
+    return txt.replace(u'\xa0','')
+
+@deprecated("[3.7] use call_transform_method('replace', '-', '')")
+def no_dash(txt):
+    return txt.replace('-','')
+
+@deprecated("[3.7] use call_transform_method('strip')")
+def strip(txt):
+    return txt.strip()
+
+@deprecated("[3.7] use call_transform_method('replace', ',', '.'), float")
+def decimal(value):
+    return comma_float(value)
+
+@deprecated('[3.7] use int builtin')
+def integer(value):
+    return int(value)
--- a/server/sources/native.py	Fri Mar 05 17:24:01 2010 +0100
+++ b/server/sources/native.py	Fri Mar 05 18:07:39 2010 +0100
@@ -386,7 +386,8 @@
     def update_entity(self, session, entity):
         """replace an entity in the source"""
         attrs = self.preprocess_entity(entity)
-        sql = self.sqlgen.update(SQL_PREFIX + str(entity.e_schema), attrs, [SQL_PREFIX + 'eid'])
+        sql = self.sqlgen.update(SQL_PREFIX + str(entity.e_schema), attrs,
+                                 [SQL_PREFIX + 'eid'])
         self.doexec(session, sql, attrs)
 
     def delete_entity(self, session, etype, eid):
@@ -395,10 +396,16 @@
         sql = self.sqlgen.delete(SQL_PREFIX + etype, attrs)
         self.doexec(session, sql, attrs)
 
-    def add_relation(self, session, subject, rtype, object):
+    def add_relation(self, session, subject, rtype, object, inlined=False):
         """add a relation to the source"""
-        attrs = {'eid_from': subject, 'eid_to': object}
-        sql = self.sqlgen.insert('%s_relation' % rtype, attrs)
+        if inlined is False:
+            attrs = {'eid_from': subject, 'eid_to': object}
+            sql = self.sqlgen.insert('%s_relation' % rtype, attrs)
+        else: # used by data import
+            etype = session.describe(subject)[0]
+            attrs = {SQL_PREFIX + 'eid': subject, SQL_PREFIX + rtype: object}
+            sql = self.sqlgen.update(SQL_PREFIX + etype, attrs,
+                                     [SQL_PREFIX + 'eid'])
         self.doexec(session, sql, attrs)
 
     def delete_relation(self, session, subject, rtype, object):