[dataimport] add a filter function to not fail if some extentity has several values for an attribute of final relation
author: Sylvain Thénault <sylvain.thenault@logilab.fr>
Wed, 03 Feb 2016 11:13:51 +0100
changeset 11139 df928a3a94e3
parent 11138 78c8e64f3cef
child 11140 fabcd1c6dcd1
[dataimport] add a filter function to not fail if some extentity has several values for an attribute or final relation. The function will simply record a warning in the import log and keep one of the values, chosen arbitrarily. Note that the function has to be explicitly inserted in the transformation stream.
cubicweb/dataimport/importer.py
cubicweb/dataimport/test/unittest_importer.py
--- a/cubicweb/dataimport/importer.py	Wed Feb 03 11:12:09 2016 +0100
+++ b/cubicweb/dataimport/importer.py	Wed Feb 03 11:13:51 2016 +0100
@@ -1,4 +1,4 @@
-# copyright 2015 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# copyright 2015-2016 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
 # contact http://www.logilab.fr -- mailto:contact@logilab.fr
 #
 # This program is free software: you can redistribute it and/or modify it under
@@ -74,6 +74,37 @@
     return use_extid_as_cwuri_filter
 
 
+def drop_extra_values(extentities, schema, import_log):
+    """Return a generator of :class:`ExtEntity` objects that will ensure their attributes and
+    inlined relations have a single value. When it's not the case, a warning will be recorded in
+    the import log and only one of the values will be kept (chosen arbitrarily).
+
+    `schema` is the instance's schema, `import_log` is an instance of a class implementing the
+    :class:`SimpleImportLog` interface.
+
+    Example usage:
+
+    .. code-block:: python
+
+        importer = ExtEntitiesImporter(schema, store, import_log)
+        importer.import_entities(drop_extra_values(extentities, schema, import_log))
+
+    """
+    _get_rschema = schema.rschema
+    for extentity in extentities:
+        entity_dict = extentity.values
+        for key, rtype, role in extentity.iter_rdefs():
+            rschema = _get_rschema(rtype)
+            if (rschema.final or (rschema.inlined and role == 'subject')) \
+               and len(entity_dict[key]) > 1:
+                values = ', '.join(repr(v) for v in entity_dict[key])
+                import_log.record_warning(
+                    "more than one value for attribute %r, only one will be kept: %s"
+                    % (rtype, values), path=extentity.extid)
+                entity_dict[key] = set([entity_dict[key].pop()])
+        yield extentity
+
+
 class RelationMapping(object):
     """Read-only mapping from relation type to set of related (subject, object) eids.
 
@@ -161,6 +192,8 @@
         Return a list of non inlined relations that may be inserted later, each relations defined by
         a 3-tuple (subject extid, relation type, object extid).
 
+        The instance's schema is given as argument.
+
         Take care the importer may call this method several times.
         """
         assert self._schema is None, 'prepare() has already been called for %s' % self
--- a/cubicweb/dataimport/test/unittest_importer.py	Wed Feb 03 11:12:09 2016 +0100
+++ b/cubicweb/dataimport/test/unittest_importer.py	Wed Feb 03 11:13:51 2016 +0100
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# copyright 2015 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# copyright 2015-2016 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
 # contact http://www.logilab.fr -- mailto:contact@logilab.fr
 #
 # This program is free software: you can redistribute it and/or modify it under
@@ -16,15 +16,13 @@
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 """Tests for cubicweb.dataimport.importer"""
 
-from collections import defaultdict
-
 from logilab.common.testlib import TestCase, unittest_main
 
 from cubicweb import ValidationError
 from cubicweb.devtools.testlib import CubicWebTC
 from cubicweb.dataimport import RQLObjectStore, ucsvreader
-from cubicweb.dataimport.importer import (ExtEntity, ExtEntitiesImporter, SimpleImportLog,
-                                          RelationMapping, use_extid_as_cwuri)
+from cubicweb.dataimport.importer import (ExtEntity, ExtEntitiesImporter, RelationMapping,
+                                          SimpleImportLog, use_extid_as_cwuri, drop_extra_values)
 
 
 class RelationMappingTC(CubicWebTC):
@@ -165,6 +163,21 @@
         self.assertNotIn('cwuri', personne.values)
 
 
+class DropExtraValuesTC(CubicWebTC):
+
+    def test(self):
+        personne = ExtEntity('Personne', b'1', {'nom': set([u'de la lune', 'di la luna']),
+                                                'prenom': set([u'Jean']),
+                                                'enfant': set('23'),
+                                                'connait': set('45')})
+        log = SimpleImportLog('<unspecified>')
+        list(drop_extra_values((personne,), self.schema, log))
+        self.assertEqual(len(personne.values['nom']), 1)
+        self.assertEqual(len(personne.values['enfant']), 1)
+        self.assertEqual(len(personne.values['connait']), 2)
+        self.assertEqual(len(log.logs), 2)
+
+
 def extentities_from_csv(fpath):
     """Yield ExtEntity read from `fpath` CSV file."""
     with open(fpath, 'rb') as f: