dataimport/importer.py
changeset 10461 37644c518705
parent 10460 d260722f2453
child 10514 b29d9904482e
equal deleted inserted replaced
10460:d260722f2453 10461:37644c518705
    11 # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
    11 # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
    12 # details.
    12 # details.
    13 #
    13 #
    14 # You should have received a copy of the GNU Lesser General Public License along
    14 # You should have received a copy of the GNU Lesser General Public License along
    15 # with this program. If not, see <http://www.gnu.org/licenses/>.
    15 # with this program. If not, see <http://www.gnu.org/licenses/>.
    16 """This module contains tools to programmatically import external data into CubicWeb. It's designed
    16 """Data import of external entities.
    17 on top of the store concept to leverage possibility of code sharing across various data import
    17 
    18 needs.
    18 Main entry points:
    19 
    19 
    20 The following classes are defined:
    20 .. autoclass:: ExtEntitiesImporter
    21 
    21 .. autoclass:: ExtEntity
    22 * :class:`ExtEntity`: some intermediate representation of data to import, using external identifier
    22 
    23   but no eid,
    23 Utilities:
    24 
    24 
    25 * :class:`ExtEntitiesImporter`: class responsible for turning ExtEntity's extid to eid, and create
    25 .. autofunction:: cwuri2eid
    26   or update CubicWeb entities accordingly (using a Store).
    26 .. autoclass:: RelationMapping
    27 
       
    28 What is left to do is to write a class or a function that will yield external entities from some
       
    29 data source (eg RDF, CSV) which will be case dependent (the *generator*).  You may then plug
       
    30 arbitrary filters into the external entities stream between the generator and the importer, allowing
       
    31 to have some generic generators whose generated content is refined by specific filters.
       
    32 
       
    33 .. code-block:: python
       
    34 
       
    35     ext_entities = fetch(<source>) # function yielding external entities
       
    36     log = SimpleImportLog('<source file/url/whatever>')
       
    37     importer = ExtEntitiesImporter(cnx, store, import_log=log)
       
    38     importer.import_entities(ext_entities)
       
    39 
       
    40 Here are the two classes that you'll have to deal with, and maybe to override:
       
    41 
       
    42 .. autoclass:: cubicweb.dataimport.importer.ExtEntitiesImporter
       
    43 .. autoclass:: cubicweb.dataimport.importer.ExtEntity
       
    44 """
    27 """
    45 
    28 
    46 from collections import defaultdict
    29 from collections import defaultdict
    47 import logging
    30 import logging
    48 
    31 
    91     """Transitional representation of an entity for use in data importer.
    74     """Transitional representation of an entity for use in data importer.
    92 
    75 
    93     An external entity has the following properties:
    76     An external entity has the following properties:
    94 
    77 
    95     * ``extid`` (external id), an identifier for the ext entity,
    78     * ``extid`` (external id), an identifier for the ext entity,
       
    79 
    96     * ``etype`` (entity type), a string which must be the name of one entity type in the schema
    80     * ``etype`` (entity type), a string which must be the name of one entity type in the schema
    97       (eg. ``'Person'``, ``'Animal'``, ...),
    81       (eg. ``'Person'``, ``'Animal'``, ...),
       
    82 
    98     * ``values``, a dictionary whose keys are attribute or relation names from the schema (eg.
    83     * ``values``, a dictionary whose keys are attribute or relation names from the schema (eg.
    99       ``'first_name'``, ``'friend'``), and whose values are *sets*
    84       ``'first_name'``, ``'friend'``), and whose values are *sets*
   100 
    85 
   101     For instance:
    86     For instance:
   102 
    87 
   103     ..code-block::python
    88     .. code-block:: python
   104 
    89 
   105         ext_entity.extid = 'http://example.org/person/debby'
    90         ext_entity.extid = 'http://example.org/person/debby'
   106         ext_entity.etype = 'Person'
    91         ext_entity.etype = 'Person'
   107         ext_entity.values = {'first_name': set([u"Deborah", u"Debby"]),
    92         ext_entity.values = {'first_name': set([u"Deborah", u"Debby"]),
   108                             'friend': set(['http://example.org/person/john'])}
    93                             'friend': set(['http://example.org/person/john'])}
   206 
   191 
   207 class ExtEntitiesImporter(object):
   192 class ExtEntitiesImporter(object):
   208     """This class is responsible for importing external entities, that is instances of
   193     """This class is responsible for importing external entities, that is instances of
   209     :class:`ExtEntity`, into CubicWeb entities.
   194     :class:`ExtEntity`, into CubicWeb entities.
   210 
   195 
   211     Parameters:
   196     :param schema: the CubicWeb's instance schema
   212 
   197     :param store: a CubicWeb `Store`
   213     * `schema`: the CubicWeb's instance schema
   198     :param extid2eid: optional {extid: eid} dictionary giving information on existing entities. It
   214 
   199         will be completed during import. You may want to use :func:`cwuri2eid` to build it.
   215     * `store`: a CubicWeb `Store`
   200     :param existing_relations: optional {rtype: set((subj eid, obj eid))} mapping giving information on
   216 
   201         existing relations of a given type. You may want to use :class:`RelationMapping` to build it.
   217     * `extid2eid`: optional {extid: eid} dictionary giving information on existing entities. It
   202     :param etypes_order_hint: optional ordered iterable on entity types, giving a hint on the order in
   218     will be completed during import. You may want to use :func:`cwuri2eid` to build it.
   203         which they should be attempted to be imported
   219 
   204     :param import_log: optional object implementing the :class:`SimpleImportLog` interface to record
   220     * `existing_relations`: optional {rtype: set((subj eid, obj eid))} mapping giving information on
   205         events occurring during the import
   221     existing relations of a given type. You may want to use :class:`RelationMapping` to build it.
   206     :param raise_on_error: optional boolean flag - defaults to false, indicating whether errors should
   222 
   207         be raised or logged. You usually want them to be raised during test but to be logged in
   223     * `etypes_order_hint`: optional ordered iterable on entity types, giving a hint on the order in
   208         production.
   224       which they should be attempted to be imported
   209 
   225 
   210     Instances of this class are meant to import external entities through :meth:`import_entities`
   226     * `import_log`: optional object implementing the :class:`SimpleImportLog` interface to record
   211     which handles a stream of :class:`ExtEntity`. One may then plug arbitrary filters into the
   227       events occurring during the import
   212     external entities stream.
   228 
   213 
   229     * `raise_on_error`: optional boolean flag - defaults to false, indicating whether errors should
   214     .. automethod:: import_entities
   230       be raised or logged. You usually want them to be raised during test but to be logged in
   215 
   231       production.
       
   232     """
   216     """
   233 
   217 
   234     def __init__(self, schema, store, extid2eid=None, existing_relations=None,
   218     def __init__(self, schema, store, extid2eid=None, existing_relations=None,
   235                  etypes_order_hint=(), import_log=None, raise_on_error=False):
   219                  etypes_order_hint=(), import_log=None, raise_on_error=False):
   236         self.schema = schema
   220         self.schema = schema