# HG changeset patch # User Denis Laxalde # Date 1435327833 -7200 # Node ID 37644c5187055a0d6f6d125a09169dca89f45432 # Parent d260722f2453ae50e2accbc72510854334dde2da [doc] Add a tutorial and extend documentation for ExtEntityImporter Related to #5414753. diff -r d260722f2453 -r 37644c518705 dataimport/importer.py --- a/dataimport/importer.py Fri Jun 26 16:09:27 2015 +0200 +++ b/dataimport/importer.py Fri Jun 26 16:10:33 2015 +0200 @@ -13,34 +13,17 @@ # # You should have received a copy of the GNU Lesser General Public License along # with this program. If not, see . -"""This module contains tools to programmatically import external data into CubicWeb. It's designed -on top of the store concept to leverage possibility of code sharing accross various data import -needs. - -The following classes are defined: +"""Data import of external entities. -* :class:`ExtEntity`: some intermediate representation of data to import, using external identifier - but no eid, - -* :class:`ExtEntitiesImporter`: class responsible for turning ExtEntity's extid to eid, and create - or update CubicWeb entities accordingly (using a Store). +Main entry points: -What is left to do is to write a class or a function that will yield external entities from some -data source (eg RDF, CSV) which will be case dependant (the *generator*). You may then plug -arbitrary filters into the external entities stream between the generator and the importer, allowing -to have some generic generators whose generated content is rafined by specific filters. - -.. code-block:: python +.. autoclass:: ExtEntitiesImporter +.. autoclass:: ExtEntity - ext_entities = fetch() # function yielding external entities - log = SimpleImportLog('') - importer = ExtEntitiesImporter(cnx, store, import_log=log) - importer.import_entities(ext_entities) +Utilities: -Here are the two classes that you'll have to deal with, and maybe to override: - -.. autoclass:: cubicweb.dataimport.importer.ExtEntitiesImporter -.. 
autoclass:: cubicweb.dataimport.importer.ExtEntity +.. autofunction:: cwuri2eid +.. autoclass:: RelationMapping """ from collections import defaultdict @@ -93,14 +76,16 @@ An external entity has the following properties: * ``extid`` (external id), an identifier for the ext entity, + * ``etype`` (entity type), a string which must be the name of one entity type in the schema (eg. ``'Person'``, ``'Animal'``, ...), + * ``values``, a dictionary whose keys are attribute or relation names from the schema (eg. ``'first_name'``, ``'friend'``), and whose values are *sets* For instance: - ..code-block::python + .. code-block:: python ext_entity.extid = 'http://example.org/person/debby' ext_entity.etype = 'Person' @@ -208,27 +193,26 @@ """This class is responsible for importing externals entities, that is instances of :class:`ExtEntity`, into CubicWeb entities. - Parameters: - - * `schema`: the CubicWeb's instance schema - - * `store`: a CubicWeb `Store` - - * `extid2eid`: optional {extid: eid} dictionary giving information on existing entities. It - will be completed during import. You may want to use :func:`cwuri2eid` to build it. + :param schema: the CubicWeb's instance schema + :param store: a CubicWeb `Store` + :param extid2eid: optional {extid: eid} dictionary giving information on existing entities. It + will be completed during import. You may want to use :func:`cwuri2eid` to build it. + :param existing_relations: optional {rtype: set((subj eid, obj eid))} mapping giving information on + existing relations of a given type. You may want to use :class:`RelationMapping` to build it. 
+ :param etypes_order_hint: optional ordered iterable on entity types, giving a hint on the order in + which they should be attempted to be imported + :param import_log: optional object implementing the :class:`SimpleImportLog` interface to record + events occurring during the import + :param raise_on_error: optional boolean flag - defaults to false, indicating whether errors should + be raised or logged. You usually want them to be raised during test but to be logged in + production. - * `existing_relation`: optional {rtype: set((subj eid, obj eid))} mapping giving information on - existing relations of a given type. You may want to use :class:`RelationMapping` to build it. - - * `etypes_order_hint`: optional ordered iterable on entity types, giving an hint on the order in - which they should be attempted to be imported + Instances of this class are meant to import external entities through :meth:`import_entities` + which handles a stream of :class:`ExtEntity`. One may then plug arbitrary filters into the + external entities stream. - * `import_log`: optional object implementing the :class:`SimpleImportLog` interface to record - events occuring during the import + .. automethod:: import_entities - * `raise_on_error`: optional boolean flag - default to false, indicating whether errors should - be raised or logged. You usually want them to be raised during test but to be logged in - production. """ def __init__(self, schema, store, extid2eid=None, existing_relations=None, diff -r d260722f2453 -r 37644c518705 doc/book/en/devrepo/dataimport.rst --- a/doc/book/en/devrepo/dataimport.rst Fri Jun 26 16:09:27 2015 +0200 +++ b/doc/book/en/devrepo/dataimport.rst Fri Jun 26 16:10:33 2015 +0200 @@ -1,29 +1,75 @@ -. -*- coding: utf-8 -*- +.. -*- coding: utf-8 -*- .. _dataimport: Dataimport ========== -*CubicWeb* is designed to manipulate huge of amount of data, and provides utilities to do so. 
They -allow to insert data within different levels of the *CubicWeb* API, allowing different -speed/security tradeoffs. Those keeping all the *CubicWeb* hooks and security will be slower but the -possible errors in insertion (bad data types, integrity error, ...) will be raised. +*CubicWeb* is designed to manipulate huge amounts of data, and provides utilities to do so. + +The main entry point is :mod:`cubicweb.dataimport.importer` which defines an +:class:`ExtEntitiesImporter` class responsible for importing data from an external source in the +form of :class:`ExtEntity` objects. An :class:`ExtEntity` is a transitional representation of an +entity to be imported in the CubicWeb instance; building this representation is usually +domain-specific -- e.g. dependent on the kind of data source (RDF, CSV, etc.) -- and is thus the +responsibility of the end-user. + +Along with the importer, a *store* must be selected, which is responsible for insertion of data into +the database. There exist different kinds of stores_, allowing data to be inserted at different +levels of the *CubicWeb* API and with different speed/security tradeoffs. Those keeping all the +*CubicWeb* hooks and security will be slower but the possible errors in insertion (bad data types, +integrity error, ...) will be handled. + -These data import utilities are provided in the package `cubicweb.dataimport`. +Example +------- + +Consider the following schema snippet. + +.. 
code-block:: python -The API is built on top of the following concepts: + class Person(EntityType): + name = String(required=True) + + class knows(RelationDefinition): + subject = 'Person' + object = 'Person' + +along with some data in a ``people.csv`` file:: -* `Store`, class responsible for inserting values in the backend database + # uri,name,knows + http://www.example.org/alice,Alice, + http://www.example.org/bob,Bob,http://www.example.org/alice -* `ExtEntity`, some intermediate representation of data to import, using external identifier but no - eid, and usually with slightly different representation than the associated entity's schema +The following code (using a shell context) defines a function `extentities_from_csv` to read +`Person` external entities coming from a CSV file and calls the :class:`ExtEntitiesImporter` to +insert corresponding entities and relations into the CubicWeb instance. + +.. code-block:: python + + from cubicweb.dataimport import ucsvreader, RQLObjectStore + from cubicweb.dataimport.importer import ExtEntity, ExtEntitiesImporter -* `Generator`, class or functions that will yield `ExtEntity` from some data source (eg RDF, CSV) + def extentities_from_csv(fpath): + """Yield Person ExtEntities read from `fpath` CSV file.""" + with open(fpath) as f: + for uri, name, knows in ucsvreader(f, skipfirst=True, skip_empty=False): + yield ExtEntity('Person', uri, + {'name': set([name]), 'knows': set([knows])}) -* `Importer`, class responsible for turning `ExtEntity`'s extid to eid, doing creation or update - accordingly and may be controlling the insertion order of entities before feeding them to a - `Store` + extentities = extentities_from_csv('people.csv') + store = RQLObjectStore(cnx) + importer = ExtEntitiesImporter(schema, store) + importer.import_entities(extentities) + commit() + rset = cnx.execute('String N WHERE X name N, X knows Y, Y name "Alice"') + assert rset[0][0] == u'Bob', rset + +Importer API +------------ + +.. 
automodule:: cubicweb.dataimport.importer + Stores ~~~~~~ @@ -90,8 +136,3 @@ This store relies on *COPY FROM*/execute many sql commands to directly push data using SQL commands rather than using the whole *CubicWeb* API. For now, **it only works with PostgresSQL** as it requires the *COPY FROM* command. - -ExtEntity and Importer -~~~~~~~~~~~~~~~~~~~~~~ - -.. automodule:: cubicweb.dataimport.importer