cubicweb/dataimport/stores.py
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Wed, 09 Nov 2016 11:42:33 +0100
branch3.24
changeset 11811 f09efeead7f9
parent 11780 307d96c0ab5a
child 12021 cc8135ecfbb8
permissions -rw-r--r--
Fix broken flake8 configuration and flake8 errors which were hidden by this breakage. flake8's --filename option doesn't work as expected: * it's expected to be a shell pattern, using stdlib's fnmatch.fnmatch function internally. This function thinks that 'cubicweb/x.py' doesn't match 'cubicweb/x.py' (there must be a reason but that's not the point), hence no file was actually checked ; * as this is a list of patterns, each encountered file is checked against each pattern, leading to run time explosion. So maintain the list of files to check in a separate file and give this list to flake8 using unix's xargs command.

# copyright 2003-2016 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of CubicWeb.
#
# CubicWeb is free software: you can redistribute it and/or modify it under the
# terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
"""
Stores are responsible to insert properly formatted entities and relations into the database. They
have the following API::

    >>> user_eid = store.prepare_insert_entity('CWUser', login=u'johndoe')
    >>> group_eid = store.prepare_insert_entity('CWGroup', name=u'unknown')
    >>> store.prepare_insert_relation(user_eid, 'in_group', group_eid)
    >>> store.flush()
    >>> store.commit()
    >>> store.finish()

Some stores **require a flush** to copy data into the database, so if you want store-independent
code you should call it explicitly. (There may be multiple flushes during the process, or only
one at the end if there is no memory issue.) This is different from the commit, which validates
the database transaction. Finally, the `finish()` method should be called in case the store
requires additional work once everything is done.

* ``prepare_insert_entity(<entity type>, **kwargs) -> eid``: given an entity
  type, attributes and inlined relations, return the eid of the entity to be
  inserted, *with no guarantee that anything has been inserted in database*,

* ``prepare_update_entity(<entity type>, eid, **kwargs) -> None``: given an
  entity type and eid, promise to update the given attributes and inlined
  relations, *with no guarantee that anything has been updated in database*,

* ``prepare_insert_relation(eid_from, rtype, eid_to) -> None``: indicate that a
  relation ``rtype`` should be added between entities with eids ``eid_from``
  and ``eid_to``. Similar to ``prepare_insert_entity()``, *there is no
  guarantee that the relation will be inserted in database*,

* ``flush() -> None``: flush any temporary data to database. May be called
  several times during an import,

* ``commit() -> None``: commit the database transaction,

* ``finish() -> None``: additional stuff to do after import is terminated.

.. autoclass:: cubicweb.dataimport.stores.NullStore
.. autoclass:: cubicweb.dataimport.stores.RQLObjectStore
.. autoclass:: cubicweb.dataimport.stores.NoHookRQLObjectStore
.. autoclass:: cubicweb.dataimport.stores.MetadataGenerator
"""
import inspect
import warnings
from datetime import datetime
from copy import copy
from itertools import count

from six import add_metaclass

import pytz

from logilab.common.decorators import cached
from logilab.common.deprecation import deprecated, class_deprecated

from cubicweb.schema import META_RTYPES, VIRTUAL_RTYPES
from cubicweb.server.edition import EditedEntity


class NullStore(object):
    """Do-nothing store, mostly useful as a description of the store API.

    Plug a :class:`NullStore` instance in place of a real store to check input data files or to
    time the steps sitting above the store (e.g. data parsing, importer, etc.) without touching
    any database.
    """

    def __init__(self):
        # source of fake eids: an ever increasing counter starting at 0
        self._eid_gen = count()

    def prepare_insert_entity(self, *args, **kwargs):
        """Return the eid of the entity to be inserted, given its type, attributes and inlined
        relations.

        Arguments are ignored here: the returned eid is simply the next value of an internal
        counter.
        """
        fake_eid = next(self._eid_gen)
        return fake_eid

    def prepare_update_entity(self, etype, eid, **kwargs):
        """Update the entity of the given type and eid with the specified attributes and inlined
        relations (no-op in this store).
        """

    def prepare_insert_relation(self, eid_from, rtype, eid_to, **kwargs):
        """Add a relation ``rtype`` between the entities whose eids are ``eid_from`` and
        ``eid_to`` (no-op in this store).
        """

    def flush(self):
        """Flush internal data structures (no-op in this store)."""

    def commit(self):
        """Commit the database transaction (no-op in this store)."""

    def finish(self):
        """Do necessary cleanup once the import is terminated (no-op in this store)."""


class RQLObjectStore(NullStore):
    """Store relying on RQL queries and on the connection's entity API, hence with all the
    cubicweb's machinery activated.
    """

    def __init__(self, cnx, commit=None):
        if commit is not None:
            warnings.warn('[3.19] commit argument should not be specified '
                          'as the cnx object already provides it.',
                          DeprecationWarning, stacklevel=2)
        self._cnx = cnx
        # use the (deprecated) explicit commit callback if any, else the connection's one
        self._commit = commit if commit else cnx.commit
        # XXX 3.21 deprecated attributes
        self.eids = {}  # eid -> entity created through this store
        self.types = {}  # entity type -> list of eids created through this store

    def rql(self, *args):
        """Execute a RQL query. This is NOT part of the store API."""
        cnx = self._cnx
        return cnx.execute(*args)

    def prepare_insert_entity(self, *args, **kwargs):
        """Create the entity through the connection and return its eid.

        The first positional argument is expected to be the entity type.
        """
        created = self._cnx.create_entity(*args, **kwargs)
        self.eids[created.eid] = created
        etype = args[0]
        self.types.setdefault(etype, []).append(created.eid)
        return created.eid

    def prepare_update_entity(self, etype, eid, **kwargs):
        """Update the entity with the given eid using the given attributes and relations, after
        having checked it's of the expected type.
        """
        target = self._cnx.entity_from_eid(eid)
        assert target.cw_etype == etype, 'Trying to update with wrong type %s' % etype
        # XXX some inlined relations may already exists
        target.cw_set(**kwargs)

    def prepare_insert_relation(self, eid_from, rtype, eid_to, **kwargs):
        """Add relation ``rtype`` from ``eid_from`` to ``eid_to`` using a ``SET`` RQL query.

        Extra keyword arguments are ignored.
        """
        query = 'SET X %s Y WHERE X eid %%(x)s, Y eid %%(y)s' % rtype
        query_args = {'x': int(eid_from), 'y': int(eid_to)}
        self.rql(query, query_args)

    def commit(self):
        """Call the commit callback selected at initialization time and return its result."""
        do_commit = self._commit
        return do_commit()

    @property
    def session(self):
        warnings.warn('[3.19] deprecated property.', DeprecationWarning, stacklevel=2)
        cnx = self._cnx
        return cnx.repo._get_session(cnx.sessionid)

    @deprecated("[3.19] use cnx.find(*args, **kwargs).entities() instead")
    def find_entities(self, *args, **kwargs):
        rset = self._cnx.find(*args, **kwargs)
        return rset.entities()

    @deprecated("[3.19] use cnx.find(*args, **kwargs).one() instead")
    def find_one_entity(self, *args, **kwargs):
        rset = self._cnx.find(*args, **kwargs)
        return rset.one()

    @deprecated('[3.21] use prepare_insert_entity instead')
    def create_entity(self, *args, **kwargs):
        new_eid = self.prepare_insert_entity(*args, **kwargs)
        return self._cnx.entity_from_eid(new_eid)

    @deprecated('[3.21] use prepare_insert_relation instead')
    def relate(self, eid_from, rtype, eid_to, **kwargs):
        self.prepare_insert_relation(eid_from, rtype, eid_to, **kwargs)


class NoHookRQLObjectStore(RQLObjectStore):
    """Store that works by accessing low-level CubicWeb's source API, with all hooks deactivated. It
    may be given a metadata generator object to handle metadata which are usually handled by hooks.

    Arguments:
    - `cnx`, a connection to the repository
    - `metagen`, optional :class:`MetadataGenerator` instance

    Beware that instantiating this store sets `read_security` and `write_security` to False on
    the given connection.
    """

    def __init__(self, cnx, metagen=None):
        super(NoHookRQLObjectStore, self).__init__(cnx)
        if metagen is None:
            metagen = MetadataGenerator(cnx)
        if isinstance(metagen, MetadataGenerator):
            # the store only talks to the `base_etype_dicts()` / `init_entity()` interface, so
            # wrap new-style generators to expose it
            metagen = _MetaGeneratorBWCompatWrapper(metagen)
        self.metagen = metagen
        self._system_source = cnx.repo.system_source
        self._rschema = cnx.repo.schema.rschema
        self._create_eid = self._system_source.create_eid
        self._add_relation = self._system_source.add_relation
        self._nb_inserted_entities = 0
        # NOTE: kept for the deprecated `nb_inserted_types` property, never incremented here
        self._nb_inserted_types = 0
        self._nb_inserted_relations = 0
        # deactivate security
        cnx.read_security = False
        cnx.write_security = False

    def _add_relation_accepts_kwargs(self):
        """Return True if `self._add_relation` accepts arbitrary keyword arguments (``**kwargs``).
        """
        # `inspect.getargspec` has been removed from python 3.11 and chokes on keyword-only
        # arguments or annotations with python 3, so use `getfullargspec` as soon as it's
        # available and only fallback to `getargspec` with python 2
        getfullargspec = getattr(inspect, 'getfullargspec', None)
        if getfullargspec is not None:
            return bool(getfullargspec(self._add_relation).varkw)
        return bool(inspect.getargspec(self._add_relation).keywords)

    def prepare_insert_entity(self, etype, **kwargs):
        """Given an entity type, attributes and inlined relations, returns the inserted entity's
        eid.

        Values having an `eid` attribute (e.g. entities) are replaced by this eid. The entity is
        directly added using the system source, then relations returned by the metadata generator
        are inserted.
        """
        attrs = dict((key, getattr(value, 'eid', value)) for key, value in kwargs.items())
        entity, rels = self.metagen.base_etype_dicts(etype)
        # make a copy to keep cached entity pristine
        entity = copy(entity)
        entity.cw_edited = copy(entity.cw_edited)
        entity.cw_clear_relation_cache()
        entity.cw_edited.update(attrs, skipsec=False)
        cnx = self._cnx
        entity.eid = self._create_eid(cnx)
        entity_source = self.metagen.init_entity(entity)
        self._system_source.add_info(cnx, entity, entity_source)
        self._system_source.add_entity(cnx, entity)
        # only give the subject's type if `_add_relation` is able to receive it
        add_relation_kwargs = {}
        if self._add_relation_accepts_kwargs():
            add_relation_kwargs['subjtype'] = entity.cw_etype
        for rtype, targeteids in rels.items():
            inlined = self._rschema(rtype).inlined
            # targeteids may be a single eid or a list of eids. Only the iterability check is
            # guarded, so a TypeError raised by `_add_relation` itself is propagated instead of
            # triggering a second call with the whole collection as target eid
            try:
                targets = iter(targeteids)
            except TypeError:
                targets = (targeteids,)
            for targeteid in targets:
                self._add_relation(cnx, entity.eid, rtype, targeteid,
                                   inlined, **add_relation_kwargs)
        self._nb_inserted_entities += 1
        return entity.eid

    # XXX: prepare_update_entity is inherited from RQLObjectStore, it should be reimplemented to
    # actually skip hooks as prepare_insert_entity

    def prepare_insert_relation(self, eid_from, rtype, eid_to, **kwargs):
        """Insert into the database a relation ``rtype`` between entities with eids ``eid_from``
        and ``eid_to``.

        ``rtype`` must not start with 'reverse_'. If the relation is symmetric, the reversed
        relation is added as well. Extra keyword arguments are ignored.
        """
        assert not rtype.startswith('reverse_')
        rschema = self._rschema(rtype)
        self._add_relation(self._cnx, eid_from, rtype, eid_to, rschema.inlined)
        if rschema.symmetric:
            self._add_relation(self._cnx, eid_to, rtype, eid_from, rschema.inlined)
        self._nb_inserted_relations += 1

    @property
    @deprecated('[3.21] deprecated')
    def nb_inserted_entities(self):
        return self._nb_inserted_entities

    @property
    @deprecated('[3.21] deprecated')
    def nb_inserted_types(self):
        return self._nb_inserted_types

    @property
    @deprecated('[3.21] deprecated')
    def nb_inserted_relations(self):
        return self._nb_inserted_relations


class MetadataGenerator(object):
    """Generator of the standard metadata of imported entities. Derive it if you need to add
    application specific metadata. An instance of this class (or of a subclass) may be given to
    either a nohook or a massive store.

    Parameters:
    * `cnx`: connection to the repository
    * `baseurl`: optional base URL to be used for `cwuri` generation - default to config['base-url']
    * `source`: optional source to be used as `cw_source` for imported entities
    * `meta_skipped`: optional iterable of relation types to remove from `META_RELATIONS`
    """
    META_RELATIONS = frozenset(
        META_RTYPES
        - VIRTUAL_RTYPES
        - set(['eid', 'cwuri', 'is', 'is_instance_of', 'cw_source']))

    def __init__(self, cnx, baseurl=None, source=None, meta_skipped=()):
        self._cnx = cnx
        if baseurl is None:
            config = cnx.vreg.config
            baseurl = config['base-url'] or config.default_base_url()
        # ensure there is a trailing slash since eids are directly appended to the base url
        if baseurl[-1] != '/':
            baseurl = baseurl + '/'
        self._baseurl = baseurl
        self.source = cnx.repo.system_source if source is None else source
        # a single timestamp is shared by every date generated by this instance
        self._now = datetime.now(pytz.utc)
        # attributes/relations shared by all entities of the same type
        self._etype_attrs = []
        self._etype_rels = []
        # attributes/relations specific to each entity
        self._entity_attrs = ['cwuri']
        rschema = cnx.vreg.schema.rschema
        self.meta_relations = self.META_RELATIONS - set(meta_skipped)
        for rtype in self.meta_relations:
            # skip owned_by / created_by if user is the internal manager
            if cnx.user.eid == -1 and rtype in ('owned_by', 'created_by'):
                continue
            # final relation types are attributes, others are actual relations
            if rschema(rtype).final:
                bucket = self._etype_attrs
            else:
                bucket = self._etype_rels
            bucket.append(rtype)

    # etype is provided in the 3 methods below as proven useful to custom implementation but not
    # used by the default implementation

    def etype_attrs(self, etype):
        """Return a new list of the attributes to set on every entity of the given type."""
        return list(self._etype_attrs)

    def etype_rels(self, etype):
        """Return a new list of the relations to set on every entity of the given type."""
        return list(self._etype_rels)

    def entity_attrs(self, etype):
        """Return a new list of the attributes of the given type whose value is computed for each
        instance rather than once for the type.
        """
        return list(self._entity_attrs)

    @cached
    def base_etype_attrs(self, etype):
        """Return a dictionary of attributes to be set for all entities of the given type.

        Attributes without a matching `gen_<attribute>` generator are left out.
        """
        values = {}
        for name in self.etype_attrs(etype):
            generate = self._generator(name)
            if not generate:
                continue
            values[name] = generate(etype)
        return values

    @cached
    def base_etype_rels(self, etype):
        """Return a dictionary of relations to be set for all entities of the given type.

        Relations without a matching `gen_<relation>` generator are left out.
        """
        values = {}
        for name in self.etype_rels(etype):
            generate = self._generator(name)
            if not generate:
                continue
            values[name] = generate(etype)
        return values

    def init_entity_attrs(self, etype, eid, attrs):
        """Complete in-place the `attrs` dictionary of an entity with attributes whose value is
        computed for each instance. Attributes already there are left untouched.
        """
        for name in self.entity_attrs(etype):
            if name not in attrs:
                generate = self._generator(name)
                if generate:
                    attrs[name] = generate(etype, eid, attrs)

    def _generator(self, rtype):
        """Return the `gen_<rtype>` attribute of this instance, or None if there is no such one.
        """
        generator_name = 'gen_%s' % rtype
        return getattr(self, generator_name, None)

    def gen_cwuri(self, etype, eid, attrs):
        assert self._baseurl, 'baseurl is None while generating cwuri'
        return u'%s%s' % (self._baseurl, eid)

    def gen_creation_date(self, etype):
        return self._now

    def gen_modification_date(self, etype):
        return self._now

    def gen_created_by(self, etype):
        user = self._cnx.user
        return user.eid

    def gen_owned_by(self, etype):
        user = self._cnx.user
        return user.eid


class _MetaGeneratorBWCompatWrapper(object):
    """Adapter giving the MetaGenerator interface (`base_etype_dicts()` and `init_entity()`) to a
    wrapped MetadataGenerator.
    """
    META_RELATIONS = (
        META_RTYPES
        - VIRTUAL_RTYPES
        - set(['eid', 'cwuri', 'is', 'is_instance_of', 'cw_source']))

    def __init__(self, mdgenerator):
        self._mdgen = mdgenerator

    @cached
    def base_etype_dicts(self, etype):
        """Return a 2-uple `(entity, rels)` for the given type: an entity instance whose edited
        attributes hold the wrapped generator's per-type attributes, and the dictionary of its
        per-type relations.
        """
        mdgen = self._mdgen
        cnx = mdgen._cnx
        etype_class = cnx.vreg['etypes'].etype_class(etype)
        entity = etype_class(cnx)
        # entity are "surface" copied, avoid shared dict between copies
        del entity.cw_extra_kwargs
        entity.cw_edited = EditedEntity(entity)
        entity.cw_edited.update(mdgen.base_etype_attrs(etype), skipsec=False)
        return entity, mdgen.base_etype_rels(etype)

    def init_entity(self, entity):
        """Complete the entity's edited attributes with the per-instance attributes computed by
        the wrapped generator, and return the generator's source.
        """
        mdgen = self._mdgen
        # the wrapped generator works on a plain dictionary, which is then merged back
        attrs = dict(entity.cw_edited)
        mdgen.init_entity_attrs(entity.cw_etype, entity.eid, attrs)
        entity.cw_edited.update(attrs, skipsec=False)
        return mdgen.source


@add_metaclass(class_deprecated)
class MetaGenerator(object):
    """Deprecated generator of the standard metadata of imported entities, superseded by
    :class:`MetadataGenerator`. Derive it if you need to add application specific metadata.

    Parameters:
    * `cnx`: connection to the repository
    * `baseurl`: optional base URL to be used for `cwuri` generation - default to config['base-url']
    * `source`: optional source to be used as `cw_source` for imported entities
    """
    __deprecation_warning__ = '[3.23] this class is deprecated, use MetadataGenerator instead'

    META_RELATIONS = MetadataGenerator.META_RELATIONS

    def __init__(self, cnx, baseurl=None, source=None):
        self._cnx = cnx
        if baseurl is None:
            config = cnx.vreg.config
            baseurl = config['base-url'] or config.default_base_url()
        # ensure there is a trailing slash since eids are directly appended to the base url
        if baseurl[-1] != '/':
            baseurl = baseurl + '/'
        self.baseurl = baseurl
        self.source = cnx.repo.system_source if source is None else source
        # a single timestamp is shared by every date generated by this instance
        self._now = datetime.now(pytz.utc)
        # attributes/relations shared by all entities of the same type
        self.etype_attrs = []
        self.etype_rels = []
        # attributes/relations specific to each entity
        self.entity_attrs = ['cwuri']
        rschema = cnx.vreg.schema.rschema
        for rtype in self.META_RELATIONS:
            # skip owned_by / created_by if user is the internal manager
            if cnx.user.eid == -1 and rtype in ('owned_by', 'created_by'):
                continue
            # final relation types are attributes, others are actual relations
            if rschema(rtype).final:
                bucket = self.etype_attrs
            else:
                bucket = self.etype_rels
            bucket.append(rtype)

    @cached
    def base_etype_dicts(self, etype):
        """Return a 2-uple `(entity, rels)` for the given type: an entity instance whose edited
        attributes hold the generated per-type attributes, and the dictionary of generated
        per-type relations.
        """
        cnx = self._cnx
        etype_class = cnx.vreg['etypes'].etype_class(etype)
        entity = etype_class(cnx)
        # entity are "surface" copied, avoid shared dict between copies
        del entity.cw_extra_kwargs
        entity.cw_edited = EditedEntity(entity)
        for name in self.etype_attrs:
            generate = self.generate(name)
            if not generate:
                continue
            entity.cw_edited.edited_attribute(name, generate(entity))
        rels = {}
        for name in self.etype_rels:
            generate = self.generate(name)
            if not generate:
                continue
            rels[name] = generate(entity)
        return entity, rels

    def init_entity(self, entity):
        """Set the entity's per-instance attributes which are not already edited, and return the
        generator's source.
        """
        for name in self.entity_attrs:
            if name not in entity.cw_edited:
                generate = self.generate(name)
                if generate:
                    entity.cw_edited.edited_attribute(name, generate(entity))
        return self.source

    def generate(self, rtype):
        """Return the `gen_<rtype>` attribute of this instance, or None if there is no such one.
        """
        generator_name = 'gen_%s' % rtype
        return getattr(self, generator_name, None)

    def gen_cwuri(self, entity):
        assert self.baseurl, 'baseurl is None while generating cwuri'
        return u'%s%s' % (self.baseurl, entity.eid)

    def gen_creation_date(self, entity):
        return self._now

    def gen_modification_date(self, entity):
        return self._now

    def gen_created_by(self, entity):
        user = self._cnx.user
        return user.eid

    def gen_owned_by(self, entity):
        user = self._cnx.user
        return user.eid