cubicweb/dataimport/massive_store.py
author Guillaume Vandevelde <gvandevelde@logilab.fr>
Thu, 13 Feb 2020 13:56:12 +0100
branch 3.26
changeset 12876 fe9bd0e937e4
parent 12203 c615f945b38a
child 12567 26744ad37953
permissions -rw-r--r--
[pkg] version 3.26.15
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     1
# coding: utf-8
11305
118d83e65ca8 [dataimport] remove useless assignment in massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11057
diff changeset
     2
# copyright 2015-2016 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     3
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     4
#
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     5
# This file is part of CubicWeb.
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     6
#
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     7
# CubicWeb is free software: you can redistribute it and/or modify it under the
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     8
# terms of the GNU Lesser General Public License as published by the Free
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     9
# Software Foundation, either version 2.1 of the License, or (at your option)
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    10
# any later version.
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    11
#
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    12
# CubicWeb is distributed in the hope that it will be useful, but WITHOUT ANY
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    13
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    14
# A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    15
# details.
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    16
#
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    17
# You should have received a copy of the GNU Lesser General Public License along
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    18
# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    19
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    20
from collections import defaultdict
11316
36c7cd362fc7 [dataimport] add a .schema shortcut attribute on the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11315
diff changeset
    21
from itertools import chain
11784
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
    22
import logging
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
    23
from uuid import uuid4
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    24
11784
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
    25
from six import text_type
10859
375a8232e61c [dataimport] import range from six.moves
Julien Cristau <julien.cristau@logilab.fr>
parents: 10856
diff changeset
    26
from six.moves import range
375a8232e61c [dataimport] import range from six.moves
Julien Cristau <julien.cristau@logilab.fr>
parents: 10856
diff changeset
    27
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    28
from cubicweb.dataimport import stores, pgstore
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
    29
from cubicweb.server.schema2sql import eschema_sql_def
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    30
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    31
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    32
class MassiveObjectStore(stores.RQLObjectStore):
11331
f2ff82dfcd5c [dataimport] add a bit of extra-documentation on the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11330
diff changeset
    33
    """Store for massive import of data, with delayed insertion of meta data.
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    34
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    35
    WARNINGS:
11315
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    36
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    37
    - This store may only be used with PostgreSQL for now, as it relies
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    38
      on the COPY FROM method, and on specific PostgreSQL tables to get all
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    39
      the indexes.
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    40
11315
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    41
    - This store can only insert relations that are not inlined (i.e.,
11707
2c4518fea26f [massive store] Drop deprecated code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11705
diff changeset
    42
      which do *not* have inlined=True in their definition in the schema),
2c4518fea26f [massive store] Drop deprecated code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11705
diff changeset
    43
      unless they are specified as entity attributes.
11315
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    44
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    45
    It should be used as follows:
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    46
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    47
       store = MassiveObjectStore(cnx)
11707
2c4518fea26f [massive store] Drop deprecated code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11705
diff changeset
    48
       eid_p = store.prepare_insert_entity('Person',
11783
8865c9e55575 [massive store] docstring and __init__ cleanup
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11782
diff changeset
    49
                                           cwuri=u'http://dbpedia.org/toto',
8865c9e55575 [massive store] docstring and __init__ cleanup
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11782
diff changeset
    50
                                           name=u'Toto')
11707
2c4518fea26f [massive store] Drop deprecated code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11705
diff changeset
    51
       eid_loc = store.prepare_insert_entity('Location',
11783
8865c9e55575 [massive store] docstring and __init__ cleanup
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11782
diff changeset
    52
                                             cwuri=u'http://geonames.org/11111',
8865c9e55575 [massive store] docstring and __init__ cleanup
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11782
diff changeset
    53
                                             name=u'Somewhere')
11707
2c4518fea26f [massive store] Drop deprecated code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11705
diff changeset
    54
       store.prepare_insert_relation(eid_p, 'lives_in', eid_loc)
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    55
       store.flush()
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    56
       ...
10882
634cc266e48f [dataimport] fix doc string to stop referring to the old API
Julien Cristau <julien.cristau@logilab.fr>
parents: 10881
diff changeset
    57
       store.commit()
634cc266e48f [dataimport] fix doc string to stop referring to the old API
Julien Cristau <julien.cristau@logilab.fr>
parents: 10881
diff changeset
    58
       store.finish()
11331
f2ff82dfcd5c [dataimport] add a bit of extra-documentation on the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11330
diff changeset
    59
f2ff82dfcd5c [dataimport] add a bit of extra-documentation on the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11330
diff changeset
    60
    Full-text indexing is not handled; you'll have to reindex the proper entity types yourself
f2ff82dfcd5c [dataimport] add a bit of extra-documentation on the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11330
diff changeset
    61
    if desired.
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    62
    """
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    63
11792
f1911a4638af [massive store] remove on_commit / on_rollback parameters
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11791
diff changeset
    64
    def __init__(self, cnx, slave_mode=False, eids_seq_range=10000, metagen=None):
11783
8865c9e55575 [massive store] docstring and __init__ cleanup
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11782
diff changeset
    65
        """Create a MassiveObject store, with the following arguments:
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    66
11783
8865c9e55575 [massive store] docstring and __init__ cleanup
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11782
diff changeset
    67
        - `cnx`, a connection to the repository
        - `slave_mode`, set to True for a slave store, which never runs `master_init` itself
8865c9e55575 [massive store] docstring and __init__ cleanup
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11782
diff changeset
    68
        - `metagen`, optional :class:`MetadataGenerator` instance
8865c9e55575 [massive store] docstring and __init__ cleanup
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11782
diff changeset
    69
        - `eids_seq_range`: size of eid range reserved by the store for each batch
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    70
        """
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    71
        super(MassiveObjectStore, self).__init__(cnx)
11784
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
    72
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
    73
        self.uuid = text_type(uuid4()).replace('-', '')
11315
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    74
        self.slave_mode = slave_mode
11326
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
    75
        if metagen is None:
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
    76
            metagen = stores.MetadataGenerator(cnx)
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
    77
        self.metagen = metagen
10879
3193d9ede8dd [dataimport] drop extra indirection through MassiveObjectStore._initialized dict
Julien Cristau <julien.cristau@logilab.fr>
parents: 10878
diff changeset
    78
11315
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    79
        self.logger = logging.getLogger('dataimport.massive_store')
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    80
        self.sql = cnx.system_sql
11783
8865c9e55575 [massive store] docstring and __init__ cleanup
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11782
diff changeset
    81
        self.schema = cnx.vreg.schema
11316
36c7cd362fc7 [dataimport] add a .schema shortcut attribute on the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11315
diff changeset
    82
        self.default_values = get_default_values(self.schema)
11871
5f71460236a4 [massive store] Don't store eids_seq_range as a store attribute
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11802
diff changeset
    83
        self.get_next_eid = lambda g=self._get_eid_gen(eids_seq_range): next(g)
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
    84
        self._source_dbhelper = cnx.repo.system_source.dbhelper
11315
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    85
        self._dbh = PGHelper(cnx)
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    86
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    87
        self._data_entities = defaultdict(list)
ad826d81e88e [dataimport] rework massive store's __init__
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11314
diff changeset
    88
        self._data_relations = defaultdict(list)
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
    89
        self._initialized = {}
11326
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
    90
11871
5f71460236a4 [massive store] Don't store eids_seq_range as a store attribute
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11802
diff changeset
    91
    def _get_eid_gen(self, eids_seq_range):
11326
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
    92
        """Generator yielding the next eid. This is done by preselecting
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
    93
        a given number of eids from the 'entities_id_seq', and then
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
    94
        storing them"""
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
    95
        while True:
11871
5f71460236a4 [massive store] Don't store eids_seq_range as a store attribute
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11802
diff changeset
    96
            last_eid = self._cnx.repo.system_source.create_eid(self._cnx, eids_seq_range)
5f71460236a4 [massive store] Don't store eids_seq_range as a store attribute
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11802
diff changeset
    97
            for eid in range(last_eid - eids_seq_range + 1, last_eid + 1):
11326
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
    98
                yield eid
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    99
11781
4ebd968f364c [massive store] Reintroduce methods that are necessary to properly handle master/slave configuration
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11780
diff changeset
   100
    # master/slaves specific API
4ebd968f364c [massive store] Reintroduce methods that are necessary to properly handle master/slave configuration
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11780
diff changeset
   101
11791
20555214576b [massive store] master_init should commit by default
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11790
diff changeset
   102
    def master_init(self, commit=True):
11784
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   103
        """Initialize database for massive insertion.
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   104
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   105
        This is expected to be called once, by the master store in master/slaves configuration.
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   106
        """
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   107
        assert not self.slave_mode
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   108
        if self not in self._initialized:
12195
81cf4cf60411 [massive store] ensure the cwmassive_initialized table is deleted if needed
David Douard <david.douard@logilab.fr>
parents: 11871
diff changeset
   109
            self.sql('DROP TABLE IF EXISTS cwmassive_initialized')
11784
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   110
            self.sql('CREATE TABLE cwmassive_initialized'
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   111
                     '(retype text, type varchar(128), uuid varchar(32))')
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   112
            self._initialized[self] = None
11791
20555214576b [massive store] master_init should commit by default
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11790
diff changeset
   113
            if commit:
20555214576b [massive store] master_init should commit by default
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11790
diff changeset
   114
                self.commit()
11781
4ebd968f364c [massive store] Reintroduce methods that are necessary to properly handle master/slave configuration
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11780
diff changeset
   115
11322
21316020eae3 [dataimport] move cwmassive_constraint temporary table handling to the PGHelper class
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11321
diff changeset
   116
    # SQL utilities #########################################################
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   117
11778
9847a097266e [massive store] Rework constraint/index handling
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11777
diff changeset
   118
    def _drop_metadata_constraints(self):
9847a097266e [massive store] Rework constraint/index handling
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11777
diff changeset
   119
        """Drop constraints and indexes for the metadata tables.
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   120
11778
9847a097266e [massive store] Rework constraint/index handling
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11777
diff changeset
   121
        They will be recreated by the `finish` method.
9847a097266e [massive store] Rework constraint/index handling
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11777
diff changeset
   122
        """
11780
307d96c0ab5a [massive store] Follow configuration of the metadata generator
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11778
diff changeset
   123
        rtypes = [rtype for rtype in self.metagen.meta_relations
307d96c0ab5a [massive store] Follow configuration of the metadata generator
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11778
diff changeset
   124
                  if not self.schema.rschema(rtype).final]
307d96c0ab5a [massive store] Follow configuration of the metadata generator
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11778
diff changeset
   125
        rtypes += ('is_instance_of', 'is', 'cw_source')
307d96c0ab5a [massive store] Follow configuration of the metadata generator
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11778
diff changeset
   126
        for rtype in rtypes:
307d96c0ab5a [massive store] Follow configuration of the metadata generator
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11778
diff changeset
   127
            self._dbh.drop_constraints(rtype + '_relation')
307d96c0ab5a [massive store] Follow configuration of the metadata generator
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11778
diff changeset
   128
            self._dbh.drop_indexes(rtype + '_relation')
11778
9847a097266e [massive store] Rework constraint/index handling
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11777
diff changeset
   129
        # don't drop constraints for the entities table, the only one is the primary key's index on
9847a097266e [massive store] Rework constraint/index handling
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11777
diff changeset
   130
        # eid and we want to keep it
9847a097266e [massive store] Rework constraint/index handling
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11777
diff changeset
   131
        self._dbh.drop_indexes('entities')
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   132
11026
ce9b3886955d [dataimport] remove eids_seq_start attribute from massive store
Julien Cristau <julien.cristau@logilab.fr>
parents: 11025
diff changeset
   133
    def restart_eid_sequence(self, start_eid):
11323
e9120da559f5 [dataimport] use sql shortcut
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11322
diff changeset
   134
        self.sql(self._cnx.repo.system_source.dbhelper.sql_restart_numrange(
11026
ce9b3886955d [dataimport] remove eids_seq_start attribute from massive store
Julien Cristau <julien.cristau@logilab.fr>
parents: 11025
diff changeset
   135
            'entities_id_seq', initial_value=start_eid))
ce9b3886955d [dataimport] remove eids_seq_start attribute from massive store
Julien Cristau <julien.cristau@logilab.fr>
parents: 11025
diff changeset
   136
        self._cnx.commit()
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   137
10863
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   138
    # store api ################################################################
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   139
11785
0cea67f41d0c [massive store] Delay metadata attributes and default values handling to flush_entities
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11784
diff changeset
   140
    def prepare_insert_entity(self, etype, **data):
10863
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   141
        """Given an entity type, attributes and inlined relations, returns the inserted entity's
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   142
        eid.
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   143
        """
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   144
        if etype not in self._initialized:
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   145
            if not self.slave_mode:
11791
20555214576b [massive store] master_init should commit by default
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11790
diff changeset
   146
                self.master_init(commit=False)
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   147
            tablename = 'cw_%s' % etype.lower()
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   148
            tmp_tablename = '%s_%s' % (tablename, self.uuid)
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   149
            self.sql("INSERT INTO cwmassive_initialized VALUES (%(e)s, 'etype', %(uuid)s)",
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   150
                     {'e': etype, 'uuid': self.uuid})
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   151
            attr_defs = eschema_sql_def(self._source_dbhelper, self.schema[etype])
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   152
            self.sql('CREATE TABLE %s(%s);' % (tmp_tablename,
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   153
                                               ', '.join('cw_%s %s' % (column, sqltype)
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   154
                                                         for column, sqltype in attr_defs)))
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   155
            self._initialized[etype] = [attr for attr, _ in attr_defs]
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   156
11326
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
   157
        if 'eid' not in data:
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
   158
            # If eid is not given, take the next one from the range reserved on the eids sequence
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
   159
            eid = self.get_next_eid()
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
   160
            data['eid'] = eid
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
   161
        self._data_entities[etype].append(data)
06eeac9389a3 [dataimport] introduce usage of MetadataGenerator into the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11325
diff changeset
   162
        return data['eid']
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   163
10863
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   164
    def prepare_insert_relation(self, eid_from, rtype, eid_to, **kwargs):
        """Buffer a ``rtype`` relation going from entity ``eid_from`` to entity ``eid_to``.

        The relation is only appended to the in-memory ``self._data_relations`` buffer.
        The first time a given ``rtype`` is seen by this store, ``self.master_init`` is
        called (unless the store is in slave mode), the relation type is recorded in the
        ``cwmassive_initialized`` table together with this store's uuid, and a temporary
        ``<rtype>_relation_<uuid>`` table is created.

        Relation must not be inlined. Extra keyword arguments are accepted and ignored.
        """
        first_use = rtype not in self._initialized
        if first_use:
            if not self.slave_mode:
                self.master_init(commit=False)
            rschema = self._cnx.vreg.schema.rschema(rtype)
            assert not rschema.inlined
            # remember that the setup below has been done for this relation type
            self._initialized[rtype] = None
            # temporary table name is the final table name suffixed by this store's uuid
            staging_table = '%s_%s' % ('%s_relation' % rtype.lower(), self.uuid)
            registration = {'r': rtype, 'uuid': self.uuid}
            self.sql("INSERT INTO cwmassive_initialized VALUES (%(r)s, 'rtype', %(uuid)s)",
                     registration)
            self.sql('CREATE TABLE %s(eid_from integer, eid_to integer)' % staging_table)
        pending = self._data_relations[rtype]
        pending.append({'eid_from': eid_from, 'eid_to': eid_to})
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   181
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   182
    def flush(self):
        """Flush the buffered data: entities first, then relations.

        Delegates to :meth:`flush_entities` followed by :meth:`flush_relations`.
        """
        for flush_step in (self.flush_entities, self.flush_relations):
            flush_step()
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   186
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   187
    def finish(self):
        """Run the final cleanup and always drop the bookkeeping table.

        ``self._finish()`` is run, then the connection is committed. If either step
        raises, the connection is rolled back and the exception is re-raised. Whatever
        the outcome, the ``cwmassive_initialized`` table is dropped (if it exists) and
        ``self.commit()`` is called.
        """
        try:
            try:
                self._finish()
                self._cnx.commit()
            except Exception:
                # roll the connection back before letting the error propagate
                self._cnx.rollback()
                raise
        finally:
            # delete the meta data table
            self.sql('DROP TABLE IF EXISTS cwmassive_initialized')
            self.commit()
970c32a4c7b7 [massive store] Ensure temporary metadata table get dropped
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11871
diff changeset
   199
970c32a4c7b7 [massive store] Ensure temporary metadata table get dropped
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11871
diff changeset
   200
    def _finish(self):
970c32a4c7b7 [massive store] Ensure temporary metadata table get dropped
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11871
diff changeset
   201
        """Remove temporary tables and columns."""
11786
f5d26d3648d4 [massive store] Turn a runtime error into an assertion
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11785
diff changeset
   202
        assert not self.slave_mode, 'finish method should only be called by the master store'
10863
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   203
        self.logger.info("Start cleaning")
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   204
        # Get all the initialized etypes/rtypes
10878
fda5e42037a9 [dataimport] remove remaining references to dataio from MassiveObjectStore
Julien Cristau <julien.cristau@logilab.fr>
parents: 10877
diff changeset
   205
        if self._dbh.table_exists('cwmassive_initialized'):
11784
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   206
            cu = self.sql('SELECT retype, type, uuid FROM cwmassive_initialized')
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   207
            entities = defaultdict(list)
11784
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   208
            relations = defaultdict(list)
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   209
            for retype, _type, uuid in cu.fetchall():
11774
51c160677afe [repository] Drop the entities.extid column and associated cache
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11773
diff changeset
   210
                if _type == 'rtype':
11784
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   211
                    relations[retype].append(uuid)
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   212
                else:  # _type = 'etype'
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   213
                    entities[retype].append(uuid)
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   214
            # if there is some entities to insert, delete constraint on metadata tables once for all
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   215
            if entities:
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   216
                self._drop_metadata_constraints()
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   217
            # get back entity data from the temporary tables
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   218
            for etype, uuids in entities.items():
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   219
                tablename = 'cw_%s' % etype.lower()
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   220
                attr_defs = eschema_sql_def(self._source_dbhelper, self.schema[etype])
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   221
                columns = ','.join('cw_%s' % attr for attr, _ in attr_defs)
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   222
                self._dbh.drop_constraints(tablename)
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   223
                self._dbh.drop_indexes(tablename)
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   224
                for uuid in uuids:
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   225
                    tmp_tablename = '%s_%s' % (tablename, uuid)
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   226
                    self.sql('INSERT INTO %(table)s(%(columns)s) '
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   227
                             'SELECT %(columns)s FROM %(tmp_table)s'
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   228
                             % {'table': tablename, 'tmp_table': tmp_tablename,
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   229
                                'columns': columns})
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   230
                    self._insert_etype_metadata(etype, tmp_tablename)
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   231
                    self._tmp_data_cleanup(tmp_tablename, etype, uuid)
11784
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   232
            # get back relation data from the temporary tables
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   233
            for rtype, uuids in relations.items():
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   234
                tablename = '%s_relation' % rtype.lower()
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   235
                self._dbh.drop_constraints(tablename)
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   236
                self._dbh.drop_indexes(tablename)
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   237
                for uuid in uuids:
c1aa50a88de3 [massive store] Use a slave specific table for relation insertion in the massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11783
diff changeset
   238
                    tmp_tablename = '%s_%s' % (tablename, uuid)
11790
04607da552ac [massive store] isolate SQL statements that one may want to customize
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11789
diff changeset
   239
                    self.fill_relation_table(tablename, tmp_tablename)
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   240
                    self._tmp_data_cleanup(tmp_tablename, rtype, uuid)
11787
05b3e44c135f [massive store] Docstring / comment cleanups
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11786
diff changeset
   241
        # restore all deleted indexes and constraints
11322
21316020eae3 [dataimport] move cwmassive_constraint temporary table handling to the PGHelper class
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11321
diff changeset
   242
        self._dbh.restore_indexes_and_constraints()
10863
8e1f6de61300 [dataimport] implement new store API on massive store
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10861
diff changeset
   243
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   244
    def _insert_etype_metadata(self, etype, tmp_tablename):
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   245
        """Massive insertion of meta data for `etype`, with new entities in `tmp_tablename`.
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   246
        """
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   247
        # insert standard metadata relations
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   248
        for rtype, eid in self.metagen.base_etype_rels(etype).items():
11790
04607da552ac [massive store] isolate SQL statements that one may want to customize
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11789
diff changeset
   249
            self.fill_meta_relation_table(tmp_tablename, rtype, eid)
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   250
        # insert cw_source, is and is_instance_of relations (normally handled by the system source)
11790
04607da552ac [massive store] isolate SQL statements that one may want to customize
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11789
diff changeset
   251
        self.fill_meta_relation_table(tmp_tablename, 'cw_source', self.metagen.source.eid)
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   252
        eschema = self.schema[etype]
11790
04607da552ac [massive store] isolate SQL statements that one may want to customize
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11789
diff changeset
   253
        self.fill_meta_relation_table(tmp_tablename, 'is', eschema.eid)
11789
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   254
        for parent_eschema in chain(eschema.ancestors(), [eschema]):
11790
04607da552ac [massive store] isolate SQL statements that one may want to customize
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11789
diff changeset
   255
            self.fill_meta_relation_table(tmp_tablename, 'is_instance_of', parent_eschema.eid)
04607da552ac [massive store] isolate SQL statements that one may want to customize
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11789
diff changeset
   256
        self.fill_entities_table(etype, tmp_tablename)
04607da552ac [massive store] isolate SQL statements that one may want to customize
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11789
diff changeset
   257
04607da552ac [massive store] isolate SQL statements that one may want to customize
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11789
diff changeset
   258
    def fill_entities_table(self, etype, tmp_tablename):
        """Record the `etype` entities found in `tmp_tablename` into the ``entities`` table.

        Eids which already have a row in ``entities`` are left out by the query.
        """
        query_template = ("INSERT INTO entities(eid, type) "
                          "SELECT cw_eid, '%s' FROM %s "
                          "WHERE NOT EXISTS (SELECT 1 FROM entities WHERE eid=cw_eid)")
        self.sql(query_template % (etype, tmp_tablename))
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   264
11790
04607da552ac [massive store] isolate SQL statements that one may want to customize
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11789
diff changeset
   265
    def fill_relation_table(self, tablename, tmp_tablename):
        """Copy the distinct relations of `tmp_tablename` that `tablename` doesn't hold yet.

        A single ``INSERT ... SELECT DISTINCT`` statement is issued; (eid_from, eid_to)
        pairs already present in `tablename` are filtered out by a ``NOT EXISTS``
        subquery.
        """
        # XXX no index on the original relation table, EXISTS subquery may be sloooow
        names = {'table': tablename, 'tmp_table': tmp_tablename}
        query = ('INSERT INTO %(table)s(eid_from, eid_to) SELECT DISTINCT '
                 'T.eid_from, T.eid_to FROM %(tmp_table)s AS T '
                 'WHERE NOT EXISTS (SELECT 1 FROM %(table)s AS TT WHERE '
                 'TT.eid_from=T.eid_from AND TT.eid_to=T.eid_to);'
                 % names)
        self.sql(query)
04607da552ac [massive store] isolate SQL statements that one may want to customize
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11789
diff changeset
   272
04607da552ac [massive store] isolate SQL statements that one may want to customize
Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
parents: 11789
diff changeset
   273
    def fill_meta_relation_table(self, tmp_tablename, rtype, eid_to):
        """Relate the entities of `tmp_tablename` to `eid_to` through relation `rtype`.

        Rows are inserted into the ``<rtype>_relation`` table, with ``cw_eid`` values
        from `tmp_tablename` as subject. Eids which already have a row in the
        ``entities`` table are left out by the query.
        """
        query_template = ("INSERT INTO %s_relation(eid_from, eid_to) SELECT cw_eid, %s FROM %s "
                          "WHERE NOT EXISTS (SELECT 1 FROM entities WHERE eid=cw_eid)")
        self.sql(query_template % (rtype, eid_to, tmp_tablename))
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   277
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   278
    def _tmp_data_cleanup(self, tmp_tablename, ertype, uuid):
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   279
        """Drop temporary relation table and record from cwmassive_initialized."""
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   280
        self.sql('DROP TABLE %(tmp_table)s' % {'tmp_table': tmp_tablename})
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   281
        self.sql('DELETE FROM cwmassive_initialized '
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   282
                 'WHERE retype = %(rtype)s AND uuid = %(uuid)s',
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   283
                 {'rtype': ertype, 'uuid': uuid})
71df2811b422 [massive store] Store entities in temporary table as well
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11788
diff changeset
   284
11313
682b15eb2dd2 [dataimport] flake8
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11312
diff changeset
   285
    # FLUSH #################################################################
10853
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   286
11707
2c4518fea26f [massive store] Drop deprecated code
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11705
diff changeset
   287
    def flush_relations(self):
        """Copy the buffered relations into this store's temporary relation tables.

        For each relation type with pending rows, a copy-from buffer is built from the
        rows and loaded with ``copy_from`` into the ``<rtype>_relation_<uuid>`` table,
        then the in-memory buffer of that relation type is reset to an empty list.
        Relation types without pending rows are skipped.
        """
        columns = ('eid_from', 'eid_to')
        for rtype, rows in self._data_relations.items():
            # nothing was queued for this rtype during this flush round
            if rows:
                buf = pgstore._create_copyfrom_buffer(rows, columns)
                cursor = self._cnx.cnxset.cu
                tmp_tablename = '%s_%s' % ('%s_relation' % rtype.lower(), self.uuid)
                cursor.copy_from(buf, tmp_tablename, null='NULL', columns=columns)
                # start again from an empty buffer for this rtype
                self._data_relations[rtype] = []
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   300
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   301
    def flush_entities(self):
        """Copy the entities buffered in memory into their temporary table.

        Entity types without pending data are skipped. For the others, each buffered
        dictionary is first completed by the metadata generator, then layered over the
        values shared by the whole entity type: ``None`` for every initialized attribute,
        overridden by the schema default values, overridden by the metadata generator's
        base attributes. The resulting rows are sent to the ``cw_<etype>_<uuid>`` table
        with the cursor's ``copy_from`` and the in-memory buffer of the type is emptied.
        """
        metagen = self.metagen
        for etype, pending in self._data_entities.items():
            if not pending:
                # nothing was buffered for this entity type since the previous flush
                continue
            attrs = self._initialized[etype]
            # values shared by every entity of this type, later layers win
            shared = dict.fromkeys(attrs)
            shared.update(self.default_values[etype])
            shared.update(metagen.base_etype_attrs(etype))
            rows = []
            for entity in pending:
                # must run on the entity's own dictionary, before merging with `shared`:
                # it won't fill keys associated to None, and `shared` provides such keys
                metagen.init_entity_attrs(etype, entity['eid'], entity)
                # XXX warn/raise if there is some key not in attrs?
                row = dict(shared)
                row.update(entity)
                rows.append(row)
            buf = pgstore._create_copyfrom_buffer(rows, attrs)
            target = '%s_%s' % ('cw_%s' % etype.lower(), self.uuid)
            columns = ['cw_%s' % attr for attr in attrs]
            cursor = self._cnx.cnxset.cu
            cursor.copy_from(buf, target, null='NULL', columns=columns)
            # the rows are now in the database, forget about them
            self._data_entities[etype] = []
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   329
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   330
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   331
def get_default_values(schema):
    """Analyze yams ``schema`` and return the default values it declares.

    The returned value is a dictionary mapping each entity type to a
    sub-dictionary mapping attribute names to their default value. Attributes
    whose default is ``None`` are left out, so an entity type without any
    default is mapped to an empty dictionary.
    """
    default_values = {}
    for eschema in schema.entities():
        etype_defaults = default_values[eschema.type] = {}
        for rschema, _ in eschema.attribute_definitions():
            # ask for the default only once, so that the value which is tested is the very
            # same as the one which is stored
            default = eschema.default(rschema.type)
            if default is not None:
                etype_defaults[rschema.type] = default
    return default_values
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   348
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   349
de741492538d [dataimport] backport massive store from dataio cube
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   350
class PGHelper(object):
    """This class provides some helper methods to manipulate a postgres database metadata (index
    and constraints).

    Definitions of dropped indexes and constraints are saved in the ``cwmassive_constraints``
    table, from which they can be restored later.
    """

    def __init__(self, cnx):
        """Bind the helper to the connection `cnx`.

        Queries are executed through ``cnx.system_sql``. The postgres schema is taken from the
        'db-namespace' option of the system source configuration, 'public' being used when it
        is missing or empty.
        """
        self.sql = cnx.system_sql
        # Deals with pg schema, see #3216686
        self.pg_schema = cnx.repo.config.system_source_config.get('db-namespace') or 'public'

    def _create_constraints_table(self):
        """Create the table holding saved definitions, unless it already exists."""
        # Saving definitions in a table allows reloading even after crash
        self.sql('CREATE TABLE IF NOT EXISTS cwmassive_constraints(sql TEXT, insert_order SERIAL)')

    def _save_definition(self, query):
        """Record `query`, the SQL recreating a dropped index or constraint."""
        self.sql('INSERT INTO cwmassive_constraints(sql) VALUES (%(sql)s)', {'sql': query})

    def drop_indexes(self, tablename):
        """Drop indexes of the given table, storing their definition for later restore."""
        self._create_constraints_table()
        for name, query in self.table_indexes(tablename).items():
            # save before dropping, so that the definition is never lost
            self._save_definition(query)
            self.sql('DROP INDEX %s' % name)

    def drop_constraints(self, tablename):
        """Drop constraints of the given table, storing their definition for later restore."""
        self._create_constraints_table()
        for name, query in self.table_constraints(tablename).items():
            # save before dropping, so that the definition is never lost
            self._save_definition(query)
            self.sql('ALTER TABLE %s DROP CONSTRAINT %s' % (tablename, name))

    def restore_indexes_and_constraints(self):
        """Restore indexes and constraints previously saved by the `drop_*` methods.

        Saved statements are replayed from the most recently saved to the oldest one, each of
        them being removed from the save table once executed. The save table is dropped at the
        end. Nothing is done if the save table doesn't exist.
        """
        if not self.table_exists('cwmassive_constraints'):
            return
        cu = self.sql('SELECT sql, insert_order FROM cwmassive_constraints '
                      'ORDER BY insert_order DESC')
        for query, order in cu.fetchall():
            self.sql(query)
            self.sql('DELETE FROM cwmassive_constraints WHERE insert_order=%(order)s',
                     {'order': order})
        self.sql('DROP TABLE cwmassive_constraints')

    def table_exists(self, tablename):
        """Return True if the given table already exists in the database."""
        cu = self.sql('SELECT 1 from information_schema.tables '
                      'WHERE table_name=%(t)s AND table_schema=%(s)s',
                      {'t': tablename, 's': self.pg_schema})
        return bool(cu.fetchone())

    def table_indexes(self, tablename):
        """Return a dictionary of indexes {index name: index sql}, constraints included."""
        return {name: self._index_sql(name) for name in self._index_names(tablename)}

    def table_constraints(self, tablename):
        """Return a dictionary of constraints {constraint name: constraint sql}.

        The SQL is the full ``ALTER TABLE ... ADD CONSTRAINT`` statement recreating the
        constraint.
        """
        return {
            name: 'ALTER TABLE %s ADD CONSTRAINT %s %s' % (
                tablename, name, self._constraint_sql(name))
            for name in self._constraint_names(tablename)
        }

    def _index_names(self, tablename):
        """Return the names of all indexes in the given table (including constraints), except
        for primary key indexes.
        """
        cu = self.sql("SELECT c.relname FROM pg_catalog.pg_class c "
                      "JOIN pg_catalog.pg_index i ON i.indexrelid = c.oid "
                      "JOIN pg_catalog.pg_class c2 ON i.indrelid = c2.oid "
                      "LEFT JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner "
                      "LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace "
                      "WHERE c.relkind IN ('i','') "
                      " AND c2.relname = %(t)s "
                      " AND i.indisprimary = FALSE "
                      " AND n.nspname NOT IN ('pg_catalog', 'pg_toast') "
                      " AND pg_catalog.pg_table_is_visible(c.oid);", {'t': tablename})
        return [name for name, in cu.fetchall()]

    def _constraint_names(self, tablename):
        """Return the names of all constraints in the given table."""
        cu = self.sql("SELECT i.conname FROM pg_catalog.pg_class c "
                      "JOIN pg_catalog.pg_constraint i ON i.conrelid = c.oid "
                      "JOIN pg_catalog.pg_class c2 ON i.conrelid=c2.oid "
                      "LEFT JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner "
                      "LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace "
                      "WHERE c2.relname = %(t)s "
                      "AND n.nspname NOT IN ('pg_catalog', 'pg_toast') "
                      "AND pg_catalog.pg_table_is_visible(c.oid)", {'t': tablename})
        return [name for name, in cu.fetchall()]

    def _index_sql(self, name):
        """Return the SQL to be used to recreate the index of the given name."""
        # NOTE(review): the index is looked up in `self.pg_schema` only; `fetchone()` gives
        # None (hence a TypeError here) if no index of that name is found there
        return self.sql('SELECT pg_get_indexdef(c.oid) FROM pg_catalog.pg_class c '
                        'LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace '
                        'WHERE c.relname = %(r)s AND n.nspname=%(n)s',
                        {'r': name, 'n': self.pg_schema}).fetchone()[0]

    def _constraint_sql(self, name):
        """Return the SQL to be used to recreate the constraint."""
        # NOTE(review): the lookup is done by constraint name and schema, not by table;
        # this presumably relies on constraint names being unique in the schema -- confirm
        return self.sql('SELECT pg_get_constraintdef(c.oid) FROM pg_catalog.pg_constraint c '
                        'LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.connamespace '
                        'WHERE c.conname = %(r)s AND n.nspname=%(n)s',
                        {'r': name, 'n': self.pg_schema}).fetchone()[0]