doc/tutorials/dataimport/diseasome_import.py
changeset 8836 8a57802d40d3
child 8927 885dea8f16a0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/tutorials/dataimport/diseasome_import.py	Tue Mar 12 18:31:15 2013 +0100
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""This module imports the Diseasome data into a CubicWeb instance.
+"""
+
+# Python imports
+import sys
+import argparse
+
+# Logilab import, for timing
+from logilab.common.decorators import timed
+
+# CubicWeb imports
+import cubicweb.dataimport as cwdi
+from cubes.dataio import dataimport as mcwdi
+
+# Diseasome parser import
+import diseasome_parser as parser
+
+def _is_of_class(instance, class_name):
+    """Tell whether ``instance`` is of the class named ``class_name``.
+
+    Only the name of the instance's own class is compared with
+    ``class_name``; the names of its base classes are not looked at.
+    Returns ``True`` when the names are equal and ``False`` otherwise.
+    """
+    return instance.__class__.__name__ == class_name
+
+@timed
+def diseasome_import(session, file_name, store):
+    """Main function for importing Diseasome data.
+
+    It uses the Diseasome data parser to get the contents of the
+    data from a file, then uses a store for importing the data
+    into a CubicWeb instance.
+
+    >>> diseasome_import(session, 'file_name', Store)
+
+    :param session: object whose ``execute`` method runs the RQL query
+      listing the ``ExternalUri`` entities that already exist.
+    :param file_name: name of the RDF file passed to the Diseasome parser.
+    :param store: import store. A store whose class is named
+      ``MassiveObjectStore`` is driven through ``init_rtype_table``,
+      ``relate_by_iid``, ``flush_meta_data``, ``convert_relations`` and
+      ``cleanup``, in addition to ``create_entity`` and ``flush``; any
+      other store through ``create_entity``, ``relate`` and ``flush``.
+
+    Entities whose type is not recognized and relations whose object
+    was not imported are reported on ``sys.stderr`` and skipped.
+    """
+    # Relations having "ExternalUri" objects, for each subject type.
+    disease_exturi_rtypes = ('classes', 'possible_drugs', 'omim', 'omim_page',
+                             'chromosomal_location', 'same_as')
+    gene_exturi_rtypes = ('gene_id', 'hgnc_id', 'hgnc_page', 'same_as')
+    exturi_rtypes = frozenset(disease_exturi_rtypes + gene_exturi_rtypes)
+    # The kind of store does not change during the import: test it once.
+    massive = _is_of_class(store, 'MassiveObjectStore')
+    exturis = dict(session.execute('Any U, X WHERE X is ExternalUri, X uri U'))
+    uri_to_eid = {}
+    uri_to_etype = {}
+    all_relations = {}
+    etypes = {('http://www4.wiwiss.fu-berlin.de/'
+               'diseasome/resource/diseasome/genes'): 'Gene',
+              ('http://www4.wiwiss.fu-berlin.de/'
+               'diseasome/resource/diseasome/diseases'): 'Disease'}
+    # Read the parsed data
+    for entity, relations in parser.entities_from_rdf(file_name,
+                                                      ('gene', 'disease')):
+        uri = entity.get('cwuri', None)
+        types = list(relations.get('types', []))
+        if not types:
+            continue
+        etype = etypes.get(types[0])
+        if not etype:
+            # ``write`` accepts a single string, so the message has to be
+            # formatted first. An entity of unknown type cannot be created:
+            # skip it instead of handing ``None`` to the store.
+            sys.stderr.write('Entity type %s not recognized.\n' % types[0])
+            sys.stderr.flush()
+            continue
+        if massive:
+            rtypes = set(relations)
+            for relation in rtypes.intersection(exturi_rtypes):
+                store.init_rtype_table(etype, relation, 'ExternalUri')
+            if 'subtype_of' in rtypes:
+                store.init_rtype_table(etype, 'subtype_of', 'Disease')
+            if 'associated_genes' in rtypes:
+                store.init_rtype_table(etype, 'associated_genes', 'Gene')
+        # Create the entities
+        ent = store.create_entity(etype, **entity)
+        if massive:
+            # For the "MassiveObjectStore", the URIs play the role of EIDs.
+            uri_to_eid[uri] = uri
+            uri_to_etype[uri] = etype
+        else:
+            uri_to_eid[uri] = ent.eid
+            uri_to_etype[uri] = ent.dc_type()
+        # Store relations for after
+        all_relations[uri] = relations
+    # Perform a first commit, of the entities
+    store.flush()
+    for uri, relations in all_relations.items():
+        from_eid = uri_to_eid.get(uri)
+        if not from_eid:
+            continue
+        # ``subjtype`` should be initialized if ``SQLGenObjectStore`` is used
+        # and there are inlined relations in the schema.
+        # If ``subjtype`` is not given, while ``SQLGenObjectStore`` is used
+        # and there are inlined relations in the schema, the store
+        # tries to infer the type of the subject, but this does not always
+        # work, e.g. when there are several object types for the relation.
+        # ``subjtype`` is ignored for other stores, or if there are no
+        # inlined relations in the schema.
+        kwargs = {'subjtype': uri_to_etype.get(uri)}
+        for rtype, rels in relations.items():
+            if rtype in exturi_rtypes:
+                for rel in rels:
+                    if rel not in exturis:
+                        # Create the "ExternalUri" entities, which are the
+                        # objects of the relations
+                        extu = store.create_entity('ExternalUri', uri=rel)
+                        exturis[rel] = rel if massive else extu.eid
+                    # Create the relations that have "ExternalUri"s as objects
+                    if massive:
+                        # Always relate by URI, as is done for the entities
+                        # created above: for an "ExternalUri" that already
+                        # existed, ``exturis`` maps the URI to the ``X``
+                        # value returned by the RQL query instead.
+                        store.relate_by_iid(from_eid, rtype, rel)
+                    else:
+                        store.relate(from_eid, rtype, exturis[rel], **kwargs)
+            elif rtype in ('subtype_of', 'associated_genes'):
+                for rel in rels:
+                    to_eid = uri_to_eid.get(rel)
+                    if not to_eid:
+                        sys.stderr.write('Missing entity with URI %s '
+                                         'for relation %s\n' % (rel, rtype))
+                        sys.stderr.flush()
+                        continue
+                    # Create relations that have objects of other type
+                    # than "ExternalUri"
+                    if massive:
+                        store.relate_by_iid(from_eid, rtype, to_eid)
+                    else:
+                        store.relate(from_eid, rtype, to_eid, **kwargs)
+    # Perform a second commit, of the "ExternalUri" entities.
+    # When the stores in the CubicWeb ``dataimport`` module are used,
+    # relations are also committed.
+    store.flush()
+    # If the ``MassiveObjectStore`` is used, then entity and relation metadata
+    # are pushed as well. By metadata we mean information on the creation
+    # time and author.
+    if massive:
+        store.flush_meta_data()
+        # Afterwards, relations are actually created in the database.
+        for relation in disease_exturi_rtypes:
+            store.convert_relations('Disease', relation, 'ExternalUri',
+                                    'cwuri', 'uri')
+        store.convert_relations('Disease', 'subtype_of', 'Disease',
+                                'cwuri', 'cwuri')
+        store.convert_relations('Disease', 'associated_genes', 'Gene',
+                                'cwuri', 'cwuri')
+        for relation in gene_exturi_rtypes:
+            store.convert_relations('Gene', relation, 'ExternalUri',
+                                    'cwuri', 'uri')
+        # Clean up temporary tables in the database
+        store.cleanup()
+
+if __name__ == '__main__':
+    # When run through ``cubicweb-ctl shell``, the options of this script
+    # follow a ``--`` marker: rebuild ``sys.argv`` from the argument that
+    # precedes the marker onwards, without the marker itself, so that
+    # ``argparse`` only gets the options given to ``diseasome_import.py``.
+    sys.argv = [arg for arg in sys.argv[sys.argv.index("--") - 1:]
+                if arg != "--"]
+    PARSER = argparse.ArgumentParser(description="Import Diseasome data")
+    PARSER.add_argument("-df", "--datafile", type=str,
+                        help="RDF data file name")
+    PARSER.add_argument("-st", "--store", type=str, default="RQLObjectStore",
+                        help="data import store")
+    ARGS = PARSER.parse_args()
+    # Nothing can be imported without a data file.
+    if not ARGS.datafile:
+        sys.exit("Data file not found or not specified")
+    FILENAME = ARGS.datafile
+    # NOTE(review): ``session`` is not defined in this file; presumably it
+    # is provided by the ``cubicweb-ctl shell`` environment -- to confirm.
+    if ARGS.store == "MassiveObjectStore":
+        IMPORT_STORE = mcwdi.MassiveObjectStore(session)
+    elif ARGS.store in ("RQLObjectStore", "NoHookRQLObjectStore",
+                        "SQLGenObjectStore"):
+        # These stores are looked up by name in ``cubicweb.dataimport``.
+        IMPORT_STORE = getattr(cwdi, ARGS.store)(session)
+    else:
+        sys.exit("Import store unknown")
+    diseasome_import(session, FILENAME, IMPORT_STORE)