--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/doc/tutorials/dataimport/diseasome_parser.py Tue Mar 12 18:31:15 2013 +0100
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+
+"""
+Diseasome data import module.
+Its interface is the ``entities_from_rdf`` function.
+"""
+
+import re
+RE_RELS = re.compile(r'^<(.*?)>\s<(.*?)>\s<(.*?)>\s*\.')
+RE_ATTS = re.compile(r'^<(.*?)>\s<(.*?)>\s"(.*)"(\^\^<(.*?)>|)\s*\.')
+
+MAPPING_ATTS = {'bio2rdfSymbol': 'bio2rdf_symbol',
+ 'label': 'label',
+ 'name': 'name',
+ 'classDegree': 'class_degree',
+ 'degree': 'degree',
+ 'size': 'size'}
+
+MAPPING_RELS = {'geneId': 'gene_id',
+ 'hgncId': 'hgnc_id',
+ 'hgncIdPage': 'hgnc_page',
+ 'sameAs': 'same_as',
+ 'class': 'classes',
+ 'diseaseSubtypeOf': 'subtype_of',
+ 'associatedGene': 'associated_genes',
+ 'possibleDrug': 'possible_drugs',
+ 'type': 'types',
+ 'omim': 'omim',
+ 'omimPage': 'omim_page',
+ 'chromosomalLocation': 'chromosomal_location'}
+
+def _retrieve_reltype(uri):
+ """
+ Retrieve a relation type from an URI.
+
+ Internal function which takes an URI containing a relation type as input
+ and returns the name of the relation.
+ If no URI string is given, then the function returns None.
+ """
+ if uri:
+ return uri.rsplit('/', 1)[-1].rsplit('#', 1)[-1]
+
+def _retrieve_etype(tri_uri):
+ """
+ Retrieve entity type from a triple of URIs.
+
+ Internal function whith takes a tuple of three URIs as input
+ and returns the type of the entity, as obtained from the
+ first member of the tuple.
+ """
+ if tri_uri:
+ return tri_uri.split('> <')[0].rsplit('/', 2)[-2].rstrip('s')
+
+def _retrieve_structure(filename, etypes):
+ """
+ Retrieve a (subject, relation, object) tuples iterator from a file.
+
+ Internal function which takes as input a file name and a tuple of
+ entity types, and returns an iterator of (subject, relation, object)
+ tuples.
+ """
+ with open(filename) as fil:
+ for line in fil:
+ if _retrieve_etype(line) not in etypes:
+ continue
+ match = RE_RELS.match(line)
+ if not match:
+ match = RE_ATTS.match(line)
+ subj = match.group(1)
+ relation = _retrieve_reltype(match.group(2))
+ obj = match.group(3)
+ yield subj, relation, obj
+
+def entities_from_rdf(filename, etypes):
+ """
+ Return entities from an RDF file.
+
+ Module interface function which takes as input a file name and
+ a tuple of entity types, and returns an iterator on the
+ attributes and relations of each entity. The attributes
+ and relations are retrieved as dictionaries.
+
+ >>> for entities, relations in entities_from_rdf('data_file',
+ ('type_1', 'type_2')):
+ ...
+ """
+ entities = {}
+ for subj, rel, obj in _retrieve_structure(filename, etypes):
+ entities.setdefault(subj, {})
+ entities[subj].setdefault('attributes', {})
+ entities[subj].setdefault('relations', {})
+ entities[subj]['attributes'].setdefault('cwuri', unicode(subj))
+ if rel in MAPPING_ATTS:
+ entities[subj]['attributes'].setdefault(MAPPING_ATTS[rel],
+ unicode(obj))
+ if rel in MAPPING_RELS:
+ entities[subj]['relations'].setdefault(MAPPING_RELS[rel], set())
+ entities[subj]['relations'][MAPPING_RELS[rel]].add(unicode(obj))
+ return ((ent.get('attributes'), ent.get('relations'))
+ for ent in entities.itervalues())