|
1 # -*- coding: utf-8 -*- |
|
2 |
|
3 """ |
|
4 Diseasome data import module. |
|
5 Its interface is the ``entities_from_rdf`` function. |
|
6 """ |
|
7 |
|
8 import re |
|
# Triple whose object is a URI: ``<subject> <predicate> <object> .``
RE_RELS = re.compile(r'^<(.*?)>\s<(.*?)>\s<(.*?)>\s*\.')
# Triple whose object is a quoted literal, optionally followed by a
# ``^^<datatype>`` suffix: ``<subject> <predicate> "value"^^<type> .``
RE_ATTS = re.compile(r'^<(.*?)>\s<(.*?)>\s"(.*)"(\^\^<(.*?)>|)\s*\.')

# Relation names (as found at the end of predicate URIs) that are stored
# as entity attributes, mapped to the attribute name used in the output.
MAPPING_ATTS = {
    'bio2rdfSymbol': 'bio2rdf_symbol',
    'label': 'label',
    'name': 'name',
    'classDegree': 'class_degree',
    'degree': 'degree',
    'size': 'size',
}

# Relation names that are stored as entity relations (sets of values),
# mapped to the relation name used in the output.
MAPPING_RELS = {
    'geneId': 'gene_id',
    'hgncId': 'hgnc_id',
    'hgncIdPage': 'hgnc_page',
    'sameAs': 'same_as',
    'class': 'classes',
    'diseaseSubtypeOf': 'subtype_of',
    'associatedGene': 'associated_genes',
    'possibleDrug': 'possible_drugs',
    'type': 'types',
    'omim': 'omim',
    'omimPage': 'omim_page',
    'chromosomalLocation': 'chromosomal_location',
}
|
31 |
|
32 def _retrieve_reltype(uri): |
|
33 """ |
|
34 Retrieve a relation type from an URI. |
|
35 |
|
36 Internal function which takes an URI containing a relation type as input |
|
37 and returns the name of the relation. |
|
38 If no URI string is given, then the function returns None. |
|
39 """ |
|
40 if uri: |
|
41 return uri.rsplit('/', 1)[-1].rsplit('#', 1)[-1] |
|
42 |
|
43 def _retrieve_etype(tri_uri): |
|
44 """ |
|
45 Retrieve entity type from a triple of URIs. |
|
46 |
|
47 Internal function whith takes a tuple of three URIs as input |
|
48 and returns the type of the entity, as obtained from the |
|
49 first member of the tuple. |
|
50 """ |
|
51 if tri_uri: |
|
52 return tri_uri.split('> <')[0].rsplit('/', 2)[-2].rstrip('s') |
|
53 |
|
def _retrieve_structure(filename, etypes):
    """
    Retrieve a (subject, relation, object) tuples iterator from a file.

    Internal function which takes as input a file name and a collection
    of entity types, and yields one (subject, relation, object) tuple per
    line of the file whose subject belongs to one of these entity types.

    The subject and the object are taken verbatim from the line (the
    object is either a URI or the content of a quoted literal); the
    relation is the short name extracted from the predicate URI.

    Lines that match neither triple pattern are skipped rather than
    aborting the whole import.
    """
    with open(filename) as fil:
        for line in fil:
            if _retrieve_etype(line) not in etypes:
                continue
            # Try the URI-object form first, then the literal-object form.
            match = RE_RELS.match(line) or RE_ATTS.match(line)
            if match is None:
                # Neither pattern recognises this line: nothing to extract.
                continue
            subj = match.group(1)
            relation = _retrieve_reltype(match.group(2))
            obj = match.group(3)
            yield subj, relation, obj
|
73 |
|
def entities_from_rdf(filename, etypes):
    """
    Return entities from an RDF file.

    Module interface function which takes as input a file name and
    a tuple of entity types, and returns an iterator yielding one
    ``(attributes, relations)`` pair per entity (one entity per distinct
    subject found in the file).

    ``attributes`` is a dictionary mapping attribute names to text
    values; it always holds the subject URI under ``'cwuri'``, and only
    the first value met for a given attribute is kept.
    ``relations`` is a dictionary mapping relation names to the set of
    all values met for that relation.

    Example::

        for attributes, relations in entities_from_rdf('data_file',
                                                       ('type_1', 'type_2')):
            ...
    """
    # ``unicode`` only exists on Python 2; on Python 3 ``str`` is already
    # the text type.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    entities = {}
    # The whole file is consumed here, before the iterator is returned.
    for subj, rel, obj in _retrieve_structure(filename, etypes):
        entity = entities.get(subj)
        if entity is None:
            entity = entities[subj] = {'attributes': {}, 'relations': {}}
        attributes = entity['attributes']
        relations = entity['relations']
        attributes.setdefault('cwuri', to_text(subj))
        if rel in MAPPING_ATTS:
            # setdefault: the first value met for an attribute wins.
            attributes.setdefault(MAPPING_ATTS[rel], to_text(obj))
        if rel in MAPPING_RELS:
            relations.setdefault(MAPPING_RELS[rel], set()).add(to_text(obj))
    return ((ent['attributes'], ent['relations'])
            for ent in entities.values())