doc/tutorials/dataimport/diseasome_parser.py
changeset 8836 8a57802d40d3
child 9702 c2108dbfb508
equal deleted inserted replaced
8835:3612b760488b 8836:8a57802d40d3
       
     1 # -*- coding: utf-8 -*-
       
     2 
       
     3 """
       
     4 Diseasome data import module.
       
     5 Its interface is the ``entities_from_rdf`` function.
       
     6 """
       
     7 
       
     8 import re
       
     9 RE_RELS = re.compile(r'^<(.*?)>\s<(.*?)>\s<(.*?)>\s*\.')
       
    10 RE_ATTS = re.compile(r'^<(.*?)>\s<(.*?)>\s"(.*)"(\^\^<(.*?)>|)\s*\.')
       
    11 
       
    12 MAPPING_ATTS = {'bio2rdfSymbol': 'bio2rdf_symbol',
       
    13                 'label': 'label',
       
    14                 'name': 'name',
       
    15                 'classDegree': 'class_degree',
       
    16                 'degree': 'degree',
       
    17                 'size': 'size'}
       
    18 
       
    19 MAPPING_RELS = {'geneId': 'gene_id',
       
    20                 'hgncId': 'hgnc_id', 
       
    21                 'hgncIdPage': 'hgnc_page', 
       
    22                 'sameAs': 'same_as', 
       
    23                 'class': 'classes', 
       
    24                 'diseaseSubtypeOf': 'subtype_of', 
       
    25                 'associatedGene': 'associated_genes', 
       
    26                 'possibleDrug': 'possible_drugs',
       
    27                 'type': 'types',
       
    28                 'omim': 'omim', 
       
    29                 'omimPage': 'omim_page', 
       
    30                 'chromosomalLocation': 'chromosomal_location'}
       
    31 
       
    32 def _retrieve_reltype(uri):
       
    33     """
       
    34     Retrieve a relation type from an URI.
       
    35 
       
    36     Internal function which takes an URI containing a relation type as input
       
    37     and returns the name of the relation.
       
    38     If no URI string is given, then the function returns None.
       
    39     """
       
    40     if uri:
       
    41         return uri.rsplit('/', 1)[-1].rsplit('#', 1)[-1]
       
    42 
       
    43 def _retrieve_etype(tri_uri):
       
    44     """
       
    45     Retrieve entity type from a triple of URIs.
       
    46 
       
    47     Internal function whith takes a tuple of three URIs as input
       
    48     and returns the type of the entity, as obtained from the
       
    49     first member of the tuple.
       
    50     """
       
    51     if tri_uri:
       
    52         return tri_uri.split('> <')[0].rsplit('/', 2)[-2].rstrip('s')
       
    53 
       
    54 def _retrieve_structure(filename, etypes):
       
    55     """
       
    56     Retrieve a (subject, relation, object) tuples iterator from a file.
       
    57 
       
    58     Internal function which takes as input a file name and a tuple of 
       
    59     entity types, and returns an iterator of (subject, relation, object)
       
    60     tuples.
       
    61     """
       
    62     with open(filename) as fil:
       
    63         for line in fil:
       
    64             if _retrieve_etype(line) not in etypes:
       
    65                 continue
       
    66             match = RE_RELS.match(line)
       
    67             if not match:
       
    68                 match = RE_ATTS.match(line)
       
    69             subj = match.group(1)
       
    70             relation = _retrieve_reltype(match.group(2))
       
    71             obj = match.group(3)
       
    72             yield subj, relation, obj
       
    73 
       
    74 def entities_from_rdf(filename, etypes):
       
    75     """
       
    76     Return entities from an RDF file.
       
    77 
       
    78     Module interface function which takes as input a file name and
       
    79     a tuple of entity types, and returns an iterator on the 
       
    80     attributes and relations of each entity. The attributes
       
    81     and relations are retrieved as dictionaries.
       
    82     
       
    83     >>> for entities, relations in entities_from_rdf('data_file', 
       
    84                                                      ('type_1', 'type_2')):
       
    85         ...
       
    86     """
       
    87     entities = {}
       
    88     for subj, rel, obj in _retrieve_structure(filename, etypes):
       
    89         entities.setdefault(subj, {})
       
    90         entities[subj].setdefault('attributes', {})
       
    91         entities[subj].setdefault('relations', {})
       
    92         entities[subj]['attributes'].setdefault('cwuri', unicode(subj))
       
    93         if rel in MAPPING_ATTS:
       
    94             entities[subj]['attributes'].setdefault(MAPPING_ATTS[rel], 
       
    95                                                     unicode(obj))
       
    96         if rel in MAPPING_RELS:
       
    97             entities[subj]['relations'].setdefault(MAPPING_RELS[rel], set())
       
    98             entities[subj]['relations'][MAPPING_RELS[rel]].add(unicode(obj))
       
    99     return ((ent.get('attributes'), ent.get('relations')) 
       
   100             for ent in entities.itervalues())