sobjects/parsers.py
changeset 8188 1867e252e487
parent 8187 981f6e487788
child 8189 2ee0ef069fa7
equal deleted inserted replaced
8187:981f6e487788 8188:1867e252e487
     1 # copyright 2010-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
       
     2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
       
     3 #
       
     4 # This file is part of CubicWeb.
       
     5 #
       
     6 # CubicWeb is free software: you can redistribute it and/or modify it under the
       
     7 # terms of the GNU Lesser General Public License as published by the Free
       
     8 # Software Foundation, either version 2.1 of the License, or (at your option)
       
     9 # any later version.
       
    10 #
       
    11 # CubicWeb is distributed in the hope that it will be useful, but WITHOUT
       
    12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
       
    13 # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
       
    14 # details.
       
    15 #
       
    16 # You should have received a copy of the GNU Lesser General Public License along
       
    17 # with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
       
    18 """datafeed parser for xml generated by cubicweb
       
    19 
       
    20 Example of mapping for CWEntityXMLParser::
       
    21 
       
    22   {u'CWUser': {                                        # EntityType
       
    23       (u'in_group', u'subject', u'link'): [            # (rtype, role, action)
       
    24           (u'CWGroup', {u'linkattr': u'name'})],       #   -> rules = [(EntityType, options), ...]
       
    25       (u'tags', u'object', u'link-or-create'): [       # (...)
       
    26           (u'Tag', {u'linkattr': u'name'})],           #   -> ...
       
    27       (u'use_email', u'subject', u'copy'): [           # (...)
       
    28           (u'EmailAddress', {})]                       #   -> ...
       
    29       }
       
    30    }
       
    31 
       
    32 """
       
    33 
       
    34 import os.path as osp
       
    35 from datetime import datetime, timedelta, time
       
    36 from urllib import urlencode
       
    37 from cgi import parse_qs # in urlparse with python >= 2.6
       
    38 
       
    39 from logilab.common.date import todate, totime
       
    40 from logilab.common.textutils import splitstrip, text_to_dict
       
    41 from logilab.common.decorators import classproperty
       
    42 
       
    43 from yams.constraints import BASE_CONVERTERS
       
    44 from yams.schema import role_name as rn
       
    45 
       
    46 from cubicweb import ValidationError, RegistryException, typed_eid
       
    47 from cubicweb.view import Component
       
    48 from cubicweb.server.sources import datafeed
       
    49 from cubicweb.server.hook import match_rtype
       
    50 
       
    51 # XXX see cubicweb.cwvreg.YAMS_TO_PY
       
    52 # XXX see cubicweb.web.views.xmlrss.SERIALIZERS
       
    53 DEFAULT_CONVERTERS = BASE_CONVERTERS.copy()
       
    54 DEFAULT_CONVERTERS['String'] = unicode
       
    55 DEFAULT_CONVERTERS['Password'] = lambda x: x.encode('utf8')
       
    56 def convert_date(ustr):
       
    57     return todate(datetime.strptime(ustr, '%Y-%m-%d'))
       
    58 DEFAULT_CONVERTERS['Date'] = convert_date
       
    59 def convert_datetime(ustr):
       
    60     if '.' in ustr: # assume %Y-%m-%d %H:%M:%S.mmmmmm
       
    61         ustr = ustr.split('.',1)[0]
       
    62     return datetime.strptime(ustr, '%Y-%m-%d %H:%M:%S')
       
    63 DEFAULT_CONVERTERS['Datetime'] = convert_datetime
       
    64 # XXX handle timezone, though this will be enough as TZDatetime are
       
    65 # serialized without time zone by default (UTC time). See
       
    66 # cw.web.views.xmlrss.SERIALIZERS.
       
    67 DEFAULT_CONVERTERS['TZDatetime'] = convert_datetime
       
    68 def convert_time(ustr):
       
    69     return totime(datetime.strptime(ustr, '%H:%M:%S'))
       
    70 DEFAULT_CONVERTERS['Time'] = convert_time
       
    71 DEFAULT_CONVERTERS['TZTime'] = convert_time
       
    72 def convert_interval(ustr):
       
    73     return time(seconds=int(ustr))
       
    74 DEFAULT_CONVERTERS['Interval'] = convert_interval
       
    75 
       
    76 def extract_typed_attrs(eschema, stringdict, converters=DEFAULT_CONVERTERS):
       
    77     typeddict = {}
       
    78     for rschema in eschema.subject_relations():
       
    79         if rschema.final and rschema in stringdict:
       
    80             if rschema in ('eid', 'cwuri', 'cwtype', 'cwsource'):
       
    81                 continue
       
    82             attrtype = eschema.destination(rschema)
       
    83             value = stringdict[rschema]
       
    84             if value is not None:
       
    85                 value = converters[attrtype](value)
       
    86             typeddict[rschema.type] = value
       
    87     return typeddict
       
    88 
       
    89 def rtype_role_rql(rtype, role):
       
    90     if role == 'object':
       
    91         return 'Y %s X WHERE X eid %%(x)s' % rtype
       
    92     else:
       
    93         return 'X %s Y WHERE X eid %%(x)s' % rtype
       
    94 
       
    95 
       
    96 class CWEntityXMLParser(datafeed.DataFeedXMLParser):
       
    97     """datafeed parser for the 'xml' entity view
       
    98 
       
    99     Most of the logic is delegated to the following components:
       
   100 
       
   101     * an "item builder" component, turning an etree xml node into a specific
       
   102       python dictionnary representing an entity
       
   103 
       
   104     * "action" components, selected given an entity, a relation and its role in
       
   105       the relation, and responsible to link the entity to given related items
       
   106       (eg dictionnary)
       
   107 
       
   108     So the parser is only doing the gluing service and the connection to the
       
   109     source.
       
   110     """
       
   111     __regid__ = 'cw.entityxml'
       
   112 
       
   113     def __init__(self, *args, **kwargs):
       
   114         super(CWEntityXMLParser, self).__init__(*args, **kwargs)
       
   115         self._parsed_urls = {}
       
   116         self._processed_entities = set()
       
   117 
       
   118     def select_linker(self, action, rtype, role, entity=None):
       
   119         try:
       
   120             return self._cw.vreg['components'].select(
       
   121                 'cw.entityxml.action.%s' % action, self._cw, entity=entity,
       
   122                 rtype=rtype, role=role, parser=self)
       
   123         except RegistryException:
       
   124             raise RegistryException('Unknown action %s' % action)
       
   125 
       
   126     def list_actions(self):
       
   127         reg = self._cw.vreg['components']
       
   128         return sorted(clss[0].action for rid, clss in reg.iteritems()
       
   129                       if rid.startswith('cw.entityxml.action.'))
       
   130 
       
   131     # mapping handling #########################################################
       
   132 
       
   133     def add_schema_config(self, schemacfg, checkonly=False):
       
   134         """added CWSourceSchemaConfig, modify mapping accordingly"""
       
   135         _ = self._cw._
       
   136         try:
       
   137             rtype = schemacfg.schema.rtype.name
       
   138         except AttributeError:
       
   139             msg = _("entity and relation types can't be mapped, only attributes "
       
   140                     "or relations")
       
   141             raise ValidationError(schemacfg.eid, {rn('cw_for_schema', 'subject'): msg})
       
   142         if schemacfg.options:
       
   143             options = text_to_dict(schemacfg.options)
       
   144         else:
       
   145             options = {}
       
   146         try:
       
   147             role = options.pop('role')
       
   148             if role not in ('subject', 'object'):
       
   149                 raise KeyError
       
   150         except KeyError:
       
   151             msg = _('"role=subject" or "role=object" must be specified in options')
       
   152             raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
       
   153         try:
       
   154             action = options.pop('action')
       
   155             linker = self.select_linker(action, rtype, role)
       
   156             linker.check_options(options, schemacfg.eid)
       
   157         except KeyError:
       
   158             msg = _('"action" must be specified in options; allowed values are '
       
   159                     '%s') % ', '.join(self.list_actions())
       
   160             raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
       
   161         except RegistryException:
       
   162             msg = _('allowed values for "action" are %s') % ', '.join(self.list_actions())
       
   163             raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
       
   164         if not checkonly:
       
   165             if role == 'subject':
       
   166                 etype = schemacfg.schema.stype.name
       
   167                 ttype = schemacfg.schema.otype.name
       
   168             else:
       
   169                 etype = schemacfg.schema.otype.name
       
   170                 ttype = schemacfg.schema.stype.name
       
   171             etyperules = self.source.mapping.setdefault(etype, {})
       
   172             etyperules.setdefault((rtype, role, action), []).append(
       
   173                 (ttype, options) )
       
   174             self.source.mapping_idx[schemacfg.eid] = (
       
   175                 etype, rtype, role, action, ttype)
       
   176 
       
   177     def del_schema_config(self, schemacfg, checkonly=False):
       
   178         """deleted CWSourceSchemaConfig, modify mapping accordingly"""
       
   179         etype, rtype, role, action, ttype = self.source.mapping_idx[schemacfg.eid]
       
   180         rules = self.source.mapping[etype][(rtype, role, action)]
       
   181         rules = [x for x in rules if not x[0] == ttype]
       
   182         if not rules:
       
   183             del self.source.mapping[etype][(rtype, role, action)]
       
   184 
       
   185     # import handling ##########################################################
       
   186 
       
   187     def process(self, url, raise_on_error=False, partialcommit=True):
       
   188         """IDataFeedParser main entry point"""
       
   189         if url.startswith('http'): # XXX similar loose test as in parse of sources.datafeed
       
   190             url = self.complete_url(url)
       
   191         super(CWEntityXMLParser, self).process(url, raise_on_error, partialcommit)
       
   192 
       
   193     def parse_etree(self, parent):
       
   194         for node in list(parent):
       
   195             builder = self._cw.vreg['components'].select(
       
   196                 'cw.entityxml.item-builder', self._cw, node=node,
       
   197                 parser=self)
       
   198             yield builder.build_item()
       
   199 
       
   200     def process_item(self, item, rels):
       
   201         """
       
   202         item and rels are what's returned by the item builder `build_item` method:
       
   203 
       
   204         * `item` is an {attribute: value} dictionary
       
   205         * `rels` is for relations and structured as
       
   206            {role: {relation: [(related item, related rels)...]}
       
   207         """
       
   208         entity = self.extid2entity(str(item['cwuri']),  item['cwtype'],
       
   209                                    cwsource=item['cwsource'], item=item)
       
   210         if entity is None:
       
   211             return None
       
   212         if entity.eid in self._processed_entities:
       
   213             return entity
       
   214         self._processed_entities.add(entity.eid)
       
   215         if not (self.created_during_pull(entity) or self.updated_during_pull(entity)):
       
   216             self.notify_updated(entity)
       
   217             attrs = extract_typed_attrs(entity.e_schema, item)
       
   218             # check modification date and compare attribute values to only
       
   219             # update what's actually needed
       
   220             entity.complete(tuple(attrs))
       
   221             mdate = attrs.get('modification_date')
       
   222             if not mdate or mdate > entity.modification_date:
       
   223                 attrs = dict( (k, v) for k, v in attrs.iteritems()
       
   224                               if v != getattr(entity, k))
       
   225                 if attrs:
       
   226                     entity.set_attributes(**attrs)
       
   227         self.process_relations(entity, rels)
       
   228         return entity
       
   229 
       
   230     def process_relations(self, entity, rels):
       
   231         etype = entity.__regid__
       
   232         for (rtype, role, action), rules in self.source.mapping.get(etype, {}).iteritems():
       
   233             try:
       
   234                 related_items = rels[role][rtype]
       
   235             except KeyError:
       
   236                 self.import_log.record_error('relation %s-%s not found in xml export of %s'
       
   237                                              % (rtype, role, etype))
       
   238                 continue
       
   239             try:
       
   240                 linker = self.select_linker(action, rtype, role, entity)
       
   241             except RegistryException:
       
   242                 self.import_log.record_error('no linker for action %s' % action)
       
   243             else:
       
   244                 linker.link_items(related_items, rules)
       
   245 
       
   246     def before_entity_copy(self, entity, sourceparams):
       
   247         """IDataFeedParser callback"""
       
   248         attrs = extract_typed_attrs(entity.e_schema, sourceparams['item'])
       
   249         entity.cw_edited.update(attrs)
       
   250 
       
   251     def complete_url(self, url, etype=None, known_relations=None):
       
   252         """append to the url's query string information about relation that should
       
   253         be included in the resulting xml, according to source mapping.
       
   254 
       
   255         If etype is not specified, try to guess it using the last path part of
       
   256         the url, i.e. the format used by default in cubicweb to map all entities
       
   257         of a given type as in 'http://mysite.org/EntityType'.
       
   258 
       
   259         If `known_relations` is given, it should be a dictionary of already
       
   260         known relations, so they don't get queried again.
       
   261         """
       
   262         try:
       
   263             url, qs = url.split('?', 1)
       
   264         except ValueError:
       
   265             qs = ''
       
   266         params = parse_qs(qs)
       
   267         if not 'vid' in params:
       
   268             params['vid'] = ['xml']
       
   269         if etype is None:
       
   270             try:
       
   271                 etype = url.rsplit('/', 1)[1]
       
   272             except ValueError:
       
   273                 return url + '?' + self._cw.build_url_params(**params)
       
   274             try:
       
   275                 etype = self._cw.vreg.case_insensitive_etypes[etype.lower()]
       
   276             except KeyError:
       
   277                 return url + '?' + self._cw.build_url_params(**params)
       
   278         relations = params.setdefault('relation', [])
       
   279         for rtype, role, _ in self.source.mapping.get(etype, ()):
       
   280             if known_relations and rtype in known_relations.get('role', ()):
       
   281                 continue
       
   282             reldef = '%s-%s' % (rtype, role)
       
   283             if not reldef in relations:
       
   284                 relations.append(reldef)
       
   285         return url + '?' + self._cw.build_url_params(**params)
       
   286 
       
   287     def complete_item(self, item, rels):
       
   288         try:
       
   289             return self._parsed_urls[item['cwuri']]
       
   290         except KeyError:
       
   291             itemurl = self.complete_url(item['cwuri'], item['cwtype'], rels)
       
   292             item_rels = list(self.parse(itemurl))
       
   293             assert len(item_rels) == 1, 'url %s expected to bring back one '\
       
   294                    'and only one entity, got %s' % (itemurl, len(item_rels))
       
   295             self._parsed_urls[item['cwuri']] = item_rels[0]
       
   296             if rels:
       
   297                 # XXX (do it better) merge relations
       
   298                 new_rels = item_rels[0][1]
       
   299                 new_rels.get('subject', {}).update(rels.get('subject', {}))
       
   300                 new_rels.get('object', {}).update(rels.get('object', {}))
       
   301             return item_rels[0]
       
   302 
       
   303 
       
   304 class CWEntityXMLItemBuilder(Component):
       
   305     __regid__ = 'cw.entityxml.item-builder'
       
   306 
       
   307     def __init__(self, _cw, parser, node, **kwargs):
       
   308         super(CWEntityXMLItemBuilder, self).__init__(_cw, **kwargs)
       
   309         self.parser = parser
       
   310         self.node = node
       
   311 
       
   312     def build_item(self):
       
   313         """parse a XML document node and return two dictionaries defining (part
       
   314         of) an entity:
       
   315 
       
   316         - {attribute: value}
       
   317         - {role: {relation: [(related item, related rels)...]}
       
   318         """
       
   319         node = self.node
       
   320         item = dict(node.attrib.items())
       
   321         item['cwtype'] = unicode(node.tag)
       
   322         item.setdefault('cwsource', None)
       
   323         try:
       
   324             item['eid'] = typed_eid(item['eid'])
       
   325         except KeyError:
       
   326             # cw < 3.11 compat mode XXX
       
   327             item['eid'] = typed_eid(node.find('eid').text)
       
   328             item['cwuri'] = node.find('cwuri').text
       
   329         rels = {}
       
   330         for child in node:
       
   331             role = child.get('role')
       
   332             if role:
       
   333                 # relation
       
   334                 related = rels.setdefault(role, {}).setdefault(child.tag, [])
       
   335                 related += self.parser.parse_etree(child)
       
   336             elif child.text:
       
   337                 # attribute
       
   338                 item[child.tag] = unicode(child.text)
       
   339             else:
       
   340                 # None attribute (empty tag)
       
   341                 item[child.tag] = None
       
   342         return item, rels
       
   343 
       
   344 
       
   345 class CWEntityXMLActionCopy(Component):
       
   346     """implementation of cubicweb entity xml parser's'copy' action
       
   347 
       
   348     Takes no option.
       
   349     """
       
   350     __regid__ = 'cw.entityxml.action.copy'
       
   351 
       
   352     def __init__(self, _cw, parser, rtype, role, entity=None, **kwargs):
       
   353         super(CWEntityXMLActionCopy, self).__init__(_cw, **kwargs)
       
   354         self.parser = parser
       
   355         self.rtype = rtype
       
   356         self.role = role
       
   357         self.entity = entity
       
   358 
       
   359     @classproperty
       
   360     def action(cls):
       
   361         return cls.__regid__.rsplit('.', 1)[-1]
       
   362 
       
   363     def check_options(self, options, eid):
       
   364         self._check_no_options(options, eid)
       
   365 
       
   366     def _check_no_options(self, options, eid, msg=None):
       
   367         if options:
       
   368             if msg is None:
       
   369                 msg = self._cw._("'%s' action doesn't take any options") % self.action
       
   370             raise ValidationError(eid, {rn('options', 'subject'): msg})
       
   371 
       
   372     def link_items(self, others, rules):
       
   373         assert not any(x[1] for x in rules), "'copy' action takes no option"
       
   374         ttypes = frozenset([x[0] for x in rules])
       
   375         eids = [] # local eids
       
   376         for item, rels in others:
       
   377             if item['cwtype'] in ttypes:
       
   378                 item, rels = self.parser.complete_item(item, rels)
       
   379                 other_entity = self.parser.process_item(item, rels)
       
   380                 if other_entity is not None:
       
   381                     eids.append(other_entity.eid)
       
   382         if eids:
       
   383             self._set_relation(eids)
       
   384         else:
       
   385             self._clear_relation(ttypes)
       
   386 
       
   387     def _clear_relation(self, ttypes):
       
   388         if not self.parser.created_during_pull(self.entity):
       
   389             if len(ttypes) > 1:
       
   390                 typerestr = ', Y is IN(%s)' % ','.join(ttypes)
       
   391             else:
       
   392                 typerestr = ', Y is %s' % ','.join(ttypes)
       
   393             self._cw.execute('DELETE ' + rtype_role_rql(self.rtype, self.role) + typerestr,
       
   394                              {'x': self.entity.eid})
       
   395 
       
   396     def _set_relation(self, eids):
       
   397         assert eids
       
   398         rtype = self.rtype
       
   399         rqlbase = rtype_role_rql(rtype, self.role)
       
   400         eidstr = ','.join(str(eid) for eid in eids)
       
   401         self._cw.execute('DELETE %s, NOT Y eid IN (%s)' % (rqlbase, eidstr),
       
   402                          {'x': self.entity.eid})
       
   403         if self.role == 'object':
       
   404             rql = 'SET %s, Y eid IN (%s), NOT Y %s X' % (rqlbase, eidstr, rtype)
       
   405         else:
       
   406             rql = 'SET %s, Y eid IN (%s), NOT X %s Y' % (rqlbase, eidstr, rtype)
       
   407         self._cw.execute(rql, {'x': self.entity.eid})
       
   408 
       
   409 
       
   410 class CWEntityXMLActionLink(CWEntityXMLActionCopy):
       
   411     """implementation of cubicweb entity xml parser's'link' action
       
   412 
       
   413     requires a 'linkattr' option to control search of the linked entity.
       
   414     """
       
   415     __regid__ = 'cw.entityxml.action.link'
       
   416 
       
   417     def check_options(self, options, eid):
       
   418         if not 'linkattr' in options:
       
   419             msg = self._cw._("'%s' action requires 'linkattr' option") % self.action
       
   420             raise ValidationError(eid, {rn('options', 'subject'): msg})
       
   421 
       
   422     create_when_not_found = False
       
   423 
       
   424     def link_items(self, others, rules):
       
   425         for ttype, options in rules:
       
   426             searchattrs = splitstrip(options.get('linkattr', ''))
       
   427             self._related_link(ttype, others, searchattrs)
       
   428 
       
   429     def _related_link(self, ttype, others, searchattrs):
       
   430         def issubset(x,y):
       
   431             return all(z in y for z in x)
       
   432         eids = [] # local eids
       
   433         log = self.parser.import_log
       
   434         for item, rels in others:
       
   435             if item['cwtype'] != ttype:
       
   436                 continue
       
   437             if not issubset(searchattrs, item):
       
   438                 item, rels = self.parser.complete_item(item, rels)
       
   439                 if not issubset(searchattrs, item):
       
   440                     log.record_error('missing attribute, got %s expected keys %s'
       
   441                                      % (item, searchattrs))
       
   442                     continue
       
   443             # XXX str() needed with python < 2.6
       
   444             kwargs = dict((str(attr), item[attr]) for attr in searchattrs)
       
   445             targets = self._find_entities(item, kwargs)
       
   446             if len(targets) == 1:
       
   447                 entity = targets[0]
       
   448             elif not targets and self.create_when_not_found:
       
   449                 entity = self._cw.create_entity(item['cwtype'], **kwargs)
       
   450             else:
       
   451                 if len(targets) > 1:
       
   452                     log.record_error('ambiguous link: found %s entity %s with attributes %s'
       
   453                                      % (len(targets), item['cwtype'], kwargs))
       
   454                 else:
       
   455                     log.record_error('can not find %s entity with attributes %s'
       
   456                                      % (item['cwtype'], kwargs))
       
   457                 continue
       
   458             eids.append(entity.eid)
       
   459             self.parser.process_relations(entity, rels)
       
   460         if eids:
       
   461             self._set_relation(eids)
       
   462         else:
       
   463             self._clear_relation((ttype,))
       
   464 
       
   465     def _find_entities(self, item, kwargs):
       
   466         return tuple(self._cw.find_entities(item['cwtype'], **kwargs))
       
   467 
       
   468 
       
   469 class CWEntityXMLActionLinkInState(CWEntityXMLActionLink):
       
   470     """custom implementation of cubicweb entity xml parser's'link' action for
       
   471     in_state relation
       
   472     """
       
   473     __select__ = match_rtype('in_state')
       
   474 
       
   475     def check_options(self, options, eid):
       
   476         super(CWEntityXMLActionLinkInState, self).check_options(options, eid)
       
   477         if not 'name' in options['linkattr']:
       
   478             msg = self._cw._("'%s' action for in_state relation should at least have 'linkattr=name' option") % self.action
       
   479             raise ValidationError(eid, {rn('options', 'subject'): msg})
       
   480 
       
   481     def _find_entities(self, item, kwargs):
       
   482         assert 'name' in item # XXX else, complete_item
       
   483         state_name = item['name']
       
   484         wf = self.entity.cw_adapt_to('IWorkflowable').current_workflow
       
   485         state = wf.state_by_name(state_name)
       
   486         if state is None:
       
   487             return ()
       
   488         return (state,)
       
   489 
       
   490 
       
   491 class CWEntityXMLActionLinkOrCreate(CWEntityXMLActionLink):
       
   492     """implementation of cubicweb entity xml parser's'link-or-create' action
       
   493 
       
   494     requires a 'linkattr' option to control search of the linked entity.
       
   495     """
       
   496     __regid__ = 'cw.entityxml.action.link-or-create'
       
   497     create_when_not_found = True
       
   498 
       
   499 
       
   500 def registration_callback(vreg):
       
   501     vreg.register_all(globals().values(), __name__)
       
   502     global URL_MAPPING
       
   503     URL_MAPPING = {}
       
   504     if vreg.config.apphome:
       
   505         url_mapping_file = osp.join(vreg.config.apphome, 'urlmapping.py')
       
   506         if osp.exists(url_mapping_file):
       
   507             URL_MAPPING = eval(file(url_mapping_file).read())
       
   508             vreg.info('using url mapping %s from %s', URL_MAPPING, url_mapping_file)