sobjects/cwxmlparser.py
changeset 8188 1867e252e487
parent 7995 9a9f35ef418c
child 8189 2ee0ef069fa7
equal deleted inserted replaced
8187:981f6e487788 8188:1867e252e487
       
     1 # copyright 2010-2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
       
     2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
       
     3 #
       
     4 # This file is part of CubicWeb.
       
     5 #
       
     6 # CubicWeb is free software: you can redistribute it and/or modify it under the
       
     7 # terms of the GNU Lesser General Public License as published by the Free
       
     8 # Software Foundation, either version 2.1 of the License, or (at your option)
       
     9 # any later version.
       
    10 #
       
    11 # CubicWeb is distributed in the hope that it will be useful, but WITHOUT
       
    12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
       
    13 # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
       
    14 # details.
       
    15 #
       
    16 # You should have received a copy of the GNU Lesser General Public License along
       
    17 # with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
       
    18 """datafeed parser for xml generated by cubicweb
       
    19 
       
    20 Example of mapping for CWEntityXMLParser::
       
    21 
       
    22   {u'CWUser': {                                        # EntityType
       
    23       (u'in_group', u'subject', u'link'): [            # (rtype, role, action)
       
    24           (u'CWGroup', {u'linkattr': u'name'})],       #   -> rules = [(EntityType, options), ...]
       
    25       (u'tags', u'object', u'link-or-create'): [       # (...)
       
    26           (u'Tag', {u'linkattr': u'name'})],           #   -> ...
       
    27       (u'use_email', u'subject', u'copy'): [           # (...)
       
    28           (u'EmailAddress', {})]                       #   -> ...
       
    29       }
       
    30    }
       
    31 
       
    32 """
       
    33 
       
    34 from datetime import datetime, timedelta, time
       
    35 from urllib import urlencode
       
    36 from cgi import parse_qs # in urlparse with python >= 2.6
       
    37 
       
    38 from logilab.common.date import todate, totime
       
    39 from logilab.common.textutils import splitstrip, text_to_dict
       
    40 from logilab.common.decorators import classproperty
       
    41 
       
    42 from yams.constraints import BASE_CONVERTERS
       
    43 from yams.schema import role_name as rn
       
    44 
       
    45 from cubicweb import ValidationError, RegistryException, typed_eid
       
    46 from cubicweb.view import Component
       
    47 from cubicweb.server.sources import datafeed
       
    48 from cubicweb.server.hook import match_rtype
       
    49 
       
    50 # XXX see cubicweb.cwvreg.YAMS_TO_PY
       
    51 # XXX see cubicweb.web.views.xmlrss.SERIALIZERS
       
    52 DEFAULT_CONVERTERS = BASE_CONVERTERS.copy()
       
    53 DEFAULT_CONVERTERS['String'] = unicode
       
    54 DEFAULT_CONVERTERS['Password'] = lambda x: x.encode('utf8')
       
    55 def convert_date(ustr):
       
    56     return todate(datetime.strptime(ustr, '%Y-%m-%d'))
       
    57 DEFAULT_CONVERTERS['Date'] = convert_date
       
    58 def convert_datetime(ustr):
       
    59     if '.' in ustr: # assume %Y-%m-%d %H:%M:%S.mmmmmm
       
    60         ustr = ustr.split('.',1)[0]
       
    61     return datetime.strptime(ustr, '%Y-%m-%d %H:%M:%S')
       
    62 DEFAULT_CONVERTERS['Datetime'] = convert_datetime
       
    63 # XXX handle timezone, though this will be enough as TZDatetime are
       
    64 # serialized without time zone by default (UTC time). See
       
    65 # cw.web.views.xmlrss.SERIALIZERS.
       
    66 DEFAULT_CONVERTERS['TZDatetime'] = convert_datetime
       
    67 def convert_time(ustr):
       
    68     return totime(datetime.strptime(ustr, '%H:%M:%S'))
       
    69 DEFAULT_CONVERTERS['Time'] = convert_time
       
    70 DEFAULT_CONVERTERS['TZTime'] = convert_time
       
    71 def convert_interval(ustr):
       
    72     return time(seconds=int(ustr))
       
    73 DEFAULT_CONVERTERS['Interval'] = convert_interval
       
    74 
       
    75 def extract_typed_attrs(eschema, stringdict, converters=DEFAULT_CONVERTERS):
       
    76     typeddict = {}
       
    77     for rschema in eschema.subject_relations():
       
    78         if rschema.final and rschema in stringdict:
       
    79             if rschema in ('eid', 'cwuri', 'cwtype', 'cwsource'):
       
    80                 continue
       
    81             attrtype = eschema.destination(rschema)
       
    82             value = stringdict[rschema]
       
    83             if value is not None:
       
    84                 value = converters[attrtype](value)
       
    85             typeddict[rschema.type] = value
       
    86     return typeddict
       
    87 
       
    88 def rtype_role_rql(rtype, role):
       
    89     if role == 'object':
       
    90         return 'Y %s X WHERE X eid %%(x)s' % rtype
       
    91     else:
       
    92         return 'X %s Y WHERE X eid %%(x)s' % rtype
       
    93 
       
    94 
       
    95 class CWEntityXMLParser(datafeed.DataFeedXMLParser):
       
    96     """datafeed parser for the 'xml' entity view
       
    97 
       
    98     Most of the logic is delegated to the following components:
       
    99 
       
   100     * an "item builder" component, turning an etree xml node into a specific
       
   101       python dictionnary representing an entity
       
   102 
       
   103     * "action" components, selected given an entity, a relation and its role in
       
   104       the relation, and responsible to link the entity to given related items
       
   105       (eg dictionnary)
       
   106 
       
   107     So the parser is only doing the gluing service and the connection to the
       
   108     source.
       
   109     """
       
   110     __regid__ = 'cw.entityxml'
       
   111 
       
   112     def __init__(self, *args, **kwargs):
       
   113         super(CWEntityXMLParser, self).__init__(*args, **kwargs)
       
   114         self._parsed_urls = {}
       
   115         self._processed_entities = set()
       
   116 
       
   117     def select_linker(self, action, rtype, role, entity=None):
       
   118         try:
       
   119             return self._cw.vreg['components'].select(
       
   120                 'cw.entityxml.action.%s' % action, self._cw, entity=entity,
       
   121                 rtype=rtype, role=role, parser=self)
       
   122         except RegistryException:
       
   123             raise RegistryException('Unknown action %s' % action)
       
   124 
       
   125     def list_actions(self):
       
   126         reg = self._cw.vreg['components']
       
   127         return sorted(clss[0].action for rid, clss in reg.iteritems()
       
   128                       if rid.startswith('cw.entityxml.action.'))
       
   129 
       
   130     # mapping handling #########################################################
       
   131 
       
   132     def add_schema_config(self, schemacfg, checkonly=False):
       
   133         """added CWSourceSchemaConfig, modify mapping accordingly"""
       
   134         _ = self._cw._
       
   135         try:
       
   136             rtype = schemacfg.schema.rtype.name
       
   137         except AttributeError:
       
   138             msg = _("entity and relation types can't be mapped, only attributes "
       
   139                     "or relations")
       
   140             raise ValidationError(schemacfg.eid, {rn('cw_for_schema', 'subject'): msg})
       
   141         if schemacfg.options:
       
   142             options = text_to_dict(schemacfg.options)
       
   143         else:
       
   144             options = {}
       
   145         try:
       
   146             role = options.pop('role')
       
   147             if role not in ('subject', 'object'):
       
   148                 raise KeyError
       
   149         except KeyError:
       
   150             msg = _('"role=subject" or "role=object" must be specified in options')
       
   151             raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
       
   152         try:
       
   153             action = options.pop('action')
       
   154             linker = self.select_linker(action, rtype, role)
       
   155             linker.check_options(options, schemacfg.eid)
       
   156         except KeyError:
       
   157             msg = _('"action" must be specified in options; allowed values are '
       
   158                     '%s') % ', '.join(self.list_actions())
       
   159             raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
       
   160         except RegistryException:
       
   161             msg = _('allowed values for "action" are %s') % ', '.join(self.list_actions())
       
   162             raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
       
   163         if not checkonly:
       
   164             if role == 'subject':
       
   165                 etype = schemacfg.schema.stype.name
       
   166                 ttype = schemacfg.schema.otype.name
       
   167             else:
       
   168                 etype = schemacfg.schema.otype.name
       
   169                 ttype = schemacfg.schema.stype.name
       
   170             etyperules = self.source.mapping.setdefault(etype, {})
       
   171             etyperules.setdefault((rtype, role, action), []).append(
       
   172                 (ttype, options) )
       
   173             self.source.mapping_idx[schemacfg.eid] = (
       
   174                 etype, rtype, role, action, ttype)
       
   175 
       
   176     def del_schema_config(self, schemacfg, checkonly=False):
       
   177         """deleted CWSourceSchemaConfig, modify mapping accordingly"""
       
   178         etype, rtype, role, action, ttype = self.source.mapping_idx[schemacfg.eid]
       
   179         rules = self.source.mapping[etype][(rtype, role, action)]
       
   180         rules = [x for x in rules if not x[0] == ttype]
       
   181         if not rules:
       
   182             del self.source.mapping[etype][(rtype, role, action)]
       
   183 
       
   184     # import handling ##########################################################
       
   185 
       
   186     def process(self, url, raise_on_error=False, partialcommit=True):
       
   187         """IDataFeedParser main entry point"""
       
   188         if url.startswith('http'): # XXX similar loose test as in parse of sources.datafeed
       
   189             url = self.complete_url(url)
       
   190         super(CWEntityXMLParser, self).process(url, raise_on_error, partialcommit)
       
   191 
       
   192     def parse_etree(self, parent):
       
   193         for node in list(parent):
       
   194             builder = self._cw.vreg['components'].select(
       
   195                 'cw.entityxml.item-builder', self._cw, node=node,
       
   196                 parser=self)
       
   197             yield builder.build_item()
       
   198 
       
   199     def process_item(self, item, rels):
       
   200         """
       
   201         item and rels are what's returned by the item builder `build_item` method:
       
   202 
       
   203         * `item` is an {attribute: value} dictionary
       
   204         * `rels` is for relations and structured as
       
   205            {role: {relation: [(related item, related rels)...]}
       
   206         """
       
   207         entity = self.extid2entity(str(item['cwuri']),  item['cwtype'],
       
   208                                    cwsource=item['cwsource'], item=item)
       
   209         if entity is None:
       
   210             return None
       
   211         if entity.eid in self._processed_entities:
       
   212             return entity
       
   213         self._processed_entities.add(entity.eid)
       
   214         if not (self.created_during_pull(entity) or self.updated_during_pull(entity)):
       
   215             attrs = extract_typed_attrs(entity.e_schema, item)
       
   216             self.update_if_necessary(entity, attrs)
       
   217         self.process_relations(entity, rels)
       
   218         return entity
       
   219 
       
   220     def process_relations(self, entity, rels):
       
   221         etype = entity.__regid__
       
   222         for (rtype, role, action), rules in self.source.mapping.get(etype, {}).iteritems():
       
   223             try:
       
   224                 related_items = rels[role][rtype]
       
   225             except KeyError:
       
   226                 self.import_log.record_error('relation %s-%s not found in xml export of %s'
       
   227                                              % (rtype, role, etype))
       
   228                 continue
       
   229             try:
       
   230                 linker = self.select_linker(action, rtype, role, entity)
       
   231             except RegistryException:
       
   232                 self.import_log.record_error('no linker for action %s' % action)
       
   233             else:
       
   234                 linker.link_items(related_items, rules)
       
   235 
       
   236     def before_entity_copy(self, entity, sourceparams):
       
   237         """IDataFeedParser callback"""
       
   238         attrs = extract_typed_attrs(entity.e_schema, sourceparams['item'])
       
   239         entity.cw_edited.update(attrs)
       
   240 
       
   241     def complete_url(self, url, etype=None, known_relations=None):
       
   242         """append to the url's query string information about relation that should
       
   243         be included in the resulting xml, according to source mapping.
       
   244 
       
   245         If etype is not specified, try to guess it using the last path part of
       
   246         the url, i.e. the format used by default in cubicweb to map all entities
       
   247         of a given type as in 'http://mysite.org/EntityType'.
       
   248 
       
   249         If `known_relations` is given, it should be a dictionary of already
       
   250         known relations, so they don't get queried again.
       
   251         """
       
   252         try:
       
   253             url, qs = url.split('?', 1)
       
   254         except ValueError:
       
   255             qs = ''
       
   256         params = parse_qs(qs)
       
   257         if not 'vid' in params:
       
   258             params['vid'] = ['xml']
       
   259         if etype is None:
       
   260             try:
       
   261                 etype = url.rsplit('/', 1)[1]
       
   262             except ValueError:
       
   263                 return url + '?' + self._cw.build_url_params(**params)
       
   264             try:
       
   265                 etype = self._cw.vreg.case_insensitive_etypes[etype.lower()]
       
   266             except KeyError:
       
   267                 return url + '?' + self._cw.build_url_params(**params)
       
   268         relations = params.setdefault('relation', [])
       
   269         for rtype, role, _ in self.source.mapping.get(etype, ()):
       
   270             if known_relations and rtype in known_relations.get('role', ()):
       
   271                 continue
       
   272             reldef = '%s-%s' % (rtype, role)
       
   273             if not reldef in relations:
       
   274                 relations.append(reldef)
       
   275         return url + '?' + self._cw.build_url_params(**params)
       
   276 
       
   277     def complete_item(self, item, rels):
       
   278         try:
       
   279             return self._parsed_urls[item['cwuri']]
       
   280         except KeyError:
       
   281             itemurl = self.complete_url(item['cwuri'], item['cwtype'], rels)
       
   282             item_rels = list(self.parse(itemurl))
       
   283             assert len(item_rels) == 1, 'url %s expected to bring back one '\
       
   284                    'and only one entity, got %s' % (itemurl, len(item_rels))
       
   285             self._parsed_urls[item['cwuri']] = item_rels[0]
       
   286             if rels:
       
   287                 # XXX (do it better) merge relations
       
   288                 new_rels = item_rels[0][1]
       
   289                 new_rels.get('subject', {}).update(rels.get('subject', {}))
       
   290                 new_rels.get('object', {}).update(rels.get('object', {}))
       
   291             return item_rels[0]
       
   292 
       
   293 
       
   294 class CWEntityXMLItemBuilder(Component):
       
   295     __regid__ = 'cw.entityxml.item-builder'
       
   296 
       
   297     def __init__(self, _cw, parser, node, **kwargs):
       
   298         super(CWEntityXMLItemBuilder, self).__init__(_cw, **kwargs)
       
   299         self.parser = parser
       
   300         self.node = node
       
   301 
       
   302     def build_item(self):
       
   303         """parse a XML document node and return two dictionaries defining (part
       
   304         of) an entity:
       
   305 
       
   306         - {attribute: value}
       
   307         - {role: {relation: [(related item, related rels)...]}
       
   308         """
       
   309         node = self.node
       
   310         item = dict(node.attrib.items())
       
   311         item['cwtype'] = unicode(node.tag)
       
   312         item.setdefault('cwsource', None)
       
   313         try:
       
   314             item['eid'] = typed_eid(item['eid'])
       
   315         except KeyError:
       
   316             # cw < 3.11 compat mode XXX
       
   317             item['eid'] = typed_eid(node.find('eid').text)
       
   318             item['cwuri'] = node.find('cwuri').text
       
   319         rels = {}
       
   320         for child in node:
       
   321             role = child.get('role')
       
   322             if role:
       
   323                 # relation
       
   324                 related = rels.setdefault(role, {}).setdefault(child.tag, [])
       
   325                 related += self.parser.parse_etree(child)
       
   326             elif child.text:
       
   327                 # attribute
       
   328                 item[child.tag] = unicode(child.text)
       
   329             else:
       
   330                 # None attribute (empty tag)
       
   331                 item[child.tag] = None
       
   332         return item, rels
       
   333 
       
   334 
       
   335 class CWEntityXMLActionCopy(Component):
       
   336     """implementation of cubicweb entity xml parser's'copy' action
       
   337 
       
   338     Takes no option.
       
   339     """
       
   340     __regid__ = 'cw.entityxml.action.copy'
       
   341 
       
   342     def __init__(self, _cw, parser, rtype, role, entity=None, **kwargs):
       
   343         super(CWEntityXMLActionCopy, self).__init__(_cw, **kwargs)
       
   344         self.parser = parser
       
   345         self.rtype = rtype
       
   346         self.role = role
       
   347         self.entity = entity
       
   348 
       
   349     @classproperty
       
   350     def action(cls):
       
   351         return cls.__regid__.rsplit('.', 1)[-1]
       
   352 
       
   353     def check_options(self, options, eid):
       
   354         self._check_no_options(options, eid)
       
   355 
       
   356     def _check_no_options(self, options, eid, msg=None):
       
   357         if options:
       
   358             if msg is None:
       
   359                 msg = self._cw._("'%s' action doesn't take any options") % self.action
       
   360             raise ValidationError(eid, {rn('options', 'subject'): msg})
       
   361 
       
   362     def link_items(self, others, rules):
       
   363         assert not any(x[1] for x in rules), "'copy' action takes no option"
       
   364         ttypes = frozenset([x[0] for x in rules])
       
   365         eids = [] # local eids
       
   366         for item, rels in others:
       
   367             if item['cwtype'] in ttypes:
       
   368                 item, rels = self.parser.complete_item(item, rels)
       
   369                 other_entity = self.parser.process_item(item, rels)
       
   370                 if other_entity is not None:
       
   371                     eids.append(other_entity.eid)
       
   372         if eids:
       
   373             self._set_relation(eids)
       
   374         else:
       
   375             self._clear_relation(ttypes)
       
   376 
       
   377     def _clear_relation(self, ttypes):
       
   378         if not self.parser.created_during_pull(self.entity):
       
   379             if len(ttypes) > 1:
       
   380                 typerestr = ', Y is IN(%s)' % ','.join(ttypes)
       
   381             else:
       
   382                 typerestr = ', Y is %s' % ','.join(ttypes)
       
   383             self._cw.execute('DELETE ' + rtype_role_rql(self.rtype, self.role) + typerestr,
       
   384                              {'x': self.entity.eid})
       
   385 
       
   386     def _set_relation(self, eids):
       
   387         assert eids
       
   388         rtype = self.rtype
       
   389         rqlbase = rtype_role_rql(rtype, self.role)
       
   390         eidstr = ','.join(str(eid) for eid in eids)
       
   391         self._cw.execute('DELETE %s, NOT Y eid IN (%s)' % (rqlbase, eidstr),
       
   392                          {'x': self.entity.eid})
       
   393         if self.role == 'object':
       
   394             rql = 'SET %s, Y eid IN (%s), NOT Y %s X' % (rqlbase, eidstr, rtype)
       
   395         else:
       
   396             rql = 'SET %s, Y eid IN (%s), NOT X %s Y' % (rqlbase, eidstr, rtype)
       
   397         self._cw.execute(rql, {'x': self.entity.eid})
       
   398 
       
   399 
       
   400 class CWEntityXMLActionLink(CWEntityXMLActionCopy):
       
   401     """implementation of cubicweb entity xml parser's'link' action
       
   402 
       
   403     requires a 'linkattr' option to control search of the linked entity.
       
   404     """
       
   405     __regid__ = 'cw.entityxml.action.link'
       
   406 
       
   407     def check_options(self, options, eid):
       
   408         if not 'linkattr' in options:
       
   409             msg = self._cw._("'%s' action requires 'linkattr' option") % self.action
       
   410             raise ValidationError(eid, {rn('options', 'subject'): msg})
       
   411 
       
   412     create_when_not_found = False
       
   413 
       
   414     def link_items(self, others, rules):
       
   415         for ttype, options in rules:
       
   416             searchattrs = splitstrip(options.get('linkattr', ''))
       
   417             self._related_link(ttype, others, searchattrs)
       
   418 
       
   419     def _related_link(self, ttype, others, searchattrs):
       
   420         def issubset(x,y):
       
   421             return all(z in y for z in x)
       
   422         eids = [] # local eids
       
   423         log = self.parser.import_log
       
   424         for item, rels in others:
       
   425             if item['cwtype'] != ttype:
       
   426                 continue
       
   427             if not issubset(searchattrs, item):
       
   428                 item, rels = self.parser.complete_item(item, rels)
       
   429                 if not issubset(searchattrs, item):
       
   430                     log.record_error('missing attribute, got %s expected keys %s'
       
   431                                      % (item, searchattrs))
       
   432                     continue
       
   433             # XXX str() needed with python < 2.6
       
   434             kwargs = dict((str(attr), item[attr]) for attr in searchattrs)
       
   435             targets = self._find_entities(item, kwargs)
       
   436             if len(targets) == 1:
       
   437                 entity = targets[0]
       
   438             elif not targets and self.create_when_not_found:
       
   439                 entity = self._cw.create_entity(item['cwtype'], **kwargs)
       
   440             else:
       
   441                 if len(targets) > 1:
       
   442                     log.record_error('ambiguous link: found %s entity %s with attributes %s'
       
   443                                      % (len(targets), item['cwtype'], kwargs))
       
   444                 else:
       
   445                     log.record_error('can not find %s entity with attributes %s'
       
   446                                      % (item['cwtype'], kwargs))
       
   447                 continue
       
   448             eids.append(entity.eid)
       
   449             self.parser.process_relations(entity, rels)
       
   450         if eids:
       
   451             self._set_relation(eids)
       
   452         else:
       
   453             self._clear_relation((ttype,))
       
   454 
       
   455     def _find_entities(self, item, kwargs):
       
   456         return tuple(self._cw.find_entities(item['cwtype'], **kwargs))
       
   457 
       
   458 
       
   459 class CWEntityXMLActionLinkInState(CWEntityXMLActionLink):
       
   460     """custom implementation of cubicweb entity xml parser's'link' action for
       
   461     in_state relation
       
   462     """
       
   463     __select__ = match_rtype('in_state')
       
   464 
       
   465     def check_options(self, options, eid):
       
   466         super(CWEntityXMLActionLinkInState, self).check_options(options, eid)
       
   467         if not 'name' in options['linkattr']:
       
   468             msg = self._cw._("'%s' action for in_state relation should at least have 'linkattr=name' option") % self.action
       
   469             raise ValidationError(eid, {rn('options', 'subject'): msg})
       
   470 
       
   471     def _find_entities(self, item, kwargs):
       
   472         assert 'name' in item # XXX else, complete_item
       
   473         state_name = item['name']
       
   474         wf = self.entity.cw_adapt_to('IWorkflowable').current_workflow
       
   475         state = wf.state_by_name(state_name)
       
   476         if state is None:
       
   477             return ()
       
   478         return (state,)
       
   479 
       
   480 
       
   481 class CWEntityXMLActionLinkOrCreate(CWEntityXMLActionLink):
       
   482     """implementation of cubicweb entity xml parser's'link-or-create' action
       
   483 
       
   484     requires a 'linkattr' option to control search of the linked entity.
       
   485     """
       
   486     __regid__ = 'cw.entityxml.action.link-or-create'
       
   487     create_when_not_found = True