sobjects/cwxmlparser.py
changeset 11057 0b59724cb3f2
parent 11052 058bb3dc685f
child 11058 23eb30449fe5
equal deleted inserted replaced
11052:058bb3dc685f 11057:0b59724cb3f2
     1 # copyright 2010-2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
       
     2 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
       
     3 #
       
     4 # This file is part of CubicWeb.
       
     5 #
       
     6 # CubicWeb is free software: you can redistribute it and/or modify it under the
       
     7 # terms of the GNU Lesser General Public License as published by the Free
       
     8 # Software Foundation, either version 2.1 of the License, or (at your option)
       
     9 # any later version.
       
    10 #
       
    11 # CubicWeb is distributed in the hope that it will be useful, but WITHOUT
       
    12 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
       
    13 # FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
       
    14 # details.
       
    15 #
       
    16 # You should have received a copy of the GNU Lesser General Public License along
       
    17 # with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
       
    18 """datafeed parser for xml generated by cubicweb
       
    19 
       
    20 Example of mapping for CWEntityXMLParser::
       
    21 
       
    22   {u'CWUser': {                                        # EntityType
       
    23       (u'in_group', u'subject', u'link'): [            # (rtype, role, action)
       
    24           (u'CWGroup', {u'linkattr': u'name'})],       #   -> rules = [(EntityType, options), ...]
       
    25       (u'tags', u'object', u'link-or-create'): [       # (...)
       
    26           (u'Tag', {u'linkattr': u'name'})],           #   -> ...
       
    27       (u'use_email', u'subject', u'copy'): [           # (...)
       
    28           (u'EmailAddress', {})]                       #   -> ...
       
    29       }
       
    30    }
       
    31 
       
    32 """
       
    33 
       
    34 from datetime import datetime, time
       
    35 import urllib
       
    36 
       
    37 from six import text_type
       
    38 from six.moves.urllib.parse import urlparse, urlunparse, parse_qs, urlencode
       
    39 
       
    40 import pytz
       
    41 from logilab.common.date import todate, totime
       
    42 from logilab.common.textutils import splitstrip, text_to_dict
       
    43 from logilab.common.decorators import classproperty
       
    44 
       
    45 from yams.constraints import BASE_CONVERTERS
       
    46 from yams.schema import role_name as rn
       
    47 
       
    48 from cubicweb import ValidationError, RegistryException
       
    49 from cubicweb.view import Component
       
    50 from cubicweb.server.sources import datafeed
       
    51 from cubicweb.server.hook import match_rtype
       
    52 
       
    53 # XXX see cubicweb.cwvreg.YAMS_TO_PY
       
    54 # XXX see cubicweb.web.views.xmlrss.SERIALIZERS
       
    55 DEFAULT_CONVERTERS = BASE_CONVERTERS.copy()
       
    56 DEFAULT_CONVERTERS['String'] = text_type
       
    57 DEFAULT_CONVERTERS['Password'] = lambda x: x.encode('utf8')
       
    58 def convert_date(ustr):
       
    59     return todate(datetime.strptime(ustr, '%Y-%m-%d'))
       
    60 DEFAULT_CONVERTERS['Date'] = convert_date
       
    61 def convert_datetime(ustr):
       
    62     if '.' in ustr: # assume %Y-%m-%d %H:%M:%S.mmmmmm
       
    63         ustr = ustr.split('.', 1)[0]
       
    64     return datetime.strptime(ustr, '%Y-%m-%d %H:%M:%S')
       
    65 DEFAULT_CONVERTERS['Datetime'] = convert_datetime
       
    66 # XXX handle timezone, though this will be enough as TZDatetime are
       
    67 # serialized without time zone by default (UTC time). See
       
    68 # cw.web.views.xmlrss.SERIALIZERS.
       
    69 def convert_tzdatetime(ustr):
       
    70     date = convert_datetime(ustr)
       
    71     date = date.replace(tzinfo=pytz.utc)
       
    72     return date
       
    73 DEFAULT_CONVERTERS['TZDatetime'] = convert_tzdatetime
       
    74 def convert_time(ustr):
       
    75     return totime(datetime.strptime(ustr, '%H:%M:%S'))
       
    76 DEFAULT_CONVERTERS['Time'] = convert_time
       
    77 DEFAULT_CONVERTERS['TZTime'] = convert_time
       
    78 def convert_interval(ustr):
       
    79     return time(seconds=int(ustr))
       
    80 DEFAULT_CONVERTERS['Interval'] = convert_interval
       
    81 
       
    82 def extract_typed_attrs(eschema, stringdict, converters=DEFAULT_CONVERTERS):
       
    83     typeddict = {}
       
    84     for rschema in eschema.subject_relations():
       
    85         if rschema.final and rschema in stringdict:
       
    86             if rschema in ('eid', 'cwuri', 'cwtype', 'cwsource'):
       
    87                 continue
       
    88             attrtype = eschema.destination(rschema)
       
    89             value = stringdict[rschema]
       
    90             if value is not None:
       
    91                 value = converters[attrtype](value)
       
    92             typeddict[rschema.type] = value
       
    93     return typeddict
       
    94 
       
    95 def rtype_role_rql(rtype, role):
       
    96     if role == 'object':
       
    97         return 'Y %s X WHERE X eid %%(x)s' % rtype
       
    98     else:
       
    99         return 'X %s Y WHERE X eid %%(x)s' % rtype
       
   100 
       
   101 
       
   102 class CWEntityXMLParser(datafeed.DataFeedXMLParser):
       
   103     """datafeed parser for the 'xml' entity view
       
   104 
       
   105     Most of the logic is delegated to the following components:
       
   106 
       
   107     * an "item builder" component, turning an etree xml node into a specific
       
   108       python dictionary representing an entity
       
   109 
       
   110     * "action" components, selected given an entity, a relation and its role in
       
   111       the relation, and responsible to link the entity to given related items
       
   112       (eg dictionary)
       
   113 
       
   114     So the parser is only doing the gluing service and the connection to the
       
   115     source.
       
   116     """
       
   117     __regid__ = 'cw.entityxml'
       
   118 
       
   119     def __init__(self, *args, **kwargs):
       
   120         super(CWEntityXMLParser, self).__init__(*args, **kwargs)
       
   121         self._parsed_urls = {}
       
   122         self._processed_entities = set()
       
   123 
       
   124     def select_linker(self, action, rtype, role, entity=None):
       
   125         try:
       
   126             return self._cw.vreg['components'].select(
       
   127                 'cw.entityxml.action.%s' % action, self._cw, entity=entity,
       
   128                 rtype=rtype, role=role, parser=self)
       
   129         except RegistryException:
       
   130             raise RegistryException('Unknown action %s' % action)
       
   131 
       
   132     def list_actions(self):
       
   133         reg = self._cw.vreg['components']
       
   134         return sorted(clss[0].action for rid, clss in reg.items()
       
   135                       if rid.startswith('cw.entityxml.action.'))
       
   136 
       
   137     # mapping handling #########################################################
       
   138 
       
   139     def add_schema_config(self, schemacfg, checkonly=False):
       
   140         """added CWSourceSchemaConfig, modify mapping accordingly"""
       
   141         _ = self._cw._
       
   142         try:
       
   143             rtype = schemacfg.schema.rtype.name
       
   144         except AttributeError:
       
   145             msg = _("entity and relation types can't be mapped, only attributes "
       
   146                     "or relations")
       
   147             raise ValidationError(schemacfg.eid, {rn('cw_for_schema', 'subject'): msg})
       
   148         if schemacfg.options:
       
   149             options = text_to_dict(schemacfg.options)
       
   150         else:
       
   151             options = {}
       
   152         try:
       
   153             role = options.pop('role')
       
   154             if role not in ('subject', 'object'):
       
   155                 raise KeyError
       
   156         except KeyError:
       
   157             msg = _('"role=subject" or "role=object" must be specified in options')
       
   158             raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
       
   159         try:
       
   160             action = options.pop('action')
       
   161             linker = self.select_linker(action, rtype, role)
       
   162             linker.check_options(options, schemacfg.eid)
       
   163         except KeyError:
       
   164             msg = _('"action" must be specified in options; allowed values are '
       
   165                     '%s') % ', '.join(self.list_actions())
       
   166             raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
       
   167         except RegistryException:
       
   168             msg = _('allowed values for "action" are %s') % ', '.join(self.list_actions())
       
   169             raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
       
   170         if not checkonly:
       
   171             if role == 'subject':
       
   172                 etype = schemacfg.schema.stype.name
       
   173                 ttype = schemacfg.schema.otype.name
       
   174             else:
       
   175                 etype = schemacfg.schema.otype.name
       
   176                 ttype = schemacfg.schema.stype.name
       
   177             etyperules = self.source.mapping.setdefault(etype, {})
       
   178             etyperules.setdefault((rtype, role, action), []).append(
       
   179                 (ttype, options))
       
   180             self.source.mapping_idx[schemacfg.eid] = (
       
   181                 etype, rtype, role, action, ttype)
       
   182 
       
   183     def del_schema_config(self, schemacfg, checkonly=False):
       
   184         """deleted CWSourceSchemaConfig, modify mapping accordingly"""
       
   185         etype, rtype, role, action, ttype = self.source.mapping_idx[schemacfg.eid]
       
   186         rules = self.source.mapping[etype][(rtype, role, action)]
       
   187         rules = [x for x in rules if not x[0] == ttype]
       
   188         if not rules:
       
   189             del self.source.mapping[etype][(rtype, role, action)]
       
   190 
       
   191     # import handling ##########################################################
       
   192 
       
   193     def process(self, url, raise_on_error=False):
       
   194         """IDataFeedParser main entry point"""
       
   195         if url.startswith('http'): # XXX similar loose test as in parse of sources.datafeed
       
   196             url = self.complete_url(url)
       
   197         super(CWEntityXMLParser, self).process(url, raise_on_error)
       
   198 
       
   199     def parse_etree(self, parent):
       
   200         for node in list(parent):
       
   201             builder = self._cw.vreg['components'].select(
       
   202                 'cw.entityxml.item-builder', self._cw, node=node,
       
   203                 parser=self)
       
   204             yield builder.build_item()
       
   205 
       
   206     def process_item(self, item, rels, raise_on_error=False):
       
   207         """
       
   208         item and rels are what's returned by the item builder `build_item` method:
       
   209 
       
   210         * `item` is an {attribute: value} dictionary
       
   211         * `rels` is for relations and structured as
       
   212            {role: {relation: [(related item, related rels)...]}
       
   213         """
       
   214         entity = self.extid2entity(item['cwuri'].encode('ascii'), item['cwtype'],
       
   215                                    cwsource=item['cwsource'], item=item,
       
   216                                    raise_on_error=raise_on_error)
       
   217         if entity is None:
       
   218             return None
       
   219         if entity.eid in self._processed_entities:
       
   220             return entity
       
   221         self._processed_entities.add(entity.eid)
       
   222         if not (self.created_during_pull(entity) or self.updated_during_pull(entity)):
       
   223             attrs = extract_typed_attrs(entity.e_schema, item)
       
   224             self.update_if_necessary(entity, attrs)
       
   225         self.process_relations(entity, rels)
       
   226         return entity
       
   227 
       
   228     def process_relations(self, entity, rels):
       
   229         etype = entity.cw_etype
       
   230         for (rtype, role, action), rules in self.source.mapping.get(etype, {}).items():
       
   231             try:
       
   232                 related_items = rels[role][rtype]
       
   233             except KeyError:
       
   234                 self.import_log.record_error('relation %s-%s not found in xml export of %s'
       
   235                                              % (rtype, role, etype))
       
   236                 continue
       
   237             try:
       
   238                 linker = self.select_linker(action, rtype, role, entity)
       
   239             except RegistryException:
       
   240                 self.import_log.record_error('no linker for action %s' % action)
       
   241             else:
       
   242                 linker.link_items(related_items, rules)
       
   243 
       
   244     def before_entity_copy(self, entity, sourceparams):
       
   245         """IDataFeedParser callback"""
       
   246         attrs = extract_typed_attrs(entity.e_schema, sourceparams['item'])
       
   247         entity.cw_edited.update(attrs)
       
   248 
       
   249     def normalize_url(self, url):
       
   250         """overridden to add vid=xml if vid is not set in the qs"""
       
   251         url = super(CWEntityXMLParser, self).normalize_url(url)
       
   252         purl = urlparse(url)
       
   253         if purl.scheme in ('http', 'https'):
       
   254             params = parse_qs(purl.query)
       
   255             if 'vid' not in params:
       
   256                 params['vid'] = ['xml']
       
   257                 purl = list(purl)
       
   258                 purl[4] = urlencode(params, doseq=True)
       
   259                 return urlunparse(purl)
       
   260         return url
       
   261 
       
   262     def complete_url(self, url, etype=None, known_relations=None):
       
   263         """append to the url's query string information about relation that should
       
   264         be included in the resulting xml, according to source mapping.
       
   265 
       
   266         If etype is not specified, try to guess it using the last path part of
       
   267         the url, i.e. the format used by default in cubicweb to map all entities
       
   268         of a given type as in 'http://mysite.org/EntityType'.
       
   269 
       
   270         If `known_relations` is given, it should be a dictionary of already
       
   271         known relations, so they don't get queried again.
       
   272         """
       
   273         purl = urlparse(url)
       
   274         params = parse_qs(purl.query)
       
   275         if etype is None:
       
   276             etype = purl.path.split('/')[-1]
       
   277         try:
       
   278             etype = self._cw.vreg.case_insensitive_etypes[etype.lower()]
       
   279         except KeyError:
       
   280             return url
       
   281         relations = params['relation'] = set(params.get('relation', ()))
       
   282         for rtype, role, _ in self.source.mapping.get(etype, ()):
       
   283             if known_relations and rtype in known_relations.get('role', ()):
       
   284                 continue
       
   285             relations.add('%s-%s' % (rtype, role))
       
   286         purl = list(purl)
       
   287         purl[4] = urlencode(params, doseq=True)
       
   288         return urlunparse(purl)
       
   289 
       
   290     def complete_item(self, item, rels):
       
   291         try:
       
   292             return self._parsed_urls[item['cwuri']]
       
   293         except KeyError:
       
   294             itemurl = self.complete_url(item['cwuri'], item['cwtype'], rels)
       
   295             item_rels = list(self.parse(itemurl))
       
   296             assert len(item_rels) == 1, 'url %s expected to bring back one '\
       
   297                    'and only one entity, got %s' % (itemurl, len(item_rels))
       
   298             self._parsed_urls[item['cwuri']] = item_rels[0]
       
   299             if rels:
       
   300                 # XXX (do it better) merge relations
       
   301                 new_rels = item_rels[0][1]
       
   302                 new_rels.get('subject', {}).update(rels.get('subject', {}))
       
   303                 new_rels.get('object', {}).update(rels.get('object', {}))
       
   304             return item_rels[0]
       
   305 
       
   306 
       
   307 class CWEntityXMLItemBuilder(Component):
       
   308     __regid__ = 'cw.entityxml.item-builder'
       
   309 
       
   310     def __init__(self, _cw, parser, node, **kwargs):
       
   311         super(CWEntityXMLItemBuilder, self).__init__(_cw, **kwargs)
       
   312         self.parser = parser
       
   313         self.node = node
       
   314 
       
   315     def build_item(self):
       
   316         """parse a XML document node and return two dictionaries defining (part
       
   317         of) an entity:
       
   318 
       
   319         - {attribute: value}
       
   320         - {role: {relation: [(related item, related rels)...]}
       
   321         """
       
   322         node = self.node
       
   323         item = dict(node.attrib.items())
       
   324         item['cwtype'] = text_type(node.tag)
       
   325         item.setdefault('cwsource', None)
       
   326         try:
       
   327             item['eid'] = int(item['eid'])
       
   328         except KeyError:
       
   329             # cw < 3.11 compat mode XXX
       
   330             item['eid'] = int(node.find('eid').text)
       
   331             item['cwuri'] = node.find('cwuri').text
       
   332         rels = {}
       
   333         for child in node:
       
   334             role = child.get('role')
       
   335             if role:
       
   336                 # relation
       
   337                 related = rels.setdefault(role, {}).setdefault(child.tag, [])
       
   338                 related += self.parser.parse_etree(child)
       
   339             elif child.text:
       
   340                 # attribute
       
   341                 item[child.tag] = text_type(child.text)
       
   342             else:
       
   343                 # None attribute (empty tag)
       
   344                 item[child.tag] = None
       
   345         return item, rels
       
   346 
       
   347 
       
   348 class CWEntityXMLActionCopy(Component):
       
   349     """implementation of cubicweb entity xml parser's'copy' action
       
   350 
       
   351     Takes no option.
       
   352     """
       
   353     __regid__ = 'cw.entityxml.action.copy'
       
   354 
       
   355     def __init__(self, _cw, parser, rtype, role, entity=None, **kwargs):
       
   356         super(CWEntityXMLActionCopy, self).__init__(_cw, **kwargs)
       
   357         self.parser = parser
       
   358         self.rtype = rtype
       
   359         self.role = role
       
   360         self.entity = entity
       
   361 
       
   362     @classproperty
       
   363     def action(cls):
       
   364         return cls.__regid__.rsplit('.', 1)[-1]
       
   365 
       
   366     def check_options(self, options, eid):
       
   367         self._check_no_options(options, eid)
       
   368 
       
   369     def _check_no_options(self, options, eid, msg=None):
       
   370         if options:
       
   371             if msg is None:
       
   372                 msg = self._cw._("'%s' action doesn't take any options") % self.action
       
   373             raise ValidationError(eid, {rn('options', 'subject'): msg})
       
   374 
       
   375     def link_items(self, others, rules):
       
   376         assert not any(x[1] for x in rules), "'copy' action takes no option"
       
   377         ttypes = frozenset([x[0] for x in rules])
       
   378         eids = [] # local eids
       
   379         for item, rels in others:
       
   380             if item['cwtype'] in ttypes:
       
   381                 item, rels = self.parser.complete_item(item, rels)
       
   382                 other_entity = self.parser.process_item(item, rels)
       
   383                 if other_entity is not None:
       
   384                     eids.append(other_entity.eid)
       
   385         if eids:
       
   386             self._set_relation(eids)
       
   387         else:
       
   388             self._clear_relation(ttypes)
       
   389 
       
   390     def _clear_relation(self, ttypes):
       
   391         if not self.parser.created_during_pull(self.entity):
       
   392             if len(ttypes) > 1:
       
   393                 typerestr = ', Y is IN(%s)' % ','.join(ttypes)
       
   394             else:
       
   395                 typerestr = ', Y is %s' % ','.join(ttypes)
       
   396             self._cw.execute('DELETE ' + rtype_role_rql(self.rtype, self.role) + typerestr,
       
   397                              {'x': self.entity.eid})
       
   398 
       
   399     def _set_relation(self, eids):
       
   400         assert eids
       
   401         rtype = self.rtype
       
   402         rqlbase = rtype_role_rql(rtype, self.role)
       
   403         eidstr = ','.join(str(eid) for eid in eids)
       
   404         self._cw.execute('DELETE %s, NOT Y eid IN (%s)' % (rqlbase, eidstr),
       
   405                          {'x': self.entity.eid})
       
   406         if self.role == 'object':
       
   407             rql = 'SET %s, Y eid IN (%s), NOT Y %s X' % (rqlbase, eidstr, rtype)
       
   408         else:
       
   409             rql = 'SET %s, Y eid IN (%s), NOT X %s Y' % (rqlbase, eidstr, rtype)
       
   410         self._cw.execute(rql, {'x': self.entity.eid})
       
   411 
       
   412 
       
   413 class CWEntityXMLActionLink(CWEntityXMLActionCopy):
       
   414     """implementation of cubicweb entity xml parser's'link' action
       
   415 
       
   416     requires a 'linkattr' option to control search of the linked entity.
       
   417     """
       
   418     __regid__ = 'cw.entityxml.action.link'
       
   419 
       
   420     def check_options(self, options, eid):
       
   421         if not 'linkattr' in options:
       
   422             msg = self._cw._("'%s' action requires 'linkattr' option") % self.action
       
   423             raise ValidationError(eid, {rn('options', 'subject'): msg})
       
   424 
       
   425     create_when_not_found = False
       
   426 
       
   427     def link_items(self, others, rules):
       
   428         for ttype, options in rules:
       
   429             searchattrs = splitstrip(options.get('linkattr', ''))
       
   430             self._related_link(ttype, others, searchattrs)
       
   431 
       
   432     def _related_link(self, ttype, others, searchattrs):
       
   433         def issubset(x, y):
       
   434             return all(z in y for z in x)
       
   435         eids = [] # local eids
       
   436         log = self.parser.import_log
       
   437         for item, rels in others:
       
   438             if item['cwtype'] != ttype:
       
   439                 continue
       
   440             if not issubset(searchattrs, item):
       
   441                 item, rels = self.parser.complete_item(item, rels)
       
   442                 if not issubset(searchattrs, item):
       
   443                     log.record_error('missing attribute, got %s expected keys %s'
       
   444                                      % (item, searchattrs))
       
   445                     continue
       
   446             # XXX str() needed with python < 2.6
       
   447             kwargs = dict((str(attr), item[attr]) for attr in searchattrs)
       
   448             targets = self._find_entities(item, kwargs)
       
   449             if len(targets) == 1:
       
   450                 entity = targets[0]
       
   451             elif not targets and self.create_when_not_found:
       
   452                 entity = self._cw.create_entity(item['cwtype'], **kwargs)
       
   453             else:
       
   454                 if len(targets) > 1:
       
   455                     log.record_error('ambiguous link: found %s entity %s with attributes %s'
       
   456                                      % (len(targets), item['cwtype'], kwargs))
       
   457                 else:
       
   458                     log.record_error('can not find %s entity with attributes %s'
       
   459                                      % (item['cwtype'], kwargs))
       
   460                 continue
       
   461             eids.append(entity.eid)
       
   462             self.parser.process_relations(entity, rels)
       
   463         if eids:
       
   464             self._set_relation(eids)
       
   465         else:
       
   466             self._clear_relation((ttype,))
       
   467 
       
   468     def _find_entities(self, item, kwargs):
       
   469         return tuple(self._cw.find(item['cwtype'], **kwargs).entities())
       
   470 
       
   471 
       
   472 class CWEntityXMLActionLinkInState(CWEntityXMLActionLink):
       
   473     """custom implementation of cubicweb entity xml parser's'link' action for
       
   474     in_state relation
       
   475     """
       
   476     __select__ = match_rtype('in_state')
       
   477 
       
   478     def check_options(self, options, eid):
       
   479         super(CWEntityXMLActionLinkInState, self).check_options(options, eid)
       
   480         if not 'name' in options['linkattr']:
       
   481             msg = self._cw._("'%s' action for in_state relation should at least have 'linkattr=name' option") % self.action
       
   482             raise ValidationError(eid, {rn('options', 'subject'): msg})
       
   483 
       
   484     def _find_entities(self, item, kwargs):
       
   485         assert 'name' in item # XXX else, complete_item
       
   486         state_name = item['name']
       
   487         wf = self.entity.cw_adapt_to('IWorkflowable').current_workflow
       
   488         state = wf.state_by_name(state_name)
       
   489         if state is None:
       
   490             return ()
       
   491         return (state,)
       
   492 
       
   493 
       
class CWEntityXMLActionLinkOrCreate(CWEntityXMLActionLink):
    """implementation of cubicweb entity xml parser's 'link-or-create' action

    requires a 'linkattr' option to control search of the linked entity.
    """
    __regid__ = 'cw.entityxml.action.link-or-create'
    # differs from the parent 'link' action, where this flag is False
    create_when_not_found = True