cubicweb/sobjects/cwxmlparser.py
changeset 11057 0b59724cb3f2
parent 11034 75d752e6daf7
child 11256 0d9105673ec9
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cubicweb/sobjects/cwxmlparser.py	Sat Jan 16 13:48:51 2016 +0100
@@ -0,0 +1,500 @@
+# copyright 2010-2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
+#
+# This file is part of CubicWeb.
+#
+# CubicWeb is free software: you can redistribute it and/or modify it under the
+# terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
+"""datafeed parser for xml generated by cubicweb
+
+Example of mapping for CWEntityXMLParser::
+
+  {u'CWUser': {                                        # EntityType
+      (u'in_group', u'subject', u'link'): [            # (rtype, role, action)
+          (u'CWGroup', {u'linkattr': u'name'})],       #   -> rules = [(EntityType, options), ...]
+      (u'tags', u'object', u'link-or-create'): [       # (...)
+          (u'Tag', {u'linkattr': u'name'})],           #   -> ...
+      (u'use_email', u'subject', u'copy'): [           # (...)
+          (u'EmailAddress', {})]                       #   -> ...
+      }
+   }
+
+"""
+
+from datetime import datetime, time
+import urllib
+
+from six import text_type
+from six.moves.urllib.parse import urlparse, urlunparse, parse_qs, urlencode
+
+import pytz
+from logilab.common.date import todate, totime
+from logilab.common.textutils import splitstrip, text_to_dict
+from logilab.common.decorators import classproperty
+
+from yams.constraints import BASE_CONVERTERS
+from yams.schema import role_name as rn
+
+from cubicweb import ValidationError, RegistryException
+from cubicweb.view import Component
+from cubicweb.server.sources import datafeed
+from cubicweb.server.hook import match_rtype
+
+# XXX see cubicweb.cwvreg.YAMS_TO_PY
+# XXX see cubicweb.web.views.xmlrss.SERIALIZERS
+DEFAULT_CONVERTERS = BASE_CONVERTERS.copy()
+DEFAULT_CONVERTERS['String'] = text_type
+DEFAULT_CONVERTERS['Password'] = lambda x: x.encode('utf8')
+def convert_date(ustr):
+    return todate(datetime.strptime(ustr, '%Y-%m-%d'))
+DEFAULT_CONVERTERS['Date'] = convert_date
+def convert_datetime(ustr):
+    if '.' in ustr: # assume %Y-%m-%d %H:%M:%S.mmmmmm
+        ustr = ustr.split('.', 1)[0]
+    return datetime.strptime(ustr, '%Y-%m-%d %H:%M:%S')
+DEFAULT_CONVERTERS['Datetime'] = convert_datetime
+# XXX handle timezone, though this will be enough as TZDatetime are
+# serialized without time zone by default (UTC time). See
+# cw.web.views.xmlrss.SERIALIZERS.
+def convert_tzdatetime(ustr):
+    date = convert_datetime(ustr)
+    date = date.replace(tzinfo=pytz.utc)
+    return date
+DEFAULT_CONVERTERS['TZDatetime'] = convert_tzdatetime
+def convert_time(ustr):
+    return totime(datetime.strptime(ustr, '%H:%M:%S'))
+DEFAULT_CONVERTERS['Time'] = convert_time
+DEFAULT_CONVERTERS['TZTime'] = convert_time
+def convert_interval(ustr):
+    return time(seconds=int(ustr))
+DEFAULT_CONVERTERS['Interval'] = convert_interval
+
+def extract_typed_attrs(eschema, stringdict, converters=DEFAULT_CONVERTERS):
+    typeddict = {}
+    for rschema in eschema.subject_relations():
+        if rschema.final and rschema in stringdict:
+            if rschema in ('eid', 'cwuri', 'cwtype', 'cwsource'):
+                continue
+            attrtype = eschema.destination(rschema)
+            value = stringdict[rschema]
+            if value is not None:
+                value = converters[attrtype](value)
+            typeddict[rschema.type] = value
+    return typeddict
+
+def rtype_role_rql(rtype, role):
+    if role == 'object':
+        return 'Y %s X WHERE X eid %%(x)s' % rtype
+    else:
+        return 'X %s Y WHERE X eid %%(x)s' % rtype
+
+
+class CWEntityXMLParser(datafeed.DataFeedXMLParser):
+    """datafeed parser for the 'xml' entity view
+
+    Most of the logic is delegated to the following components:
+
+    * an "item builder" component, turning an etree xml node into a specific
+      python dictionary representing an entity
+
+    * "action" components, selected given an entity, a relation and its role in
+      the relation, and responsible to link the entity to given related items
+      (eg dictionary)
+
+    So the parser is only doing the gluing service and the connection to the
+    source.
+    """
+    __regid__ = 'cw.entityxml'
+
+    def __init__(self, *args, **kwargs):
+        super(CWEntityXMLParser, self).__init__(*args, **kwargs)
+        self._parsed_urls = {}
+        self._processed_entities = set()
+
+    def select_linker(self, action, rtype, role, entity=None):
+        try:
+            return self._cw.vreg['components'].select(
+                'cw.entityxml.action.%s' % action, self._cw, entity=entity,
+                rtype=rtype, role=role, parser=self)
+        except RegistryException:
+            raise RegistryException('Unknown action %s' % action)
+
+    def list_actions(self):
+        reg = self._cw.vreg['components']
+        return sorted(clss[0].action for rid, clss in reg.items()
+                      if rid.startswith('cw.entityxml.action.'))
+
+    # mapping handling #########################################################
+
+    def add_schema_config(self, schemacfg, checkonly=False):
+        """added CWSourceSchemaConfig, modify mapping accordingly"""
+        _ = self._cw._
+        try:
+            rtype = schemacfg.schema.rtype.name
+        except AttributeError:
+            msg = _("entity and relation types can't be mapped, only attributes "
+                    "or relations")
+            raise ValidationError(schemacfg.eid, {rn('cw_for_schema', 'subject'): msg})
+        if schemacfg.options:
+            options = text_to_dict(schemacfg.options)
+        else:
+            options = {}
+        try:
+            role = options.pop('role')
+            if role not in ('subject', 'object'):
+                raise KeyError
+        except KeyError:
+            msg = _('"role=subject" or "role=object" must be specified in options')
+            raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
+        try:
+            action = options.pop('action')
+            linker = self.select_linker(action, rtype, role)
+            linker.check_options(options, schemacfg.eid)
+        except KeyError:
+            msg = _('"action" must be specified in options; allowed values are '
+                    '%s') % ', '.join(self.list_actions())
+            raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
+        except RegistryException:
+            msg = _('allowed values for "action" are %s') % ', '.join(self.list_actions())
+            raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
+        if not checkonly:
+            if role == 'subject':
+                etype = schemacfg.schema.stype.name
+                ttype = schemacfg.schema.otype.name
+            else:
+                etype = schemacfg.schema.otype.name
+                ttype = schemacfg.schema.stype.name
+            etyperules = self.source.mapping.setdefault(etype, {})
+            etyperules.setdefault((rtype, role, action), []).append(
+                (ttype, options))
+            self.source.mapping_idx[schemacfg.eid] = (
+                etype, rtype, role, action, ttype)
+
+    def del_schema_config(self, schemacfg, checkonly=False):
+        """deleted CWSourceSchemaConfig, modify mapping accordingly"""
+        etype, rtype, role, action, ttype = self.source.mapping_idx[schemacfg.eid]
+        rules = self.source.mapping[etype][(rtype, role, action)]
+        rules = [x for x in rules if not x[0] == ttype]
+        if not rules:
+            del self.source.mapping[etype][(rtype, role, action)]
+
+    # import handling ##########################################################
+
+    def process(self, url, raise_on_error=False):
+        """IDataFeedParser main entry point"""
+        if url.startswith('http'): # XXX similar loose test as in parse of sources.datafeed
+            url = self.complete_url(url)
+        super(CWEntityXMLParser, self).process(url, raise_on_error)
+
+    def parse_etree(self, parent):
+        for node in list(parent):
+            builder = self._cw.vreg['components'].select(
+                'cw.entityxml.item-builder', self._cw, node=node,
+                parser=self)
+            yield builder.build_item()
+
+    def process_item(self, item, rels, raise_on_error=False):
+        """
+        item and rels are what's returned by the item builder `build_item` method:
+
+        * `item` is an {attribute: value} dictionary
+        * `rels` is for relations and structured as
+           {role: {relation: [(related item, related rels)...]}
+        """
+        entity = self.extid2entity(item['cwuri'].encode('ascii'), item['cwtype'],
+                                   cwsource=item['cwsource'], item=item,
+                                   raise_on_error=raise_on_error)
+        if entity is None:
+            return None
+        if entity.eid in self._processed_entities:
+            return entity
+        self._processed_entities.add(entity.eid)
+        if not (self.created_during_pull(entity) or self.updated_during_pull(entity)):
+            attrs = extract_typed_attrs(entity.e_schema, item)
+            self.update_if_necessary(entity, attrs)
+        self.process_relations(entity, rels)
+        return entity
+
+    def process_relations(self, entity, rels):
+        etype = entity.cw_etype
+        for (rtype, role, action), rules in self.source.mapping.get(etype, {}).items():
+            try:
+                related_items = rels[role][rtype]
+            except KeyError:
+                self.import_log.record_error('relation %s-%s not found in xml export of %s'
+                                             % (rtype, role, etype))
+                continue
+            try:
+                linker = self.select_linker(action, rtype, role, entity)
+            except RegistryException:
+                self.import_log.record_error('no linker for action %s' % action)
+            else:
+                linker.link_items(related_items, rules)
+
+    def before_entity_copy(self, entity, sourceparams):
+        """IDataFeedParser callback"""
+        attrs = extract_typed_attrs(entity.e_schema, sourceparams['item'])
+        entity.cw_edited.update(attrs)
+
+    def normalize_url(self, url):
+        """overridden to add vid=xml if vid is not set in the qs"""
+        url = super(CWEntityXMLParser, self).normalize_url(url)
+        purl = urlparse(url)
+        if purl.scheme in ('http', 'https'):
+            params = parse_qs(purl.query)
+            if 'vid' not in params:
+                params['vid'] = ['xml']
+                purl = list(purl)
+                purl[4] = urlencode(params, doseq=True)
+                return urlunparse(purl)
+        return url
+
+    def complete_url(self, url, etype=None, known_relations=None):
+        """append to the url's query string information about relation that should
+        be included in the resulting xml, according to source mapping.
+
+        If etype is not specified, try to guess it using the last path part of
+        the url, i.e. the format used by default in cubicweb to map all entities
+        of a given type as in 'http://mysite.org/EntityType'.
+
+        If `known_relations` is given, it should be a dictionary of already
+        known relations, so they don't get queried again.
+        """
+        purl = urlparse(url)
+        params = parse_qs(purl.query)
+        if etype is None:
+            etype = purl.path.split('/')[-1]
+        try:
+            etype = self._cw.vreg.case_insensitive_etypes[etype.lower()]
+        except KeyError:
+            return url
+        relations = params['relation'] = set(params.get('relation', ()))
+        for rtype, role, _ in self.source.mapping.get(etype, ()):
+            if known_relations and rtype in known_relations.get('role', ()):
+                continue
+            relations.add('%s-%s' % (rtype, role))
+        purl = list(purl)
+        purl[4] = urlencode(params, doseq=True)
+        return urlunparse(purl)
+
+    def complete_item(self, item, rels):
+        try:
+            return self._parsed_urls[item['cwuri']]
+        except KeyError:
+            itemurl = self.complete_url(item['cwuri'], item['cwtype'], rels)
+            item_rels = list(self.parse(itemurl))
+            assert len(item_rels) == 1, 'url %s expected to bring back one '\
+                   'and only one entity, got %s' % (itemurl, len(item_rels))
+            self._parsed_urls[item['cwuri']] = item_rels[0]
+            if rels:
+                # XXX (do it better) merge relations
+                new_rels = item_rels[0][1]
+                new_rels.get('subject', {}).update(rels.get('subject', {}))
+                new_rels.get('object', {}).update(rels.get('object', {}))
+            return item_rels[0]
+
+
+class CWEntityXMLItemBuilder(Component):
+    __regid__ = 'cw.entityxml.item-builder'
+
+    def __init__(self, _cw, parser, node, **kwargs):
+        super(CWEntityXMLItemBuilder, self).__init__(_cw, **kwargs)
+        self.parser = parser
+        self.node = node
+
+    def build_item(self):
+        """parse a XML document node and return two dictionaries defining (part
+        of) an entity:
+
+        - {attribute: value}
+        - {role: {relation: [(related item, related rels)...]}
+        """
+        node = self.node
+        item = dict(node.attrib.items())
+        item['cwtype'] = text_type(node.tag)
+        item.setdefault('cwsource', None)
+        try:
+            item['eid'] = int(item['eid'])
+        except KeyError:
+            # cw < 3.11 compat mode XXX
+            item['eid'] = int(node.find('eid').text)
+            item['cwuri'] = node.find('cwuri').text
+        rels = {}
+        for child in node:
+            role = child.get('role')
+            if role:
+                # relation
+                related = rels.setdefault(role, {}).setdefault(child.tag, [])
+                related += self.parser.parse_etree(child)
+            elif child.text:
+                # attribute
+                item[child.tag] = text_type(child.text)
+            else:
+                # None attribute (empty tag)
+                item[child.tag] = None
+        return item, rels
+
+
+class CWEntityXMLActionCopy(Component):
+    """implementation of cubicweb entity xml parser's'copy' action
+
+    Takes no option.
+    """
+    __regid__ = 'cw.entityxml.action.copy'
+
+    def __init__(self, _cw, parser, rtype, role, entity=None, **kwargs):
+        super(CWEntityXMLActionCopy, self).__init__(_cw, **kwargs)
+        self.parser = parser
+        self.rtype = rtype
+        self.role = role
+        self.entity = entity
+
+    @classproperty
+    def action(cls):
+        return cls.__regid__.rsplit('.', 1)[-1]
+
+    def check_options(self, options, eid):
+        self._check_no_options(options, eid)
+
+    def _check_no_options(self, options, eid, msg=None):
+        if options:
+            if msg is None:
+                msg = self._cw._("'%s' action doesn't take any options") % self.action
+            raise ValidationError(eid, {rn('options', 'subject'): msg})
+
+    def link_items(self, others, rules):
+        assert not any(x[1] for x in rules), "'copy' action takes no option"
+        ttypes = frozenset([x[0] for x in rules])
+        eids = [] # local eids
+        for item, rels in others:
+            if item['cwtype'] in ttypes:
+                item, rels = self.parser.complete_item(item, rels)
+                other_entity = self.parser.process_item(item, rels)
+                if other_entity is not None:
+                    eids.append(other_entity.eid)
+        if eids:
+            self._set_relation(eids)
+        else:
+            self._clear_relation(ttypes)
+
+    def _clear_relation(self, ttypes):
+        if not self.parser.created_during_pull(self.entity):
+            if len(ttypes) > 1:
+                typerestr = ', Y is IN(%s)' % ','.join(ttypes)
+            else:
+                typerestr = ', Y is %s' % ','.join(ttypes)
+            self._cw.execute('DELETE ' + rtype_role_rql(self.rtype, self.role) + typerestr,
+                             {'x': self.entity.eid})
+
+    def _set_relation(self, eids):
+        assert eids
+        rtype = self.rtype
+        rqlbase = rtype_role_rql(rtype, self.role)
+        eidstr = ','.join(str(eid) for eid in eids)
+        self._cw.execute('DELETE %s, NOT Y eid IN (%s)' % (rqlbase, eidstr),
+                         {'x': self.entity.eid})
+        if self.role == 'object':
+            rql = 'SET %s, Y eid IN (%s), NOT Y %s X' % (rqlbase, eidstr, rtype)
+        else:
+            rql = 'SET %s, Y eid IN (%s), NOT X %s Y' % (rqlbase, eidstr, rtype)
+        self._cw.execute(rql, {'x': self.entity.eid})
+
+
+class CWEntityXMLActionLink(CWEntityXMLActionCopy):
+    """implementation of cubicweb entity xml parser's'link' action
+
+    requires a 'linkattr' option to control search of the linked entity.
+    """
+    __regid__ = 'cw.entityxml.action.link'
+
+    def check_options(self, options, eid):
+        if not 'linkattr' in options:
+            msg = self._cw._("'%s' action requires 'linkattr' option") % self.action
+            raise ValidationError(eid, {rn('options', 'subject'): msg})
+
+    create_when_not_found = False
+
+    def link_items(self, others, rules):
+        for ttype, options in rules:
+            searchattrs = splitstrip(options.get('linkattr', ''))
+            self._related_link(ttype, others, searchattrs)
+
+    def _related_link(self, ttype, others, searchattrs):
+        def issubset(x, y):
+            return all(z in y for z in x)
+        eids = [] # local eids
+        log = self.parser.import_log
+        for item, rels in others:
+            if item['cwtype'] != ttype:
+                continue
+            if not issubset(searchattrs, item):
+                item, rels = self.parser.complete_item(item, rels)
+                if not issubset(searchattrs, item):
+                    log.record_error('missing attribute, got %s expected keys %s'
+                                     % (item, searchattrs))
+                    continue
+            # XXX str() needed with python < 2.6
+            kwargs = dict((str(attr), item[attr]) for attr in searchattrs)
+            targets = self._find_entities(item, kwargs)
+            if len(targets) == 1:
+                entity = targets[0]
+            elif not targets and self.create_when_not_found:
+                entity = self._cw.create_entity(item['cwtype'], **kwargs)
+            else:
+                if len(targets) > 1:
+                    log.record_error('ambiguous link: found %s entity %s with attributes %s'
+                                     % (len(targets), item['cwtype'], kwargs))
+                else:
+                    log.record_error('can not find %s entity with attributes %s'
+                                     % (item['cwtype'], kwargs))
+                continue
+            eids.append(entity.eid)
+            self.parser.process_relations(entity, rels)
+        if eids:
+            self._set_relation(eids)
+        else:
+            self._clear_relation((ttype,))
+
+    def _find_entities(self, item, kwargs):
+        return tuple(self._cw.find(item['cwtype'], **kwargs).entities())
+
+
+class CWEntityXMLActionLinkInState(CWEntityXMLActionLink):
+    """custom implementation of cubicweb entity xml parser's'link' action for
+    in_state relation
+    """
+    __select__ = match_rtype('in_state')
+
+    def check_options(self, options, eid):
+        super(CWEntityXMLActionLinkInState, self).check_options(options, eid)
+        if not 'name' in options['linkattr']:
+            msg = self._cw._("'%s' action for in_state relation should at least have 'linkattr=name' option") % self.action
+            raise ValidationError(eid, {rn('options', 'subject'): msg})
+
+    def _find_entities(self, item, kwargs):
+        assert 'name' in item # XXX else, complete_item
+        state_name = item['name']
+        wf = self.entity.cw_adapt_to('IWorkflowable').current_workflow
+        state = wf.state_by_name(state_name)
+        if state is None:
+            return ()
+        return (state,)
+
+
+class CWEntityXMLActionLinkOrCreate(CWEntityXMLActionLink):
+    """implementation of cubicweb entity xml parser's'link-or-create' action
+
+    requires a 'linkattr' option to control search of the linked entity.
+    """
+    __regid__ = 'cw.entityxml.action.link-or-create'
+    create_when_not_found = True