# Recovered from a changeset diff that removes this module:
#   diff -r 058bb3dc685f -r 0b59724cb3f2 sobjects/cwxmlparser.py
#   --- a/sobjects/cwxmlparser.py Mon Jan 04 18:40:30 2016 +0100
#   +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
#   @@ -1,500 +0,0 @@
#
# copyright 2010-2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of CubicWeb.
#
# CubicWeb is free software: you can redistribute it and/or modify it under the
# terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with CubicWeb.  If not, see http://www.gnu.org/licenses/.
"""datafeed parser for xml generated by cubicweb

Example of mapping for CWEntityXMLParser::

  {u'CWUser': {                                        # EntityType
      (u'in_group', u'subject', u'link'): [            # (rtype, role, action)
          (u'CWGroup', {u'linkattr': u'name'})],       #   -> rules = [(EntityType, options), ...]
      (u'tags', u'object', u'link-or-create'): [       # (...)
          (u'Tag', {u'linkattr': u'name'})],           #   -> ...
      (u'use_email', u'subject', u'copy'): [           # (...)
          (u'EmailAddress', {})]                       #   -> ...
      }
   }

"""

from datetime import datetime, time
import urllib

from six import text_type
from six.moves.urllib.parse import urlparse, urlunparse, parse_qs, urlencode

import pytz
from logilab.common.date import todate, totime
from logilab.common.textutils import splitstrip, text_to_dict
from logilab.common.decorators import classproperty

from yams.constraints import BASE_CONVERTERS
from yams.schema import role_name as rn

from cubicweb import ValidationError, RegistryException
from cubicweb.view import Component
from cubicweb.server.sources import datafeed
from cubicweb.server.hook import match_rtype

# XXX see cubicweb.cwvreg.YAMS_TO_PY
# XXX see cubicweb.web.views.xmlrss.SERIALIZERS
# Start from a copy of the yams converters so the overrides below stay local
# to this module.
DEFAULT_CONVERTERS = BASE_CONVERTERS.copy()
DEFAULT_CONVERTERS.update({
    'String': text_type,
    'Password': lambda x: x.encode('utf8'),
})


def convert_date(ustr):
    """Parse a '%Y-%m-%d' string and hand the result to `todate`."""
    parsed = datetime.strptime(ustr, '%Y-%m-%d')
    return todate(parsed)


DEFAULT_CONVERTERS['Date'] = convert_date


def convert_datetime(ustr):
    """Parse a '%Y-%m-%d %H:%M:%S' string into a naive `datetime`.

    Anything following the first '.' is dropped before parsing.
    """
    # assume %Y-%m-%d %H:%M:%S.mmmmmm when a '.' is present
    seconds_part = ustr.partition('.')[0]
    return datetime.strptime(seconds_part, '%Y-%m-%d %H:%M:%S')


DEFAULT_CONVERTERS['Datetime'] = convert_datetime

# XXX handle timezone, though this will be enough as TZDatetime are
# serialized without time zone by default (UTC time). See
# cw.web.views.xmlrss.SERIALIZERS.
def convert_tzdatetime(ustr):
    """Parse `ustr` with `convert_datetime` and tag the result as UTC."""
    return convert_datetime(ustr).replace(tzinfo=pytz.utc)


DEFAULT_CONVERTERS['TZDatetime'] = convert_tzdatetime


def convert_time(ustr):
    """Parse a '%H:%M:%S' string and hand the result to `totime`."""
    return totime(datetime.strptime(ustr, '%H:%M:%S'))


DEFAULT_CONVERTERS['Time'] = convert_time
DEFAULT_CONVERTERS['TZTime'] = convert_time


def convert_interval(ustr):
    """Convert a string holding an integer number of seconds to a timedelta.

    `datetime.time` accepts no `seconds` argument, so a duration has to be
    built as a `timedelta`.
    """
    # local import: the module only imports `datetime` and `time` at top level
    from datetime import timedelta
    return timedelta(seconds=int(ustr))


DEFAULT_CONVERTERS['Interval'] = convert_interval


def extract_typed_attrs(eschema, stringdict, converters=DEFAULT_CONVERTERS):
    """Return a {attribute name: converted value} dictionary.

    Every final subject relation of `eschema` that is a key of `stringdict`
    is converted using the entry of `converters` matching its destination
    type. None values are kept as None; 'eid', 'cwuri', 'cwtype' and
    'cwsource' are skipped. `converters` is only read, never modified.
    """
    typeddict = {}
    for rschema in eschema.subject_relations():
        if not (rschema.final and rschema in stringdict):
            continue
        if rschema in ('eid', 'cwuri', 'cwtype', 'cwsource'):
            continue
        attrtype = eschema.destination(rschema)
        value = stringdict[rschema]
        if value is not None:
            value = converters[attrtype](value)
        typeddict[rschema.type] = value
    return typeddict


def rtype_role_rql(rtype, role):
    """Return the RQL fragment relating X (eid given as %(x)s) to Y through
    `rtype`, X being the object of the relation when `role` is 'object' and
    its subject otherwise.
    """
    if role == 'object':
        return 'Y %s X WHERE X eid %%(x)s' % rtype
    return 'X %s Y WHERE X eid %%(x)s' % rtype


class CWEntityXMLParser(datafeed.DataFeedXMLParser):
    """datafeed parser for the 'xml' entity view

    Most of the logic is delegated to the following components:

    * an "item builder" component, turning an etree xml node into a specific
      python dictionary representing an entity

    * "action" components, selected given an entity, a relation and its role in
      the relation, and responsible to link the entity to given related items
      (eg dictionary)

    So the parser is only doing the gluing service and the connection to the
    source.
    """
    __regid__ = 'cw.entityxml'

    def __init__(self, *args, **kwargs):
        super(CWEntityXMLParser, self).__init__(*args, **kwargs)
        # cwuri -> (item, rels) pairs already fetched by `complete_item`
        self._parsed_urls = {}
        # eids of the entities already handled by `process_item`
        self._processed_entities = set()

    def select_linker(self, action, rtype, role, entity=None):
        """Select the 'cw.entityxml.action.<action>' component for the given
        relation type, role and (optional) entity.

        Raise RegistryException('Unknown action ...') when the selection fails.
        """
        try:
            return self._cw.vreg['components'].select(
                'cw.entityxml.action.%s' % action, self._cw, entity=entity,
                rtype=rtype, role=role, parser=self)
        except RegistryException:
            raise RegistryException('Unknown action %s' % action)

    def list_actions(self):
        """Return the sorted `action` attribute of the first class registered
        under each 'cw.entityxml.action.*' component id.
        """
        reg = self._cw.vreg['components']
        return sorted(clss[0].action for rid, clss in reg.items()
                      if rid.startswith('cw.entityxml.action.'))

    # mapping handling #########################################################

    def add_schema_config(self, schemacfg, checkonly=False):
        """added CWSourceSchemaConfig, modify mapping accordingly

        The 'role' and 'action' options are validated (ValidationError is
        raised on any problem); the source mapping is only modified when
        `checkonly` is false.
        """
        _ = self._cw._
        try:
            rtype = schemacfg.schema.rtype.name
        except AttributeError:
            msg = _("entity and relation types can't be mapped, only attributes "
                    "or relations")
            raise ValidationError(schemacfg.eid, {rn('cw_for_schema', 'subject'): msg})
        if schemacfg.options:
            options = text_to_dict(schemacfg.options)
        else:
            options = {}
        try:
            role = options.pop('role')
            if role not in ('subject', 'object'):
                # handled like a missing role
                raise KeyError
        except KeyError:
            msg = _('"role=subject" or "role=object" must be specified in options')
            raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
        try:
            action = options.pop('action')
            linker = self.select_linker(action, rtype, role)
            linker.check_options(options, schemacfg.eid)
        except KeyError:
            msg = _('"action" must be specified in options; allowed values are '
                    '%s') % ', '.join(self.list_actions())
            raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
        except RegistryException:
            msg = _('allowed values for "action" are %s') % ', '.join(self.list_actions())
            raise ValidationError(schemacfg.eid, {rn('options', 'subject'): msg})
        if not checkonly:
            if role == 'subject':
                etype = schemacfg.schema.stype.name
                ttype = schemacfg.schema.otype.name
            else:
                etype = schemacfg.schema.otype.name
                ttype = schemacfg.schema.stype.name
            etyperules = self.source.mapping.setdefault(etype, {})
            etyperules.setdefault((rtype, role, action), []).append(
                (ttype, options))
            self.source.mapping_idx[schemacfg.eid] = (
                etype, rtype, role, action, ttype)

    def del_schema_config(self, schemacfg, checkonly=False):
        """deleted CWSourceSchemaConfig, modify mapping accordingly

        As for `add_schema_config`, nothing is modified when `checkonly` is
        true. Otherwise the rules targeting the removed configuration's type
        are dropped from the mapping, the (rtype, role, action) entry is
        removed once it has no rule left, and the index entry is forgotten.
        """
        mapping_idx = self.source.mapping_idx
        etype, rtype, role, action, ttype = mapping_idx[schemacfg.eid]
        if checkonly:
            return
        key = (rtype, role, action)
        etyperules = self.source.mapping[etype]
        remaining = [rule for rule in etyperules[key] if rule[0] != ttype]
        if remaining:
            # store the filtered list back, else the removal would be lost
            etyperules[key] = remaining
        else:
            del etyperules[key]
        del mapping_idx[schemacfg.eid]

    # import handling ##########################################################

    def process(self, url, raise_on_error=False):
        """IDataFeedParser main entry point

        Urls starting with 'http' go through `complete_url` before being
        handed to the parent class.
        """
        if url.startswith('http'):  # XXX similar loose test as in parse of sources.datafeed
            url = self.complete_url(url)
        super(CWEntityXMLParser, self).process(url, raise_on_error)

    def parse_etree(self, parent):
        """Yield what the 'cw.entityxml.item-builder' component's `build_item`
        returns for each child node of `parent`.
        """
        for node in list(parent):
            builder = self._cw.vreg['components'].select(
                'cw.entityxml.item-builder', self._cw, node=node,
                parser=self)
            yield builder.build_item()

    def process_item(self, item, rels, raise_on_error=False):
        """
        item and rels are what's returned by the item builder `build_item` method:

        * `item` is an {attribute: value} dictionary
        * `rels` is for relations and structured as
           {role: {relation: [(related item, related rels)...]}

        Return the matching entity, or None when `extid2entity` gives none.
        An entity whose eid was already processed is returned untouched.
        """
        entity = self.extid2entity(item['cwuri'].encode('ascii'), item['cwtype'],
                                   cwsource=item['cwsource'], item=item,
                                   raise_on_error=raise_on_error)
        if entity is None:
            return None
        if entity.eid in self._processed_entities:
            return entity
        self._processed_entities.add(entity.eid)
        if not (self.created_during_pull(entity) or self.updated_during_pull(entity)):
            attrs = extract_typed_attrs(entity.e_schema, item)
            self.update_if_necessary(entity, attrs)
        self.process_relations(entity, rels)
        return entity

    def process_relations(self, entity, rels):
        """Run every action mapped for the entity's type on the related items
        found in `rels`.

        A mapped relation missing from `rels`, or an action without linker,
        is recorded as an error in the import log and skipped.
        """
        etype = entity.cw_etype
        for (rtype, role, action), rules in self.source.mapping.get(etype, {}).items():
            try:
                related_items = rels[role][rtype]
            except KeyError:
                self.import_log.record_error('relation %s-%s not found in xml export of %s'
                                             % (rtype, role, etype))
                continue
            try:
                linker = self.select_linker(action, rtype, role, entity)
            except RegistryException:
                self.import_log.record_error('no linker for action %s' % action)
            else:
                linker.link_items(related_items, rules)

    def before_entity_copy(self, entity, sourceparams):
        """IDataFeedParser callback

        Update `entity.cw_edited` with the typed attributes extracted from
        `sourceparams['item']`.
        """
        attrs = extract_typed_attrs(entity.e_schema, sourceparams['item'])
        entity.cw_edited.update(attrs)

    def normalize_url(self, url):
        """overridden to add vid=xml if vid is not set in the qs"""
        url = super(CWEntityXMLParser, self).normalize_url(url)
        purl = urlparse(url)
        if purl.scheme in ('http', 'https'):
            params = parse_qs(purl.query)
            if 'vid' not in params:
                params['vid'] = ['xml']
                purl = list(purl)
                # index 4 of the parsed url is the query string
                purl[4] = urlencode(params, doseq=True)
                return urlunparse(purl)
        return url

    def complete_url(self, url, etype=None, known_relations=None):
        """append to the url's query string information about relation that should
        be included in the resulting xml, according to source mapping.

        If etype is not specified, try to guess it using the last path part of
        the url, i.e. the format used by default in cubicweb to map all entities
        of a given type as in 'http://mysite.org/EntityType'.

        If `known_relations` is given, it should be a dictionary of already
        known relations ({role: {relation: ...}}), so they don't get queried
        again.

        The url is returned unchanged when the entity type is unknown.
        """
        purl = urlparse(url)
        params = parse_qs(purl.query)
        if etype is None:
            etype = purl.path.split('/')[-1]
        try:
            etype = self._cw.vreg.case_insensitive_etypes[etype.lower()]
        except KeyError:
            return url
        relations = set(params.get('relation', ()))
        for rtype, role, _action in self.source.mapping.get(etype, ()):
            # look the relation up under its actual role
            if known_relations and rtype in known_relations.get(role, ()):
                continue
            relations.add('%s-%s' % (rtype, role))
        # sorted so that the generated query string is deterministic
        params['relation'] = sorted(relations)
        purl = list(purl)
        # index 4 of the parsed url is the query string
        purl[4] = urlencode(params, doseq=True)
        return urlunparse(purl)

    def complete_item(self, item, rels):
        """Return the (item, rels) pair parsed from the item's own url.

        Results are cached per 'cwuri'. On a cache miss, the relations
        already known in `rels` are merged into the freshly parsed ones.
        """
        try:
            return self._parsed_urls[item['cwuri']]
        except KeyError:
            itemurl = self.complete_url(item['cwuri'], item['cwtype'], rels)
            item_rels = list(self.parse(itemurl))
            assert len(item_rels) == 1, 'url %s expected to bring back one '\
                   'and only one entity, got %s' % (itemurl, len(item_rels))
            self._parsed_urls[item['cwuri']] = item_rels[0]
            if rels:
                # XXX (do it better) merge relations
                new_rels = item_rels[0][1]
                for role in ('subject', 'object'):
                    known = rels.get(role)
                    if known:
                        # setdefault, so the merge isn't lost when the parsed
                        # relations have no entry for this role yet
                        new_rels.setdefault(role, {}).update(known)
            return item_rels[0]


class CWEntityXMLItemBuilder(Component):
    """component turning an etree xml node into an (item, rels) pair"""
    __regid__ = 'cw.entityxml.item-builder'

    def __init__(self, _cw, parser, node, **kwargs):
        super(CWEntityXMLItemBuilder, self).__init__(_cw, **kwargs)
        self.parser = parser
        self.node = node

    def build_item(self):
        """parse a XML document node and return two dictionaries defining (part
        of) an entity:

        - {attribute: value}
        - {role: {relation: [(related item, related rels)...]}
        """
        node = self.node
        item = dict(node.attrib.items())
        item['cwtype'] = text_type(node.tag)
        item.setdefault('cwsource', None)
        try:
            item['eid'] = int(item['eid'])
        except KeyError:
            # cw < 3.11 compat mode XXX
            item['eid'] = int(node.find('eid').text)
            item['cwuri'] = node.find('cwuri').text
        rels = {}
        for child in node:
            role = child.get('role')
            if role:
                # relation
                related = rels.setdefault(role, {}).setdefault(child.tag, [])
                related += self.parser.parse_etree(child)
            elif child.text:
                # attribute
                item[child.tag] = text_type(child.text)
            else:
                # None attribute (empty tag)
                item[child.tag] = None
        return item, rels


class CWEntityXMLActionCopy(Component):
    """implementation of cubicweb entity xml parser's 'copy' action

    Takes no option.
    """
    __regid__ = 'cw.entityxml.action.copy'

    def __init__(self, _cw, parser, rtype, role, entity=None, **kwargs):
        super(CWEntityXMLActionCopy, self).__init__(_cw, **kwargs)
        self.parser = parser
        self.rtype = rtype
        self.role = role
        self.entity = entity

    @classproperty
    def action(cls):
        """action name, i.e. the last dot-separated part of `__regid__`"""
        return cls.__regid__.rsplit('.', 1)[-1]

    def check_options(self, options, eid):
        """Raise ValidationError if any option is given."""
        self._check_no_options(options, eid)

    def _check_no_options(self, options, eid, msg=None):
        """Raise ValidationError on `eid` when `options` is not empty, using
        `msg` or a default message built from the action name.
        """
        if options:
            if msg is None:
                msg = self._cw._("'%s' action doesn't take any options") % self.action
            raise ValidationError(eid, {rn('options', 'subject'): msg})

    def link_items(self, others, rules):
        """Complete and process each related item whose type is targeted by
        `rules`, then set the relation to the resulting entities, or clear
        it when there are none.
        """
        assert not any(x[1] for x in rules), "'copy' action takes no option"
        ttypes = frozenset([x[0] for x in rules])
        eids = []  # local eids
        for item, rels in others:
            if item['cwtype'] in ttypes:
                item, rels = self.parser.complete_item(item, rels)
                other_entity = self.parser.process_item(item, rels)
                if other_entity is not None:
                    eids.append(other_entity.eid)
        if eids:
            self._set_relation(eids)
        else:
            self._clear_relation(ttypes)

    def _clear_relation(self, ttypes):
        """Delete the relation between the entity and targets of the given
        types, unless the entity was created during this pull.
        """
        if not self.parser.created_during_pull(self.entity):
            if len(ttypes) > 1:
                typerestr = ', Y is IN(%s)' % ','.join(ttypes)
            else:
                typerestr = ', Y is %s' % ','.join(ttypes)
            self._cw.execute('DELETE ' + rtype_role_rql(self.rtype, self.role) + typerestr,
                             {'x': self.entity.eid})

    def _set_relation(self, eids):
        """Make the relation hold exactly with the entities of the given
        (non empty) eids: delete links to others, then add missing ones.
        """
        assert eids
        rtype = self.rtype
        rqlbase = rtype_role_rql(rtype, self.role)
        eidstr = ','.join(str(eid) for eid in eids)
        self._cw.execute('DELETE %s, NOT Y eid IN (%s)' % (rqlbase, eidstr),
                         {'x': self.entity.eid})
        if self.role == 'object':
            rql = 'SET %s, Y eid IN (%s), NOT Y %s X' % (rqlbase, eidstr, rtype)
        else:
            rql = 'SET %s, Y eid IN (%s), NOT X %s Y' % (rqlbase, eidstr, rtype)
        self._cw.execute(rql, {'x': self.entity.eid})


class CWEntityXMLActionLink(CWEntityXMLActionCopy):
    """implementation of cubicweb entity xml parser's 'link' action

    requires a 'linkattr' option to control search of the linked entity.
    """
    __regid__ = 'cw.entityxml.action.link'

    def check_options(self, options, eid):
        """Raise ValidationError when the 'linkattr' option is missing."""
        if 'linkattr' not in options:
            msg = self._cw._("'%s' action requires 'linkattr' option") % self.action
            raise ValidationError(eid, {rn('options', 'subject'): msg})

    # whether an entity should be created when no existing one matches
    create_when_not_found = False

    def link_items(self, others, rules):
        """Link related items of each rule's target type, searching entities
        on the attributes listed by the rule's 'linkattr' option.
        """
        for ttype, options in rules:
            searchattrs = splitstrip(options.get('linkattr', ''))
            self._related_link(ttype, others, searchattrs)

    def _related_link(self, ttype, others, searchattrs):
        """Link the entity to the local entities matching the items of type
        `ttype` found in `others`.

        Items lacking one of `searchattrs` are completed from their own url
        first. Missing attributes, ambiguous matches and missing targets are
        recorded in the import log and the item is skipped. The relation is
        cleared when nothing could be linked.
        """
        def issubset(x, y):
            return all(z in y for z in x)
        eids = []  # local eids
        log = self.parser.import_log
        for item, rels in others:
            if item['cwtype'] != ttype:
                continue
            if not issubset(searchattrs, item):
                item, rels = self.parser.complete_item(item, rels)
                if not issubset(searchattrs, item):
                    log.record_error('missing attribute, got %s expected keys %s'
                                     % (item, searchattrs))
                    continue
            # XXX str() needed with python < 2.6
            kwargs = dict((str(attr), item[attr]) for attr in searchattrs)
            targets = self._find_entities(item, kwargs)
            if len(targets) == 1:
                entity = targets[0]
            elif not targets and self.create_when_not_found:
                entity = self._cw.create_entity(item['cwtype'], **kwargs)
            else:
                if len(targets) > 1:
                    log.record_error('ambiguous link: found %s entity %s with attributes %s'
                                     % (len(targets), item['cwtype'], kwargs))
                else:
                    log.record_error('can not find %s entity with attributes %s'
                                     % (item['cwtype'], kwargs))
                continue
            eids.append(entity.eid)
            self.parser.process_relations(entity, rels)
        if eids:
            self._set_relation(eids)
        else:
            self._clear_relation((ttype,))

    def _find_entities(self, item, kwargs):
        """Return a tuple of the entities of the item's type whose attributes
        match `kwargs`.
        """
        return tuple(self._cw.find(item['cwtype'], **kwargs).entities())


class CWEntityXMLActionLinkInState(CWEntityXMLActionLink):
    """custom implementation of cubicweb entity xml parser's 'link' action for
    in_state relation
    """
    __select__ = match_rtype('in_state')

    def check_options(self, options, eid):
        """Besides the parent's checks, require 'name' to be one of the
        attributes listed by the 'linkattr' option.
        """
        super(CWEntityXMLActionLinkInState, self).check_options(options, eid)
        # compare against the parsed attribute list, as link_items does, so
        # that a name merely containing 'name' isn't accepted
        if 'name' not in splitstrip(options['linkattr']):
            msg = self._cw._("'%s' action for in_state relation should at least have 'linkattr=name' option") % self.action
            raise ValidationError(eid, {rn('options', 'subject'): msg})

    def _find_entities(self, item, kwargs):
        """Return a 1-tuple holding the state named `item['name']` in the
        entity's current workflow, or an empty tuple if there is none.
        """
        assert 'name' in item  # XXX else, complete_item
        state_name = item['name']
        wf = self.entity.cw_adapt_to('IWorkflowable').current_workflow
        state = wf.state_by_name(state_name)
        if state is None:
            return ()
        return (state,)


class CWEntityXMLActionLinkOrCreate(CWEntityXMLActionLink):
    """implementation of cubicweb entity xml parser's 'link-or-create' action

    requires a 'linkattr' option to control search of the linked entity.
    """
    __regid__ = 'cw.entityxml.action.link-or-create'
    create_when_not_found = True