# HG changeset patch
# User Sylvain Thénault
# Date 1328189407 -3600
# Node ID 981f6e487788247b173ab997c874f0e9790da130
# Parent 341c57b39dc93ce8be49255029633b6e1caccd1f
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381

Parsers now have to implement the `is_deleted(extid, etype, eid)` method,
which should return True when the given entity no longer exists in the
external source, and False otherwise. Only entities for which it returns
True are deleted.

diff -r 341c57b39dc9 -r 981f6e487788 server/sources/datafeed.py
--- a/server/sources/datafeed.py	Tue Jan 31 11:49:15 2012 +0100
+++ b/server/sources/datafeed.py	Thu Feb 02 14:30:07 2012 +0100
@@ -1,4 +1,4 @@
-# copyright 2010-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# copyright 2010-2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
 #
 # This file is part of CubicWeb.
@@ -22,6 +22,7 @@
 
 import urllib2
 import StringIO
+from os.path import exists
 from datetime import datetime, timedelta
 from base64 import b64decode
 from cookielib import CookieJar
@@ -199,11 +200,12 @@
         if self.process_urls(parser, self.urls, raise_on_error):
             self.warning("some error occured, don't attempt to delete entities")
         elif self.config['delete-entities'] and myuris:
             byetype = {}
-            for eid, etype in myuris.values():
-                byetype.setdefault(etype, []).append(str(eid))
-            self.error('delete %s entities %s', self.uri, byetype)
+            for extid, (eid, etype) in myuris.iteritems():
+                if parser.is_deleted(extid, etype, eid):
+                    byetype.setdefault(etype, []).append(str(eid))
             for etype, eids in byetype.iteritems():
+                self.warning('delete %s %s entities', len(eids), etype)
                 session.execute('DELETE %s X WHERE X eid IN (%s)'
                                 % (etype, ','.join(eids)))
         self.update_latest_retrieval(session)
@@ -276,6 +278,7 @@
         dataimport.init()
         return dataimport
 
+
 class DataFeedParser(AppObject):
     __registry__ = 'parsers'
 
@@ -287,6 +290,13 @@
         self.stats = {'created': set(),
                       'updated': set()}
 
+    def normalize_url(self, url):
+        from cubicweb.sobjects.parsers import URL_MAPPING
+        for mappedurl in URL_MAPPING:
+            if url.startswith(mappedurl):
+                return url.replace(mappedurl, URL_MAPPING[mappedurl], 1)
+        return url
+
     def add_schema_config(self, schemacfg, checkonly=False):
         """added CWSourceSchemaConfig, modify mapping accordingly"""
         msg = schemacfg._cw._("this parser doesn't use a mapping")
@@ -358,6 +368,12 @@
     def notify_updated(self, entity):
         return self.stats['updated'].add(entity.eid)
 
+    def is_deleted(self, extid, etype, eid):
+        """return True if the entity of given external id, entity type and eid
+        is actually deleted. Always return True by default, put more sensible
+        stuff in sub-classes.
+        """
+        return True
 
 class DataFeedXMLParser(DataFeedParser):
 
@@ -393,11 +409,7 @@
 
     def parse(self, url):
         if url.startswith('http'):
-            from cubicweb.sobjects.parsers import URL_MAPPING
-            for mappedurl in URL_MAPPING:
-                if url.startswith(mappedurl):
-                    url = url.replace(mappedurl, URL_MAPPING[mappedurl], 1)
-                    break
+            url = self.normalize_url(url)
             self.source.info('GET %s', url)
             stream = _OPENER.open(url)
         elif url.startswith('file://'):
@@ -412,6 +424,22 @@
     def process_item(self, *args):
         raise NotImplementedError
 
+    def is_deleted(self, extid, etype, eid):
+        """return True if the resource identified by `extid` is gone: HTTP
+        404 for http urls, missing file for file:// urls. Other HTTP errors
+        and unknown url schemes are considered as still existing.
+        """
+        if extid.startswith('http'):
+            try:
+                _OPENER.open(self.normalize_url(extid)) # XXX HTTP HEAD request
+            except urllib2.HTTPError, ex:
+                if ex.code == 404:
+                    return True
+        elif extid.startswith('file://'):
+            # deleted means the file is *not* there anymore
+            return not exists(extid[7:])
+        return False
+
 # use a cookie enabled opener to use session cookie if any
 _OPENER = urllib2.build_opener()
 try: