[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Thu, 02 Feb 2012 14:30:07 +0100
changeset 8187 981f6e487788
parent 8186 341c57b39dc9
child 8188 1867e252e487
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381. Parsers now have to implement the `is_deleted(extid, etype, eid)` method, which should return True when the given entity no longer exists in the external source and False otherwise.
server/sources/datafeed.py
--- a/server/sources/datafeed.py	Tue Jan 31 11:49:15 2012 +0100
+++ b/server/sources/datafeed.py	Thu Feb 02 14:30:07 2012 +0100
@@ -1,4 +1,4 @@
-# copyright 2010-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# copyright 2010-2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
 # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
 #
 # This file is part of CubicWeb.
@@ -22,6 +22,7 @@
 
 import urllib2
 import StringIO
+from os.path import exists
 from datetime import datetime, timedelta
 from base64 import b64decode
 from cookielib import CookieJar
@@ -199,11 +200,12 @@
         if self.process_urls(parser, self.urls, raise_on_error):
             self.warning("some error occured, don't attempt to delete entities")
         elif self.config['delete-entities'] and myuris:
             byetype = {}
-            for eid, etype in myuris.values():
-                byetype.setdefault(etype, []).append(str(eid))
-            self.error('delete %s entities %s', self.uri, byetype)
+            for extid, (eid, etype) in myuris.iteritems():
+                if parser.is_deleted(extid, etype, eid):  # only entities the parser confirms as deleted
+                    byetype.setdefault(etype, []).append(str(eid))
             for etype, eids in byetype.iteritems():
+                self.warning('delete %s %s entities', len(eids), etype)
                 session.execute('DELETE %s X WHERE X eid IN (%s)'
                                 % (etype, ','.join(eids)))
         self.update_latest_retrieval(session)
@@ -276,6 +277,7 @@
         dataimport.init()
         return dataimport
 
+
 class DataFeedParser(AppObject):
     __registry__ = 'parsers'
 
@@ -287,6 +289,13 @@
         self.stats = {'created': set(),
                       'updated': set()}
 
+    def normalize_url(self, url):  # rewrite the prefix of `url` according to URL_MAPPING
+        from cubicweb.sobjects.parsers import URL_MAPPING
+        for mappedurl in URL_MAPPING:
+            if url.startswith(mappedurl):
+                return url.replace(mappedurl, URL_MAPPING[mappedurl], 1)  # first matching prefix wins
+        return url  # no mapped prefix matched: url is returned unchanged
+
     def add_schema_config(self, schemacfg, checkonly=False):
         """added CWSourceSchemaConfig, modify mapping accordingly"""
         msg = schemacfg._cw._("this parser doesn't use a mapping")
@@ -358,6 +367,12 @@
     def notify_updated(self, entity):
         return self.stats['updated'].add(entity.eid)
 
+    def is_deleted(self, extid, etype, eid):
+        """Return True if the entity with the given external id, entity type
+        and eid is actually deleted. This default implementation always
+        returns True; sub-classes should override it with a real check.
+        """
+        return True
 
 class DataFeedXMLParser(DataFeedParser):
 
@@ -393,11 +408,7 @@
 
     def parse(self, url):
         if url.startswith('http'):
-            from cubicweb.sobjects.parsers import URL_MAPPING
-            for mappedurl in URL_MAPPING:
-                if url.startswith(mappedurl):
-                    url = url.replace(mappedurl, URL_MAPPING[mappedurl], 1)
-                    break
+            url = self.normalize_url(url)
             self.source.info('GET %s', url)
             stream = _OPENER.open(url)
         elif url.startswith('file://'):
@@ -412,6 +423,17 @@
     def process_item(self, *args):
         raise NotImplementedError
 
+    def is_deleted(self, extid, etype, eid):  # True only when the resource behind extid is gone
+        if extid.startswith('http'):
+            try:
+                _OPENER.open(self.normalize_url(extid)) # XXX HTTP HEAD request
+            except urllib2.HTTPError, ex:
+                if ex.code == 404:  # only "not found" counts as deleted
+                    return True
+        elif extid.startswith('file://'):
+            return not exists(extid[7:])  # deleted when the file (path after 'file://') is missing
+        return False  # fetch succeeded, other HTTP error, or unknown scheme: not deleted
+
 # use a cookie enabled opener to use session cookie if any
 _OPENER = urllib2.build_opener()
 try: