[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
author David Douard <david.douard@logilab.fr>
Thu, 11 Jun 2015 11:45:45 +0200
changeset 10516 4c59409220b6
parent 10515 70ed2067fdb5
child 10517 fa9a0c80556d
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)

HTTP based URLs will be handled by a CWProxy if:
- cwclientlib is available, and
- the URL matches a cwclientlibrc config entry.

Otherwise, fall back to the previous implementation. See https://www.cubicweb.org/project/cwclientlib

:Warning: This comes with an API modification of DataFeedParser.retrieve_url: it used to accept data and headers arguments to build GET or POST (if data was given) queries, but this was not used by any known code and implied more complicated code.
debian/control
server/sources/datafeed.py
--- a/debian/control	Thu Jun 11 10:17:41 2015 +0200
+++ b/debian/control	Thu Jun 11 11:45:45 2015 +0200
@@ -58,9 +58,10 @@
  | python-pysqlite2,
  python-passlib
 Recommends:
- cubicweb-documentation (= ${source:Version})
+ cubicweb-documentation (= ${source:Version}),
 Suggests:
- python-zmq
+ python-zmq,
+ python-cwclientlib (>= 0.4.0),
 Description: server part of the CubicWeb framework
  CubicWeb is a semantic web application framework.
  .
--- a/server/sources/datafeed.py	Thu Jun 11 10:17:41 2015 +0200
+++ b/server/sources/datafeed.py	Thu Jun 11 11:45:45 2015 +0200
@@ -25,7 +25,7 @@
 from datetime import datetime, timedelta
 from base64 import b64decode
 from cookielib import CookieJar
-
+import urlparse
 from lxml import etree
 
 from cubicweb import RegistryNotFound, ObjectNotFound, ValidationError, UnknownEid
@@ -313,24 +313,44 @@
                 return url.replace(mappedurl, URL_MAPPING[mappedurl], 1)
         return url
 
-    def retrieve_url(self, url, data=None, headers=None):
+    def retrieve_url(self, url):
         """Return stream linked by the given url:
         * HTTP urls will be normalized (see :meth:`normalize_url`)
         * handle file:// URL
         * other will be considered as plain content, useful for testing purpose
+
+        For non-file URLs, cwclientlib is tried first (when it is installed
+        and has a config entry for the URL); otherwise urllib2 is used.
         """
-        if headers is None:
-            headers = {}
-        if url.startswith('http'):
-            url = self.normalize_url(url)
-            if data:
-                self.source.info('POST %s %s', url, data)
-            else:
-                self.source.info('GET %s', url)
-            req = urllib2.Request(url, data, headers)
+        purl = urlparse.urlparse(url)
+        if purl.scheme == 'file':
+            return URLLibResponseAdapter(open(url[7:]), url)
+
+        url = self.normalize_url(url)
+
+        # first, try to use cwclientlib if it's available and if the
+        # url matches a configuration entry in ~/.config/cwclientlibrc
+        try:
+            from cwclientlib import cwproxy_for
+            # look the proxy up with the normalized url, not the original one
+            cnx = cwproxy_for(url)
+            cnx.timeout = self.source.http_timeout
+            self.source.info('Using cwclientlib for %s' % url)
+            resp = cnx.get(url)
+            resp.raise_for_status()
+            return URLLibResponseAdapter(StringIO.StringIO(resp.text), url)
+        except (ImportError, ValueError) as exc:
+            # ImportError: not available
+            # ValueError: no config entry found
+            self.source.debug(str(exc))
+
+        # no chance with cwclientlib, fall back to former implementation
+        if purl.scheme in ('http', 'https'):
+            self.source.info('GET %s', url)
+            req = urllib2.Request(url)
             return _OPENER.open(req, timeout=self.source.http_timeout)
-        if url.startswith('file://'):
-            return URLLibResponseAdapter(open(url[7:]), url)
+
+        # url is probably plain content
         return URLLibResponseAdapter(StringIO.StringIO(url), url)
 
     def add_schema_config(self, schemacfg, checkonly=False):
@@ -483,15 +503,31 @@
         raise NotImplementedError
 
     def is_deleted(self, extid, etype, eid):
-        if extid.startswith('http'):
+        if extid.startswith('file://'):
+            return exists(extid[7:])  # NOTE(review): True if file exists; confirm
+
+        url = self.normalize_url(extid)
+        # first, try to use cwclientlib if it's available and if the
+        # url matches a configuration entry in ~/.config/cwclientlibrc
+        try:
+            from cwclientlib import cwproxy_for
+            # look the proxy up with the normalized url, not the raw extid
+            cnx = cwproxy_for(url)
+            cnx.timeout = self.source.http_timeout
+            self.source.info('Using cwclientlib for checking %s' % url)
+            return cnx.get(url).status_code == 404
+        except (ImportError, ValueError) as exc:
+            # ImportError: not available
+            # ValueError: no config entry found
+            self.source.debug(str(exc))
+
+        # no chance with cwclientlib, fall back to former implementation
+        if urlparse.urlparse(url).scheme in ('http', 'https'):
             try:
-                _OPENER.open(self.normalize_url(extid), # XXX HTTP HEAD request
-                             timeout=self.source.http_timeout)
+                _OPENER.open(url, timeout=self.source.http_timeout)
             except urllib2.HTTPError as ex:
                 if ex.code == 404:
                     return True
-        elif extid.startswith('file://'):
-            return exists(extid[7:])
         return False