[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
HTTP based URLs will be handled by a CWProxy if:
- cwclientlib is available and
- the URL matches a cwclientlibrc config entry
Otherwise, fallback to previous implementation.
See https://www.cubicweb.org/project/cwclientlib
:Warning:
This comes with an API modification of DataFeedParser.retrieve_url: it used
to accept data and headers arguments to build GET or POST (if data was given)
queries, but this was not used by any known code and implied more complicated
code.
--- a/debian/control Thu Jun 11 10:17:41 2015 +0200
+++ b/debian/control Thu Jun 11 11:45:45 2015 +0200
@@ -58,9 +58,10 @@
| python-pysqlite2,
python-passlib
Recommends:
- cubicweb-documentation (= ${source:Version})
+ cubicweb-documentation (= ${source:Version}),
Suggests:
- python-zmq
+ python-zmq,
+ python-cwclientlib (>= 0.4.0),
Description: server part of the CubicWeb framework
CubicWeb is a semantic web application framework.
.
--- a/server/sources/datafeed.py Thu Jun 11 10:17:41 2015 +0200
+++ b/server/sources/datafeed.py Thu Jun 11 11:45:45 2015 +0200
@@ -25,7 +25,7 @@
from datetime import datetime, timedelta
from base64 import b64decode
from cookielib import CookieJar
-
+import urlparse
from lxml import etree
from cubicweb import RegistryNotFound, ObjectNotFound, ValidationError, UnknownEid
@@ -313,24 +313,44 @@
return url.replace(mappedurl, URL_MAPPING[mappedurl], 1)
return url
- def retrieve_url(self, url, data=None, headers=None):
+ def retrieve_url(self, url):
"""Return stream linked by the given url:
* HTTP urls will be normalized (see :meth:`normalize_url`)
* handle file:// URL
* other will be considered as plain content, useful for testing purpose
+
+ For http URLs, it will try to find a cwclientlib config entry
+ (if available) and use it as requester.
"""
- if headers is None:
- headers = {}
- if url.startswith('http'):
- url = self.normalize_url(url)
- if data:
- self.source.info('POST %s %s', url, data)
- else:
- self.source.info('GET %s', url)
- req = urllib2.Request(url, data, headers)
+ purl = urlparse.urlparse(url)
+ if purl.scheme == 'file':
+ return URLLibResponseAdapter(open(url[7:]), url)
+
+ url = self.normalize_url(url)
+
+ # first, try to use cwclientlib if it's available and if the
+ # url matches a configuration entry in ~/.config/cwclientlibrc
+ try:
+ from cwclientlib import cwproxy_for
+ # parse url again since it has been normalized
+ cnx = cwproxy_for(url)
+ cnx.timeout = self.source.http_timeout
+ self.source.info('Using cwclientlib for %s' % url)
+ resp = cnx.get(url)
+ resp.raise_for_status()
+ return URLLibResponseAdapter(StringIO.StringIO(resp.text), url)
+ except (ImportError, ValueError) as exc:
+ # ImportError: not available
+ # ValueError: no config entry found
+ self.source.debug(str(exc))
+
+ # no chance with cwclientlib, fall back to former implementation
+ if purl.scheme in ('http', 'https'):
+ self.source.info('GET %s', url)
+ req = urllib2.Request(url)
return _OPENER.open(req, timeout=self.source.http_timeout)
- if url.startswith('file://'):
- return URLLibResponseAdapter(open(url[7:]), url)
+
+ # url is probably plain content
return URLLibResponseAdapter(StringIO.StringIO(url), url)
def add_schema_config(self, schemacfg, checkonly=False):
@@ -483,15 +503,31 @@
raise NotImplementedError
def is_deleted(self, extid, etype, eid):
- if extid.startswith('http'):
+ if extid.startswith('file://'):
+ return exists(extid[7:])
+
+ url = self.normalize_url(extid)
+ # first, try to use cwclientlib if it's available and if the
+ # url matches a configuration entry in ~/.config/cwclientlibrc
+ try:
+ from cwclientlib import cwproxy_for
+ # parse url again since it has been normalized
+ cnx = cwproxy_for(url)
+ cnx.timeout = self.source.http_timeout
+ self.source.info('Using cwclientlib for checking %s' % url)
+ return cnx.get(url).status_code == 404
+ except (ImportError, ValueError) as exc:
+ # ImportError: not available
+ # ValueError: no config entry found
+ self.source.debug(str(exc))
+
+ # no chance with cwclientlib, fall back to former implementation
+ if urlparse.urlparse(url).scheme in ('http', 'https'):
try:
- _OPENER.open(self.normalize_url(extid), # XXX HTTP HEAD request
- timeout=self.source.http_timeout)
+ _OPENER.open(url, timeout=self.source.http_timeout)
except urllib2.HTTPError as ex:
if ex.code == 404:
return True
- elif extid.startswith('file://'):
- return exists(extid[7:])
return False