# HG changeset patch
# User David Douard
# Date 1434015945 -7200
# Node ID 4c59409220b6d7a7b1cbeea0c6dc6b6cdb8d4f30
# Parent 70ed2067fdb5f4c31c1afb2c87f3a32b16ec9421
[datafeed] allow using cwclientlib for datafeed's queries (closes #5456849)

HTTP based URLs will be handled by a CWProxy if:

- cwclientlib is available and
- the URL matches a cwclientlibrc config entry

Otherwise, fall back to the previous implementation.

See https://www.cubicweb.org/project/cwclientlib

:Warning: This comes with an API modification of
  DataFeedParser.retrieve_url: it used to accept data and headers
  arguments to build GET or POST (if data was given) queries, but this
  was not used by any known code and implied more complicated code.

diff -r 70ed2067fdb5 -r 4c59409220b6 debian/control
--- a/debian/control	Thu Jun 11 10:17:41 2015 +0200
+++ b/debian/control	Thu Jun 11 11:45:45 2015 +0200
@@ -58,9 +58,10 @@
   | python-pysqlite2,
  python-passlib
 Recommends:
- cubicweb-documentation (= ${source:Version})
+ cubicweb-documentation (= ${source:Version}),
 Suggests:
- python-zmq
+ python-zmq,
+ python-cwclientlib (>= 0.4.0),
 Description: server part of the CubicWeb framework
  CubicWeb is a semantic web application framework.
  .
diff -r 70ed2067fdb5 -r 4c59409220b6 server/sources/datafeed.py
--- a/server/sources/datafeed.py	Thu Jun 11 10:17:41 2015 +0200
+++ b/server/sources/datafeed.py	Thu Jun 11 11:45:45 2015 +0200
@@ -25,7 +25,7 @@
 from datetime import datetime, timedelta
 from base64 import b64decode
 from cookielib import CookieJar
-
+import urlparse
 from lxml import etree
 
 from cubicweb import RegistryNotFound, ObjectNotFound, ValidationError, UnknownEid
@@ -313,24 +313,44 @@
                 return url.replace(mappedurl, URL_MAPPING[mappedurl], 1)
         return url
 
-    def retrieve_url(self, url, data=None, headers=None):
+    def retrieve_url(self, url):
         """Return stream linked by the given url:
         * HTTP urls will be normalized (see :meth:`normalize_url`)
         * handle file:// URL
         * other will be considered as plain content, useful for testing purpose
+
+        For http URLs, it will try to find a cwclientlib config entry
+        (if available) and use it as requester.
         """
-        if headers is None:
-            headers = {}
-        if url.startswith('http'):
-            url = self.normalize_url(url)
-            if data:
-                self.source.info('POST %s %s', url, data)
-            else:
-                self.source.info('GET %s', url)
-            req = urllib2.Request(url, data, headers)
+        purl = urlparse.urlparse(url)
+        if purl.scheme == 'file':
+            return URLLibResponseAdapter(open(url[7:]), url)
+
+        url = self.normalize_url(url)
+
+        # first, try to use cwclientlib if it's available and if the
+        # url matches a configuration entry in ~/.config/cwclientlibrc
+        try:
+            from cwclientlib import cwproxy_for
+            # parse url again since it has been normalized
+            cnx = cwproxy_for(url)
+            cnx.timeout = self.source.http_timeout
+            self.source.info('Using cwclientlib for %s' % url)
+            resp = cnx.get(url)
+            resp.raise_for_status()
+            return URLLibResponseAdapter(StringIO.StringIO(resp.text), url)
+        except (ImportError, ValueError) as exc:
+            # ImportError: not available
+            # ValueError: no config entry found
+            self.source.debug(str(exc))
+
+        # no chance with cwclientlib, fall back to former implementation
+        if purl.scheme in ('http', 'https'):
+            self.source.info('GET %s', url)
+            req = urllib2.Request(url)
             return _OPENER.open(req, timeout=self.source.http_timeout)
-        if url.startswith('file://'):
-            return URLLibResponseAdapter(open(url[7:]), url)
+
+        # url is probably plain content
         return URLLibResponseAdapter(StringIO.StringIO(url), url)
 
     def add_schema_config(self, schemacfg, checkonly=False):
@@ -483,15 +503,31 @@
         raise NotImplementedError
 
     def is_deleted(self, extid, etype, eid):
-        if extid.startswith('http'):
+        if extid.startswith('file://'):
+            return exists(extid[7:])
+
+        url = self.normalize_url(extid)
+        # first, try to use cwclientlib if it's available and if the
+        # url matches a configuration entry in ~/.config/cwclientlibrc
+        try:
+            from cwclientlib import cwproxy_for
+            # parse url again since it has been normalized
+            cnx = cwproxy_for(url)
+            cnx.timeout = self.source.http_timeout
+            self.source.info('Using cwclientlib for checking %s' % url)
+            return cnx.get(url).status_code == 404
+        except (ImportError, ValueError) as exc:
+            # ImportError: not available
+            # ValueError: no config entry found
+            self.source.debug(str(exc))
+
+        # no chance with cwclientlib, fall back to former implementation
+        if urlparse.urlparse(url).scheme in ('http', 'https'):
             try:
-                _OPENER.open(self.normalize_url(extid), # XXX HTTP HEAD request
-                             timeout=self.source.http_timeout)
+                _OPENER.open(url, timeout=self.source.http_timeout)
             except urllib2.HTTPError as ex:
                 if ex.code == 404:
                     return True
-        elif extid.startswith('file://'):
-            return exists(extid[7:])
         return False
 
 