server/sources/datafeed.py
changeset 10516 4c59409220b6
parent 10143 2eb06486273a
child 10522 1660a0fa4f43
equal deleted inserted replaced
10515:70ed2067fdb5 10516:4c59409220b6
    23 import StringIO
    23 import StringIO
    24 from os.path import exists
    24 from os.path import exists
    25 from datetime import datetime, timedelta
    25 from datetime import datetime, timedelta
    26 from base64 import b64decode
    26 from base64 import b64decode
    27 from cookielib import CookieJar
    27 from cookielib import CookieJar
    28 
    28 import urlparse
    29 from lxml import etree
    29 from lxml import etree
    30 
    30 
    31 from cubicweb import RegistryNotFound, ObjectNotFound, ValidationError, UnknownEid
    31 from cubicweb import RegistryNotFound, ObjectNotFound, ValidationError, UnknownEid
    32 from cubicweb.server.repository import preprocess_inlined_relations
    32 from cubicweb.server.repository import preprocess_inlined_relations
    33 from cubicweb.server.sources import AbstractSource
    33 from cubicweb.server.sources import AbstractSource
   311         for mappedurl in URL_MAPPING:
   311         for mappedurl in URL_MAPPING:
   312             if url.startswith(mappedurl):
   312             if url.startswith(mappedurl):
   313                 return url.replace(mappedurl, URL_MAPPING[mappedurl], 1)
   313                 return url.replace(mappedurl, URL_MAPPING[mappedurl], 1)
   314         return url
   314         return url
   315 
   315 
   316     def retrieve_url(self, url, data=None, headers=None):
   316     def retrieve_url(self, url):
   317         """Return stream linked by the given url:
   317         """Return stream linked by the given url:
   318         * HTTP urls will be normalized (see :meth:`normalize_url`)
   318         * HTTP urls will be normalized (see :meth:`normalize_url`)
   319         * handle file:// URL
   319         * handle file:// URL
   320         * other will be considered as plain content, useful for testing purpose
   320         * other will be considered as plain content, useful for testing purpose
   321         """
   321 
   322         if headers is None:
   322         For http URLs, it will try to find a cwclientlib config entry
   323             headers = {}
   323         (if available) and use it as requester.
   324         if url.startswith('http'):
   324         """
   325             url = self.normalize_url(url)
   325         purl = urlparse.urlparse(url)
   326             if data:
   326         if purl.scheme == 'file':
   327                 self.source.info('POST %s %s', url, data)
   327             return URLLibResponseAdapter(open(url[7:]), url)
   328             else:
   328 
   329                 self.source.info('GET %s', url)
   329         url = self.normalize_url(url)
   330             req = urllib2.Request(url, data, headers)
   330 
       
   331         # first, try to use cwclientlib if it's available and if the
       
   332         # url matches a configuration entry in ~/.config/cwclientlibrc
       
   333         try:
       
   334             from cwclientlib import cwproxy_for
       
   335             # parse url again since it has been normalized
       
   336             cnx = cwproxy_for(url)
       
   337             cnx.timeout = self.source.http_timeout
       
   338             self.source.info('Using cwclientlib for %s' % url)
       
   339             resp = cnx.get(url)
       
   340             resp.raise_for_status()
       
   341             return URLLibResponseAdapter(StringIO.StringIO(resp.text), url)
       
   342         except (ImportError, ValueError) as exc:
       
   343             # ImportError: not available
       
   344             # ValueError: no config entry found
       
   345             self.source.debug(str(exc))
       
   346 
       
   347         # no chance with cwclientlib, fall back to former implementation
       
   348         if purl.scheme in ('http', 'https'):
       
   349             self.source.info('GET %s', url)
       
   350             req = urllib2.Request(url)
   331             return _OPENER.open(req, timeout=self.source.http_timeout)
   351             return _OPENER.open(req, timeout=self.source.http_timeout)
   332         if url.startswith('file://'):
   352 
   333             return URLLibResponseAdapter(open(url[7:]), url)
   353         # url is probably plain content
   334         return URLLibResponseAdapter(StringIO.StringIO(url), url)
   354         return URLLibResponseAdapter(StringIO.StringIO(url), url)
   335 
   355 
   336     def add_schema_config(self, schemacfg, checkonly=False):
   356     def add_schema_config(self, schemacfg, checkonly=False):
   337         """added CWSourceSchemaConfig, modify mapping accordingly"""
   357         """added CWSourceSchemaConfig, modify mapping accordingly"""
   338         msg = schemacfg._cw._("this parser doesn't use a mapping")
   358         msg = schemacfg._cw._("this parser doesn't use a mapping")
   481 
   501 
   482     def process_item(self, *args, **kwargs):
   502     def process_item(self, *args, **kwargs):
   483         raise NotImplementedError
   503         raise NotImplementedError
   484 
   504 
   485     def is_deleted(self, extid, etype, eid):
   505     def is_deleted(self, extid, etype, eid):
   486         if extid.startswith('http'):
    506         if extid.startswith('file://'):
        
    507             return exists(extid[7:])
       
   508 
       
   509         url = self.normalize_url(extid)
       
   510         # first, try to use cwclientlib if it's available and if the
       
   511         # url matches a configuration entry in ~/.config/cwclientlibrc
       
   512         try:
       
   513             from cwclientlib import cwproxy_for
       
   514             # parse url again since it has been normalized
       
   515             cnx = cwproxy_for(url)
       
   516             cnx.timeout = self.source.http_timeout
       
   517             self.source.info('Using cwclientlib for checking %s' % url)
       
   518             return cnx.get(url).status_code == 404
       
   519         except (ImportError, ValueError) as exc:
       
   520             # ImportError: not available
       
   521             # ValueError: no config entry found
       
   522             self.source.debug(str(exc))
       
   523 
       
   524         # no chance with cwclientlib, fall back to former implementation
       
   525         if urlparse.urlparse(url).scheme in ('http', 'https'):
   487             try:
   526             try:
   488                 _OPENER.open(self.normalize_url(extid), # XXX HTTP HEAD request
   527                 _OPENER.open(url, timeout=self.source.http_timeout)
   489                              timeout=self.source.http_timeout)
       
   490             except urllib2.HTTPError as ex:
   528             except urllib2.HTTPError as ex:
   491                 if ex.code == 404:
   529                 if ex.code == 404:
   492                     return True
   530                     return True
   493         elif extid.startswith('file://'):
       
   494             return exists(extid[7:])
       
   495         return False
   531         return False
   496 
   532 
   497 
   533 
   498 class URLLibResponseAdapter(object):
   534 class URLLibResponseAdapter(object):
   499     """Thin wrapper to be used to fake a value returned by urllib2.urlopen"""
   535     """Thin wrapper to be used to fake a value returned by urllib2.urlopen"""