23 import StringIO |
23 import StringIO |
24 from os.path import exists |
24 from os.path import exists |
25 from datetime import datetime, timedelta |
25 from datetime import datetime, timedelta |
26 from base64 import b64decode |
26 from base64 import b64decode |
27 from cookielib import CookieJar |
27 from cookielib import CookieJar |
28 |
28 import urlparse |
29 from lxml import etree |
29 from lxml import etree |
30 |
30 |
31 from cubicweb import RegistryNotFound, ObjectNotFound, ValidationError, UnknownEid |
31 from cubicweb import RegistryNotFound, ObjectNotFound, ValidationError, UnknownEid |
32 from cubicweb.server.repository import preprocess_inlined_relations |
32 from cubicweb.server.repository import preprocess_inlined_relations |
33 from cubicweb.server.sources import AbstractSource |
33 from cubicweb.server.sources import AbstractSource |
311 for mappedurl in URL_MAPPING: |
311 for mappedurl in URL_MAPPING: |
312 if url.startswith(mappedurl): |
312 if url.startswith(mappedurl): |
313 return url.replace(mappedurl, URL_MAPPING[mappedurl], 1) |
313 return url.replace(mappedurl, URL_MAPPING[mappedurl], 1) |
314 return url |
314 return url |
315 |
315 |
def retrieve_url(self, url, data=None, headers=None):
    """Return stream linked by the given url:
    * HTTP urls will be normalized (see :meth:`normalize_url`)
    * handle file:// URL
    * other will be considered as plain content, useful for testing purpose

    For http URLs fetched with a plain GET (no `data`, no `headers`), it
    will try to find a cwclientlib config entry (if available) and use it
    as requester.

    :param url: an http(s) URL, a file:// URL, or the content itself
    :param data: optional request body; when given, the request is sent
      through urllib2 (cwclientlib is only used for plain GETs here)
    :param headers: optional dict of extra HTTP headers for the urllib2
      request
    :return: a file-like response object (either what `_OPENER.open`
      returns or a `URLLibResponseAdapter`)
    """
    if headers is None:
        headers = {}
    purl = urlparse.urlparse(url)
    if purl.scheme == 'file':
        # url[7:] strips the leading 'file://' to get the filesystem path
        return URLLibResponseAdapter(open(url[7:]), url)

    url = self.normalize_url(url)
    # scheme of the url as given by the caller (i.e. before normalization)
    is_http = purl.scheme in ('http', 'https')

    # first, try to use cwclientlib if it's available and if the
    # url matches a configuration entry in ~/.config/cwclientlibrc;
    # only `cnx.get` is used, so requests carrying a body or custom
    # headers go straight to urllib2
    if is_http and data is None and not headers:
        try:
            from cwclientlib import cwproxy_for
            cnx = cwproxy_for(url)
            cnx.timeout = self.source.http_timeout
            self.source.info('Using cwclientlib for %s', url)
            resp = cnx.get(url)
            resp.raise_for_status()
            return URLLibResponseAdapter(StringIO.StringIO(resp.text), url)
        except (ImportError, ValueError) as exc:
            # ImportError: not available
            # ValueError: no config entry found
            self.source.debug(str(exc))

    # no chance with cwclientlib, fall back to former implementation
    if is_http:
        if data:
            self.source.info('POST %s %s', url, data)
        else:
            self.source.info('GET %s', url)
        req = urllib2.Request(url, data, headers)
        return _OPENER.open(req, timeout=self.source.http_timeout)

    # url is probably plain content
    return URLLibResponseAdapter(StringIO.StringIO(url), url)
335 |
355 |
336 def add_schema_config(self, schemacfg, checkonly=False): |
356 def add_schema_config(self, schemacfg, checkonly=False): |
337 """added CWSourceSchemaConfig, modify mapping accordingly""" |
357 """added CWSourceSchemaConfig, modify mapping accordingly""" |
338 msg = schemacfg._cw._("this parser doesn't use a mapping") |
358 msg = schemacfg._cw._("this parser doesn't use a mapping") |
481 |
501 |
def process_item(self, *args, **kwargs):
    """Process one item of the parsed feed.

    Abstract hook: this implementation always raises
    `NotImplementedError`, whatever arguments are given.
    """
    raise NotImplementedError
484 |
504 |
def is_deleted(self, extid, etype, eid):
    """Tell whether the external resource identified by `extid` is gone.

    * file:// extids are checked on the local filesystem
    * otherwise the extid is normalized (see :meth:`normalize_url`) and
      probed through cwclientlib when it is importable and has a matching
      configuration entry, else through urllib2 for http(s) URLs; the
      resource is reported as deleted when the server answers 404

    `etype` and `eid` are not used by this implementation.

    :return: True or False; anything other than a 404 answer (or a
      non-http, non-file extid) yields False
    """
    if extid.startswith('file://'):
        # NOTE(review): this returns True when the file *exists*, which
        # looks inverted for an "is deleted" check -- kept as is since
        # callers may rely on it; confirm the intended semantics.
        return exists(extid[7:])

    url = self.normalize_url(extid)
    # first, try to use cwclientlib if it's available and if the
    # url matches a configuration entry in ~/.config/cwclientlibrc
    try:
        from cwclientlib import cwproxy_for
        cnx = cwproxy_for(url)
        cnx.timeout = self.source.http_timeout
        self.source.info('Using cwclientlib for checking %s', url)
        return cnx.get(url).status_code == 404
    except (ImportError, ValueError) as exc:
        # ImportError: not available
        # ValueError: no config entry found
        self.source.debug(str(exc))

    # no chance with cwclientlib, fall back to former implementation
    if urlparse.urlparse(url).scheme in ('http', 'https'):
        try:
            # XXX HTTP HEAD request
            response = _OPENER.open(url, timeout=self.source.http_timeout)
        except urllib2.HTTPError as ex:
            if ex.code == 404:
                return True
        else:
            # only the status matters here; release the connection
            response.close()
    return False
496 |
532 |
497 |
533 |
498 class URLLibResponseAdapter(object): |
534 class URLLibResponseAdapter(object): |
499 """Thin wrapper to be used to fake a value returned by urllib2.urlopen""" |
535 """Thin wrapper to be used to fake a value returned by urllib2.urlopen""" |