cubicweb/server/sources/datafeed.py
author Sylvain Thénault <sylvain.thenault@logilab.fr>
Fri, 03 Nov 2017 16:31:59 +0100
changeset 12237 2dd0dcb2e5f9
parent 12149 649100470733
child 12346 b3f45d96a179
permissions -rw-r--r--
[test] Drop no more used "maxeid" based deletion in BaseQuerierTC and derived tests This is probably only necessary for QuerierTC itself, move it there and drop incantation from other derived classes to deactivate this feature.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
11138
78c8e64f3cef [sources] synchronize source asynchronously when started from the UI
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11129
diff changeset
     1
# copyright 2010-2016 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     2
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     3
#
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     4
# This file is part of CubicWeb.
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     5
#
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     6
# CubicWeb is free software: you can redistribute it and/or modify it under the
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     7
# terms of the GNU Lesser General Public License as published by the Free
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     8
# Software Foundation, either version 2.1 of the License, or (at your option)
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
     9
# any later version.
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    10
#
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    11
# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    12
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    13
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    14
# details.
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    15
#
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    16
# You should have received a copy of the GNU Lesser General Public License along
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    17
# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    18
"""datafeed sources: copy data from an external data stream into the system
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    19
database
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    20
"""
7378
86a1ae289f05 [datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7351
diff changeset
    21
10757
f73a9a884534 [py3k] io.BytesIO
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10662
diff changeset
    22
from io import BytesIO
8187
981f6e487788 [datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8069
diff changeset
    23
from os.path import exists
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    24
from datetime import datetime, timedelta
11138
78c8e64f3cef [sources] synchronize source asynchronously when started from the UI
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11129
diff changeset
    25
from functools import partial
10603
65ad6980976e [py3k] import URL mangling functions using six.moves
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10581
diff changeset
    26
65ad6980976e [py3k] import URL mangling functions using six.moves
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10581
diff changeset
    27
from six.moves.urllib.parse import urlparse
10610
d53b9c157f99 [py3k] import urllib2 from six.moves
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10603
diff changeset
    28
from six.moves.urllib.request import Request, build_opener, HTTPCookieProcessor
d53b9c157f99 [py3k] import urllib2 from six.moves
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10603
diff changeset
    29
from six.moves.urllib.error import HTTPError
10611
f4dec0cca9a1 [py3k] import CookieJar using six.moves
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10610
diff changeset
    30
from six.moves.http_cookiejar import CookieJar
10603
65ad6980976e [py3k] import URL mangling functions using six.moves
Rémi Cardona <remi.cardona@logilab.fr>
parents: 10581
diff changeset
    31
11042
079b32f4cd0d [datafeed] use tz-aware datetime objects
Julien Cristau <julien.cristau@logilab.fr>
parents: 10914
diff changeset
    32
from pytz import utc
7378
86a1ae289f05 [datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7351
diff changeset
    33
from lxml import etree
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    34
10914
fed8bd56f223 [repository] deprecate the extid2eid based multi-sources API
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10805
diff changeset
    35
from logilab.common.deprecation import deprecated
fed8bd56f223 [repository] deprecate the extid2eid based multi-sources API
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 10805
diff changeset
    36
12149
649100470733 [sources] Stop translating validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 12148
diff changeset
    37
from cubicweb import ObjectNotFound, ValidationError, SourceException, _
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    38
from cubicweb.server.sources import AbstractSource
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    39
from cubicweb.appobject import AppObject
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    40
7456
c54038622fc9 [datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7447
diff changeset
    41
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
    42
class DataFeedSource(AbstractSource):
7552
82dde8276a5b [datafeed, entities] url for entities from a datafeed source should be on their origin site. Closes #1769391
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7548
diff changeset
    43
    # Class-level default; `init` overrides it per instance when the
    # 'use-cwuri-as-url' option below is explicitly set.
    use_cwuri_as_url = True

    # Configuration options declared by this source, as (name, definition)
    # pairs. All of them belong to the 'datafeed-source' group.
    options = (
        ('synchronize',
         {'type': 'yn',
          'default': True,
          'help': ('Is the repository responsible to automatically import '
                   'content from this source? '
                   'You should say yes unless you don\'t want this behaviour '
                   'or if you use a multiple repositories setup, in which '
                   'case you should say yes on one repository, no on others.'),
          'group': 'datafeed-source', 'level': 2,
          }),
        # `check_config` rejects values below 60 for this option
        ('synchronization-interval',
         {'type': 'time',
          'default': '5min',
          'help': ('Interval in seconds between synchronization with the '
                   'external source (default to 5 minutes, must be >= 1 min).'),
          'group': 'datafeed-source', 'level': 2,
          }),
        # used by `acquire_synchronization_lock` to ignore stale locks
        ('max-lock-lifetime',
         {'type': 'time',
          'default': '1h',
          'help': ('Maximum time allowed for a synchronization to be run. '
                   'Exceeded that time, the synchronization will be considered '
                   'as having failed and not properly released the lock, hence '
                   'it won\'t be considered'),
          'group': 'datafeed-source', 'level': 2,
          }),
        ('delete-entities',
         {'type': 'yn',
          'default': False,
          'help': ('Should already imported entities not found anymore on the '
                   'external source be deleted? Handling of this parameter '
                   "will depend on source's parser."),
          'group': 'datafeed-source', 'level': 2,
          }),
        ('logs-lifetime',
         {'type': 'time',
          'default': '10d',
          'help': ('Time before logs from datafeed imports are deleted.'),
          'group': 'datafeed-source', 'level': 2,
          }),
        ('http-timeout',
         {'type': 'time',
          'default': '1min',
          'help': ('Timeout of HTTP GET requests, when synchronizing a source.'),
          'group': 'datafeed-source', 'level': 2,
          }),
        ('use-cwuri-as-url',
         {'type': 'yn',
          'default': None,  # explicitly unset
          'help': ('Use cwuri (i.e. external URL) for link to the entity '
                   'instead of its local URL.'),
          'group': 'datafeed-source', 'level': 1,
          }),
    )
8430
5bee87a14bb1 fix ldap removal handling in ldapfeed (closes #2376625 and #2385133)
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents: 8429
diff changeset
   100
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   101
    def check_config(self, source_entity):
        """Check the configuration of `source_entity` and return it.

        Delegates to the parent class' `check_config`, then additionally
        rejects a 'synchronization-interval' value lower than 60.

        :param source_entity: the source entity whose configuration is
          checked; its `eid` is used to report the validation error.
        :return: the typed configuration returned by the parent class.
        :raise ValidationError: on the 'config' key when the
          synchronization interval is too short.
        """
        typed_config = super(DataFeedSource, self).check_config(source_entity)
        # the value is compared to 60, i.e. one minute once expressed in
        # seconds (`init` feeds the same option to `timedelta(seconds=...)`)
        if typed_config['synchronization-interval'] < 60:
            msg = _('synchronization-interval must be greater than 1 minute')
            raise ValidationError(source_entity.eid, {'config': msg})
        return typed_config
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   108
12148
79160d54662e [sources] Simplify source's init method
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 12146
diff changeset
   109
    def init(self, source_entity):
        """Initialize the source from `source_entity`.

        After calling the parent class' `init`, copy the parser identifier
        and latest retrieval date from the entity, then derive the
        synchronization settings from `self.config`.

        :param source_entity: the entity providing `parser` and
          `latest_retrieval`.
        """
        super(DataFeedSource, self).init(source_entity)
        self.parser_id = source_entity.parser
        self.latest_retrieval = source_entity.latest_retrieval
        # NOTE(review): `self.config` is presumably set by the parent's
        # `init` called above -- confirm against AbstractSource.
        typed_config = self.config
        self.synchro_interval = timedelta(seconds=typed_config['synchronization-interval'])
        self.max_lock_lifetime = timedelta(seconds=typed_config['max-lock-lifetime'])
        self.http_timeout = typed_config['http-timeout']
        # if typed_config['use-cwuri-as-url'] is set, we have to update
        # use_cwuri_as_url attribute and public configuration dictionary
        # accordingly; otherwise the class-level default is kept
        if typed_config['use-cwuri-as-url'] is not None:
            self.use_cwuri_as_url = typed_config['use-cwuri-as-url']
            self.public_config['use-cwuri-as-url'] = self.use_cwuri_as_url
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   123
9879
21278eb03bbf [datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents: 9860
diff changeset
   124
    def _get_parser(self, cnx, **kwargs):
        """Return the parser selected for this source.

        The parser is looked up in the repository's 'parsers' registry
        using `self.parser_id`.

        :param cnx: connection handed to the registry's `select`.
        :param kwargs: extra keyword arguments forwarded to `select`, along
          with `source=self`.
        :raise ObjectNotFound: when no parser id is set on the source (a
          warning is logged first).
        """
        if self.parser_id is None:
            self.warning('No parser defined on source %r', self)
            raise ObjectNotFound()
        parsers = self.repo.vreg['parsers']
        return parsers.select(self.parser_id, cnx, source=self, **kwargs)
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   130
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   131
    def fresh(self):
        """Tell whether the source was retrieved recently enough.

        :return: False when the source has no latest retrieval date, else
          True if the current UTC time is still before the latest retrieval
          date plus the synchronization interval.
        """
        if self.latest_retrieval is None:
            # never retrieved
            return False
        next_synchro = self.latest_retrieval + self.synchro_interval
        return datetime.now(tz=utc) < next_synchro
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   135
9746
81b56897a377 [datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 9665
diff changeset
   136
    def update_latest_retrieval(self, cnx):
        """Record the current UTC time as the source's latest retrieval date.

        The in-memory `latest_retrieval` attribute is updated first, then
        the same value is written to the entity whose eid is `self.eid`,
        and the connection is committed.

        :param cnx: connection used to execute the query and commit.
        """
        self.latest_retrieval = datetime.now(tz=utc)
        cnx.execute('SET X latest_retrieval %(date)s WHERE X eid %(x)s',
                    {'x': self.eid, 'date': self.latest_retrieval})
        cnx.commit()
7446
6fba86efdd09 [datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7444
diff changeset
   141
11345
27b98f3cceae [datafeed] attempt to acquire synchronization lock even when force is given
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11255
diff changeset
   142
    def acquire_synchronization_lock(self, cnx):
        """Mark the source as being synchronized, or fail if it already is.

        `in_synchronization` is set to the current UTC time on the entity
        whose eid is `self.eid`; the query restricts on that attribute
        being NULL or older than `max_lock_lifetime`. The connection is
        committed whether or not the query returned something.

        :param cnx: connection used to execute the query and commit.
        :raise SourceException: when the query result is empty, which is
          reported as a concurrent synchronization already running.
        """
        # XXX race condition until WHERE of SET queries is executed using
        # 'SELECT FOR UPDATE'
        now = datetime.now(tz=utc)
        maxdt = now - self.max_lock_lifetime
        rset = cnx.execute(
            'SET X in_synchronization %(now)s WHERE X eid %(x)s, '
            'X in_synchronization NULL OR X in_synchronization < %(maxdt)s',
            {'x': self.eid, 'now': now, 'maxdt': maxdt})
        # commit on both the success and the failure path
        cnx.commit()
        if not rset:
            raise SourceException("a concurrent synchronization is already running")
7456
c54038622fc9 [datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7447
diff changeset
   154
9746
81b56897a377 [datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 9665
diff changeset
   155
    def release_synchronization_lock(self, cnx):
        """Reset the `in_synchronization` attribute of this source to NULL.

        The RQL update targets the entity whose eid is `self.eid`; it is
        executed through `cnx`, which is committed right after.
        """
        rql = 'SET X in_synchronization NULL WHERE X eid %(x)s'
        args = {'x': self.eid}
        cnx.execute(rql, args)
        cnx.commit()
7456
c54038622fc9 [datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7447
diff changeset
   159
11138
78c8e64f3cef [sources] synchronize source asynchronously when started from the UI
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11129
diff changeset
   160
    def pull_data(self, cnx, force=False, raise_on_error=False, async_=False, **kwargs):
        """Launch synchronization of the source if needed.

        If `async_` is true, the method immediately returns a dictionary
        containing the import log's eid, and the actual synchronization is done
        asynchronously. If `async_` is false, return some import statistics
        (e.g. number of created and updated entities).

        An empty dictionary is returned when nothing is done: either the source
        is fresh and `force` is false, or the synchronization lock could not be
        acquired and `force` is false.

        The flag used to be spelled `async`, which is a reserved keyword since
        Python 3.7 and can no longer be used as a parameter name. Callers still
        passing it by keyword (``**{'async': True}``, or ``async=True`` on
        older interpreters) are supported through `kwargs`; any other unknown
        keyword raises `TypeError`.

        Raise `SourceException` if the lock can't be acquired and `force` is
        true.

        This method is responsible to handle commit/rollback on the given connection.
        """
        # backward compatibility with the former `async` keyword
        if 'async' in kwargs:
            async_ = kwargs.pop('async')
        if kwargs:
            raise TypeError('pull_data() got unexpected keyword argument(s): %s'
                            % ', '.join(sorted(kwargs)))
        if not force and self.fresh():
            return {}
        try:
            self.acquire_synchronization_lock(cnx)
        except SourceException as exc:
            if force:
                raise
            self.error(str(exc))
            return {}
        try:
            if async_:
                return self._async_pull_data(cnx, force, raise_on_error)
            return self._pull_data(cnx, force, raise_on_error)
        finally:
            # NOTE(review): in the asynchronous case this clause runs as soon as
            # `_async_pull_data` returns, so the lock is presumably released
            # before the background synchronization completes -- confirm this
            # is intended.
            cnx.rollback()  # rollback first in case there is some dirty transaction remaining
            self.release_synchronization_lock(cnx)
7456
c54038622fc9 [datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7447
diff changeset
   186
11138
78c8e64f3cef [sources] synchronize source asynchronously when started from the UI
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11129
diff changeset
   187
    def _async_pull_data(self, cnx, force, raise_on_error):
        """Create the import log, then delegate synchronization to a repository task.

        A `CWDataImport` entity linked to this source is created and committed
        on `cnx`. The task handed to `cnx.repo.threaded_task` opens its own
        internal connection, looks the source up by eid and calls its
        `_pull_data` with the import log's eid.

        Return a dictionary holding the import log's eid under the
        'import_log_eid' key.
        """
        log_entity = cnx.create_entity('CWDataImport', cw_import_of=self)
        # the import log creation must be committed before the task is started
        cnx.commit()
        repo = cnx.repo

        def _synchronize_source(task_repo, source_eid, log_eid):
            with task_repo.internal_cnx() as task_cnx:
                task_source = task_repo.source_by_eid(source_eid)
                task_source._pull_data(task_cnx, force, raise_on_error,
                                       import_log_eid=log_eid)

        repo.threaded_task(partial(_synchronize_source, repo, self.eid, log_entity.eid))
        return {'import_log_eid': log_entity.eid}
78c8e64f3cef [sources] synchronize source asynchronously when started from the UI
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11129
diff changeset
   199
78c8e64f3cef [sources] synchronize source asynchronously when started from the UI
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11129
diff changeset
   200
    def _pull_data(self, cnx, force=False, raise_on_error=False, import_log_eid=None):
        """Run the synchronization on `cnx` and return the resulting statistics.

        The import log is set up through `init_import_log`, then the parser is
        fetched and asked to process `self.urls`. When the parser can't be
        found (`ObjectNotFound`), the error is recorded in the import log,
        logged, and empty statistics are used. In both cases the latest
        retrieval date is updated, the import log is written and `cnx` is
        committed.

        `force` is accepted but not used by this method.
        """
        import_log = self.init_import_log(cnx, import_log_eid)
        try:
            parser = self._get_parser(cnx, import_log=import_log)
        except ObjectNotFound:
            template = 'failed to load parser for %s'
            import_log.record_error(template % ('source "%s"' % self.uri))
            self.error(template, self)
            stats = {}
        else:
            had_error = parser.process_urls(self.urls, raise_on_error)
            if had_error:
                self.warning("some error occurred, don't attempt to delete entities")
            stats = parser.stats
        self.update_latest_retrieval(cnx)
        # report how many entities were created / updated, if any
        for key, message in (('created', 'added %s entities'),
                             ('updated', 'updated %s entities')):
            if stats.get(key):
                import_log.record_info(message % len(stats[key]))
        import_log.write_log(cnx, end_timestamp=self.latest_retrieval)
        cnx.commit()
        return stats
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   221
11138
78c8e64f3cef [sources] synchronize source asynchronously when started from the UI
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11129
diff changeset
   222
    def init_import_log(self, cnx, import_log_eid=None, **kwargs):
        """Return an initialized import log entity for this synchronization.

        When `import_log_eid` is given, the matching entity is fetched and
        updated; otherwise a new `CWDataImport` entity linked to this source is
        created. In both cases `start_timestamp` is set to the current UTC time
        and `kwargs` are passed along as extra attributes. `cnx` is committed
        before the entity's `init()` method is called.
        """
        if import_log_eid is not None:
            entity = cnx.entity_from_eid(import_log_eid)
            entity.cw_set(start_timestamp=datetime.now(tz=utc), **kwargs)
        else:
            entity = cnx.create_entity(
                'CWDataImport', cw_import_of=self,
                start_timestamp=datetime.now(tz=utc), **kwargs)
        # commit so that the changes are visible
        cnx.commit()
        entity.init()
        return entity
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   233
8187
981f6e487788 [datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8069
diff changeset
   234
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   235
class DataFeedParser(AppObject):
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   236
    __registry__ = 'parsers'
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   237
11758
3f81636a75db [datafeed] Drop entity deletion handling in the default source / parser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11757
diff changeset
   238
    def __init__(self, cnx, source, import_log=None):
        """Initialize the parser for `source`, working on `cnx`.

        `import_log`, when given, is kept on the instance. `stats` starts with
        an empty set for each of the 'created', 'updated' and 'checked' keys.
        """
        super(DataFeedParser, self).__init__(cnx)
        self.source, self.import_log = source, import_log
        self.stats = {key: set() for key in ('created', 'updated', 'checked')}
6957
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   243
8187
981f6e487788 [datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8069
diff changeset
   244
    def normalize_url(self, url):
        """Return `url` with its prefix substituted according to
        `cubicweb.sobjects.URL_MAPPING`.

        The first key of the mapping that `url` starts with is replaced (once)
        by the associated value; `url` is returned unchanged when no key
        matches.

        This mapping allows redirecting from one host to another, which may be
        useful for example for a test instance using production data, when you
        want neither to load the external source nor to hack your `/etc/hosts`
        file.
        """
        # local import mandatory, it's available after registration
        from cubicweb.sobjects import URL_MAPPING
        candidates = (prefix for prefix in URL_MAPPING if url.startswith(prefix))
        prefix = next(candidates, None)
        if prefix is None:
            return url
        return url.replace(prefix, URL_MAPPING[prefix], 1)
981f6e487788 [datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8069
diff changeset
   259
10516
4c59409220b6 [datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents: 10143
diff changeset
   260
    def retrieve_url(self, url):
        """Return a stream for the content designated by `url`:

        * file:// URLs are opened from the local file system, without
          normalization
        * other URLs are normalized first (see :meth:`normalize_url`)
        * anything that is neither handled by cwclientlib nor an http(s) URL is
          considered as plain content, useful for testing purpose

        Before falling back to a plain GET, cwclientlib is tried (if available)
        and used as requester when it provides a proxy for the URL.
        """
        parsed = urlparse(url)
        if parsed.scheme == 'file':
            # skip the leading "file://" to get the path
            return URLLibResponseAdapter(open(url[7:]), url)

        url = self.normalize_url(url)
        source = self.source

        # first, try to use cwclientlib if it's available and if the
        # url matches a configuration entry in ~/.config/cwclientlibrc
        try:
            from cwclientlib import cwproxy_for
            proxy = cwproxy_for(url)
            proxy.timeout = source.http_timeout
            source.info('Using cwclientlib for %s' % url)
            response = proxy.get(url)
            response.raise_for_status()
            return URLLibResponseAdapter(BytesIO(response.content), url)
        except (ImportError, ValueError, EnvironmentError) as exc:
            # ImportError: not available
            # ValueError: no config entry found
            # EnvironmentError: no cwclientlib config file found
            # NOTE(review): the request itself is also inside this try block, so
            # an error of one of these types raised while fetching ends up here
            # as well -- confirm this is intended.
            source.debug(str(exc))

        # no chance with cwclientlib, fall back to former implementation.
        # NOTE: the scheme tested here is the one of the url as given, parsed
        # before normalization.
        if parsed.scheme not in ('http', 'https'):
            # url is probably plain content
            return URLLibResponseAdapter(BytesIO(url.encode('ascii')), url)

        source.info('GET %s', url)
        request = Request(url)
        return _OPENER.open(request, timeout=source.http_timeout)
9823
258d2f9f7d39 [datafeed parser] factor out retrieve_url method from DataFeedXMLParser.parse
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 9822
diff changeset
   300
11251
b66a8c3eebeb [datafeed] move process_urls to the parser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11151
diff changeset
   301
    def process_urls(self, urls, raise_on_error=False):
        """Run `process` on each url and tell whether something went wrong.

        Every url is logged, then handed to `self.process` together with
        `raise_on_error`. The run is flagged as failed as soon as `process`
        returns a true value or raises.

        When `raise_on_error` is true, exceptions are propagated untouched.
        Otherwise they are recorded in the import log (I/O errors get a
        dedicated "could not pull data" message, other errors are also
        logged with their traceback) and the next url is processed.

        Return True if at least one url failed, False otherwise.
        """
        failed = False
        for url in urls:
            self.info('pulling data from %s', url)
            try:
                if self.process(url, raise_on_error):
                    failed = True
            except Exception as exc:
                if raise_on_error:
                    raise
                if isinstance(exc, IOError):
                    # the data could not even be fetched
                    message = ('could not pull data while processing %s: %s'
                               % (url, exc))
                    self.import_log.record_error(message)
                else:
                    # unexpected failure: keep the traceback in the logs
                    self.import_log.record_error(str(exc))
                    self.exception('error while processing %s: %s',
                                   url, exc)
                failed = True
        return failed
b66a8c3eebeb [datafeed] move process_urls to the parser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 11151
diff changeset
   323
8409
79534887943e [datafeed] fix/finish cleanup started by auc in 8393:c25b96ae4f8a: parser.process prototytpe is (url, raise_on_error=False). Drop partialcommit argument which were never specified
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8408
diff changeset
   324
    def process(self, url, raise_on_error=False):
        """Main callback: process the given url.

        Abstract hook which must be overridden by concrete parsers.
        `process_urls` treats a true return value as "an error occurred
        while processing this url".
        """
        raise NotImplementedError()
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   327
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   328
    def created_during_pull(self, entity):
        """Return True if the entity's eid is among `self.stats['created']`."""
        eid = entity.eid
        return eid in self.stats['created']
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   330
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   331
    def updated_during_pull(self, entity):
        """Return True if the entity's eid is among `self.stats['updated']`."""
        eid = entity.eid
        return eid in self.stats['updated']
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   333
ffda12be2e9f [repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff changeset
   334
    def notify_updated(self, entity):
        """Add the entity's eid to `self.stats['updated']`.

        Return whatever the container's `add` method returns.
        """
        updated_eids = self.stats['updated']
        return updated_eids.add(entity.eid)
7378
86a1ae289f05 [datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7351
diff changeset
   336
8435
5064b6e0d6f4 [datafeed] correctly distinguish checked/updated
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents: 8434
diff changeset
   337
    def notify_checked(self, entity):
        """Add the entity's eid to `self.stats['checked']`.

        Return whatever the container's `add` method returns.
        """
        checked_eids = self.stats['checked']
        return checked_eids.add(entity.eid)
5064b6e0d6f4 [datafeed] correctly distinguish checked/updated
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents: 8434
diff changeset
   339
8187
981f6e487788 [datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8069
diff changeset
   340
    def is_deleted(self, extid, etype, eid):
        """Tell whether the entity with the given external id, entity type
        and eid has actually been deleted.

        This default implementation unconditionally answers True;
        sub-classes are expected to provide a more sensible check.
        """
        deleted = True
        return deleted
7378
86a1ae289f05 [datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7351
diff changeset
   346
8188
1867e252e487 [repository] ldap-feed source. Closes #2086984
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8187
diff changeset
   347
    def update_if_necessary(self, entity, attrs):
        """Update `entity` from `attrs`, touching only what differs.

        The entity is first completed with the attributes named in `attrs`
        and notified as checked. Values are then compared, unless `attrs`
        carries a 'modification_date' which is not greater than the
        entity's one, in which case nothing more is done.

        Attributes whose value differs from the entity's current value are
        written through `entity.cw_set`, and the entity is notified as
        updated. Nothing is written when no value differs.
        """
        entity.complete(tuple(attrs))
        self.notify_checked(entity)
        feed_mdate = attrs.get('modification_date')
        if feed_mdate and not feed_mdate > entity.modification_date:
            # the feed does not claim anything newer than what we have
            return
        # only keep values which actually changed
        changed = {name: value for name, value in attrs.items()
                   if value != getattr(entity, name)}
        if changed:
            entity.cw_set(**changed)
            self.notify_updated(entity)
7378
86a1ae289f05 [datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7351
diff changeset
   359
8547
f23ac525ddd1 [datafeed] properly call hooks for inlined relations on entity creation. Closes #2481156
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8529
diff changeset
   360
7378
86a1ae289f05 [datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7351
diff changeset
   361
class DataFeedXMLParser(DataFeedParser):
    """Data feed parser for XML content.

    `process` retrieves and parses the document found at a url, then calls
    `process_item` once per argument tuple returned by `parse_etree`.
    Sub-classes must implement `process_item` and may override
    `parse_etree` to split the document into several items.
    """

    @deprecated()
    def process(self, url, raise_on_error=False):
        """IDataFeedParser main entry point.

        Parse the content behind `url`, then call `process_item` for each
        parsed argument tuple, forwarding `raise_on_error` as a keyword
        argument.

        Return True if parsing failed (the error is then recorded in the
        import log), False otherwise. When `raise_on_error` is true, a
        parsing error is re-raised instead of being recorded.
        """
        try:
            parsed = self.parse(url)
        except Exception as ex:
            if raise_on_error:
                raise
            self.import_log.record_error(str(ex))
            return True
        for args in parsed:
            self.process_item(*args, raise_on_error=raise_on_error)
        return False

    def parse(self, url):
        """Retrieve `url` and return the items extracted from its XML
        content by `parse_etree`.
        """
        stream = self.retrieve_url(url)
        return self.parse_etree(etree.parse(stream).getroot())

    def parse_etree(self, document):
        """Return the list of argument tuples to give to `process_item`.

        By default the whole document root is a single item.
        """
        return [(document,)]

    def process_item(self, *args, **kwargs):
        """Process one item returned by `parse_etree`; to be implemented by
        sub-classes.
        """
        raise NotImplementedError

    def is_deleted(self, extid, etype, eid):
        """Return True if the entity of given external id seems deleted.

        * for a `file://` extid, the entity is deleted when the file does
          not exist anymore;

        * otherwise the normalized url is requested, through cwclientlib
          when it is usable for this url, else with the module's opener for
          http(s) urls: a 404 answer means the entity is deleted;

        * in any other case, return False.
        """
        if extid.startswith('file://'):
            # the entity is gone once its backing file has disappeared
            # (the former code returned True while the file still existed)
            return not exists(extid[7:])

        url = self.normalize_url(extid)
        # first, try to use cwclientlib if it's available and if the
        # url matches a configuration entry in ~/.config/cwclientlibrc
        try:
            from cwclientlib import cwproxy_for
            # give it the normalized url
            cnx = cwproxy_for(url)
            cnx.timeout = self.source.http_timeout
            self.source.info('Using cwclientlib for checking %s', url)
            return cnx.get(url).status_code == 404
        except (ImportError, ValueError, EnvironmentError) as exc:
            # ImportError: not available
            # ValueError: no config entry found
            # EnvironmentError: no cwclientlib config file found
            self.source.debug(str(exc))

        # no chance with cwclientlib, fall back to former implementation
        if urlparse(url).scheme in ('http', 'https'):
            try:
                response = _OPENER.open(url, timeout=self.source.http_timeout)
            except HTTPError as ex:
                if ex.code == 404:
                    return True
            else:
                # only the status matters here, don't leak the connection
                response.close()
        return False
981f6e487788 [datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 8069
diff changeset
   415
9824
30183ecf5c61 [datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 9823
diff changeset
   416
30183ecf5c61 [datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 9823
diff changeset
   417
class URLLibResponseAdapter(object):
    """Minimal stand-in for the object returned by urllib2.urlopen.

    It wraps an already available stream and exposes the `read`, `geturl`
    and `getcode` methods as well as the `code` attribute.
    """

    def __init__(self, stream, url, code=200):
        # status code reported by `getcode`, also readable as an attribute
        self.code = code
        self._url = url
        self._stream = stream

    def read(self, *args):
        """Read from the wrapped stream, forwarding positional arguments."""
        stream = self._stream
        return stream.read(*args)

    def geturl(self):
        """Return the url given at construction time."""
        return self._url

    def getcode(self):
        """Return the status code given at construction time."""
        return self.code
30183ecf5c61 [datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 9823
diff changeset
   432
30183ecf5c61 [datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 9823
diff changeset
   433
7378
86a1ae289f05 [datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents: 7351
diff changeset
   434
# Module-wide opener used by the parsers above for their HTTP(S) requests.
# It is cookie enabled, so that a session cookie is used if any.
_OPENER = build_opener()
try:
    from logilab.common import urllib2ext
    _OPENER.add_handler(urllib2ext.HTTPGssapiAuthHandler())
except ImportError: # python-kerberos not available
    # GSSAPI authentication is optional: go on without it
    pass
_OPENER.add_handler(HTTPCookieProcessor(CookieJar()))