author | Florent Cayré <florent.cayre@gmail.com> |
Fri, 16 Sep 2011 10:36:46 +0200 | |
changeset 7798 | 8930f7a284dd |
parent 7731 | 48e78934a4e2 |
child 7910 | e5d5609e3bf1 |
child 7921 | a93e2ed5877a |
permissions | -rw-r--r-- |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
1 |
# copyright 2010-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
2 |
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
3 |
# |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
4 |
# This file is part of CubicWeb. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
5 |
# |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
6 |
# CubicWeb is free software: you can redistribute it and/or modify it under the |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
7 |
# terms of the GNU Lesser General Public License as published by the Free |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
8 |
# Software Foundation, either version 2.1 of the License, or (at your option) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
9 |
# any later version. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
10 |
# |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
11 |
# CubicWeb is distributed in the hope that it will be useful, but WITHOUT |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
12 |
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
13 |
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
14 |
# details. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
15 |
# |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
16 |
# You should have received a copy of the GNU Lesser General Public License along |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
17 |
# with CubicWeb. If not, see <http://www.gnu.org/licenses/>. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
18 |
"""datafeed sources: copy data from an external data stream into the system |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
19 |
database |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
20 |
""" |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
21 |
from __future__ import with_statement |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
22 |
|
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
23 |
import urllib2 |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
24 |
import StringIO |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
25 |
from datetime import datetime, timedelta |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
26 |
from base64 import b64decode |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
27 |
from cookielib import CookieJar |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
28 |
|
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
29 |
from lxml import etree |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
30 |
|
7399
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
31 |
from cubicweb import RegistryNotFound, ObjectNotFound, ValidationError, UnknownEid |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
32 |
from cubicweb.server.sources import AbstractSource |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
33 |
from cubicweb.appobject import AppObject |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
34 |
|
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
35 |
|
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
36 |
class DataFeedSource(AbstractSource): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
37 |
copy_based_source = True |
7552
82dde8276a5b
[datafeed, entities] url for entities from a datafeed source should be on their origin site. Closes #1769391
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7548
diff
changeset
|
38 |
use_cwuri_as_url = True |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
39 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
40 |
options = ( |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
41 |
('synchronize', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
42 |
{'type' : 'yn', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
43 |
'default': True, |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
44 |
'help': ('Is the repository responsible to automatically import ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
45 |
'content from this source? ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
46 |
'You should say yes unless you don\'t want this behaviour ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
47 |
'or if you use a multiple repositories setup, in which ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
48 |
'case you should say yes on one repository, no on others.'), |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
49 |
'group': 'datafeed-source', 'level': 2, |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
50 |
}), |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
51 |
('synchronization-interval', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
52 |
{'type' : 'time', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
53 |
'default': '5min', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
54 |
'help': ('Interval in seconds between synchronization with the ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
55 |
'external source (default to 5 minutes, must be >= 1 min).'), |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
56 |
'group': 'datafeed-source', 'level': 2, |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
57 |
}), |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
58 |
('delete-entities', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
59 |
{'type' : 'yn', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
60 |
'default': True, |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
61 |
'help': ('Should already imported entities not found anymore on the ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
62 |
'external source be deleted?'), |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
63 |
'group': 'datafeed-source', 'level': 2, |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
64 |
}), |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
65 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
66 |
) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
67 |
def __init__(self, repo, source_config, eid=None): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
68 |
AbstractSource.__init__(self, repo, source_config, eid) |
7641
790038f88b8b
[datafeed] don't raise bad config error on source initialization
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7443
diff
changeset
|
69 |
self.update_config(None, self.check_conf_dict(eid, source_config, |
790038f88b8b
[datafeed] don't raise bad config error on source initialization
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7443
diff
changeset
|
70 |
fail_if_unknown=False)) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
71 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
72 |
def check_config(self, source_entity): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
73 |
"""check configuration of source entity""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
74 |
typedconfig = super(DataFeedSource, self).check_config(source_entity) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
75 |
if typedconfig['synchronization-interval'] < 60: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
76 |
_ = source_entity._cw._ |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
77 |
msg = _('synchronization-interval must be greater than 1 minute') |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
78 |
raise ValidationError(source_entity.eid, {'config': msg}) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
79 |
return typedconfig |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
80 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
81 |
def _entity_update(self, source_entity): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
82 |
source_entity.complete() |
7527
ef1e9bc38137
[datafeed] renaming parser attribute to parser_id makes things clearer
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7461
diff
changeset
|
83 |
self.parser_id = source_entity.parser |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
84 |
self.latest_retrieval = source_entity.latest_retrieval |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
85 |
self.urls = [url.strip() for url in source_entity.url.splitlines() |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
86 |
if url.strip()] |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
87 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
88 |
def update_config(self, source_entity, typedconfig): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
89 |
"""update configuration from source entity. `typedconfig` is config |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
90 |
properly typed with defaults set |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
91 |
""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
92 |
self.synchro_interval = timedelta(seconds=typedconfig['synchronization-interval']) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
93 |
if source_entity is not None: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
94 |
self._entity_update(source_entity) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
95 |
self.config = typedconfig |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
96 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
97 |
def init(self, activated, source_entity): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
98 |
if activated: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
99 |
self._entity_update(source_entity) |
7527
ef1e9bc38137
[datafeed] renaming parser attribute to parser_id makes things clearer
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7461
diff
changeset
|
100 |
self.parser_id = source_entity.parser |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
101 |
self.load_mapping(source_entity._cw) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
102 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
103 |
def _get_parser(self, session, **kwargs): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
104 |
return self.repo.vreg['parsers'].select( |
7527
ef1e9bc38137
[datafeed] renaming parser attribute to parser_id makes things clearer
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7461
diff
changeset
|
105 |
self.parser_id, session, source=self, **kwargs) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
106 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
107 |
def load_mapping(self, session): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
108 |
self.mapping = {} |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
109 |
self.mapping_idx = {} |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
110 |
try: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
111 |
parser = self._get_parser(session) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
112 |
except (RegistryNotFound, ObjectNotFound): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
113 |
return # no parser yet, don't go further |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
114 |
self._load_mapping(session, parser=parser) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
115 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
116 |
def add_schema_config(self, schemacfg, checkonly=False, parser=None): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
117 |
"""added CWSourceSchemaConfig, modify mapping accordingly""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
118 |
if parser is None: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
119 |
parser = self._get_parser(schemacfg._cw) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
120 |
parser.add_schema_config(schemacfg, checkonly) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
121 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
122 |
def del_schema_config(self, schemacfg, checkonly=False, parser=None): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
123 |
"""deleted CWSourceSchemaConfig, modify mapping accordingly""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
124 |
if parser is None: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
125 |
parser = self._get_parser(schemacfg._cw) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
126 |
parser.del_schema_config(schemacfg, checkonly) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
127 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
128 |
def fresh(self): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
129 |
if self.latest_retrieval is None: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
130 |
return False |
7443
c10164464afc
[datafeed] we should use utc timestamp to avoid pb with local times
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
131 |
return datetime.utcnow() < (self.latest_retrieval + self.synchro_interval) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
132 |
|
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
133 |
def update_latest_retrieval(self, session): |
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
134 |
self.latest_retrieval = datetime.utcnow() |
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
135 |
session.execute('SET X latest_retrieval %(date)s WHERE X eid %(x)s', |
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
136 |
{'x': self.eid, 'date': self.latest_retrieval}) |
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
137 |
|
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
138 |
def acquire_synchronization_lock(self, session): |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
139 |
# XXX race condition until WHERE of SET queries is executed using |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
140 |
# 'SELECT FOR UPDATE' |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
141 |
if not session.execute('SET X synchronizing TRUE WHERE X eid %(x)s, X synchronizing FALSE', |
7461
cb25b62074cc
[datafeed]Â fix stupid error introduced by 'just before qfinish' refactoring...
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7456
diff
changeset
|
142 |
{'x': self.eid}): |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
143 |
self.error('concurrent synchronization detected, skip pull') |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
144 |
session.commit(free_cnxset=False) |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
145 |
return False |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
146 |
session.commit(free_cnxset=False) |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
147 |
return True |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
148 |
|
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
149 |
def release_synchronization_lock(self, session): |
7708
45be3a9debe6
[datafeed] for datafeed source, we don't want commit in extid2eid but explicitly handled by the source. Also, we should use 'safe' internal session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7699
diff
changeset
|
150 |
session.set_cnxset() |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
151 |
session.execute('SET X synchronizing FALSE WHERE X eid %(x)s', |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
152 |
{'x': self.eid}) |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
153 |
session.commit() |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
154 |
|
7351
ed66f236715d
fix _set_relation when no target eids, update datafeed source pull_data arguments to raise on error during tests
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7001
diff
changeset
|
155 |
def pull_data(self, session, force=False, raise_on_error=False): |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
156 |
"""Launch synchronization of the source if needed. |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
157 |
|
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
158 |
This method is responsible to handle commit/rollback on the given |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
159 |
session. |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
160 |
""" |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
161 |
if not force and self.fresh(): |
6972
12aa5cd81ce5
[datafeed] return empty dict when source is fresh avoid crash in the looping task because None returned
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
6957
diff
changeset
|
162 |
return {} |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
163 |
if not self.acquire_synchronization_lock(session): |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
164 |
return {} |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
165 |
try: |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
166 |
with session.transaction(free_cnxset=False): |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
167 |
return self._pull_data(session, force, raise_on_error) |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
168 |
finally: |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
169 |
self.release_synchronization_lock(session) |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
170 |
|
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
171 |
def _pull_data(self, session, force=False, raise_on_error=False): |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
172 |
if self.config['delete-entities']: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
173 |
myuris = self.source_cwuris(session) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
174 |
else: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
175 |
myuris = None |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
176 |
parser = self._get_parser(session, sourceuris=myuris) |
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
177 |
if self.process_urls(parser, self.urls, raise_on_error): |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
178 |
self.warning("some error occured, don't attempt to delete entities") |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
179 |
elif self.config['delete-entities'] and myuris: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
180 |
byetype = {} |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
181 |
for eid, etype in myuris.values(): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
182 |
byetype.setdefault(etype, []).append(str(eid)) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
183 |
self.error('delete %s entities %s', self.uri, byetype) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
184 |
for etype, eids in byetype.iteritems(): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
185 |
session.execute('DELETE %s X WHERE X eid IN (%s)' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
186 |
% (etype, ','.join(eids))) |
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
187 |
self.update_latest_retrieval(session) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
188 |
return parser.stats |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
189 |
|
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
190 |
def process_urls(self, parser, urls, raise_on_error=False): |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
191 |
error = False |
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
192 |
for url in urls: |
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
193 |
self.info('pulling data from %s', url) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
194 |
try: |
7385
29f050e39b09
[datafeed] propagate raise_on_error to parser's process method
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7379
diff
changeset
|
195 |
if parser.process(url, raise_on_error): |
7001
c53aa19640b2
[sobjects/parsers] on validationerror, skip entity and continue processing feed
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
6972
diff
changeset
|
196 |
error = True |
7590
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
197 |
except IOError, exc: |
7351
ed66f236715d
fix _set_relation when no target eids, update datafeed source pull_data arguments to raise on error during tests
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7001
diff
changeset
|
198 |
if raise_on_error: |
ed66f236715d
fix _set_relation when no target eids, update datafeed source pull_data arguments to raise on error during tests
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7001
diff
changeset
|
199 |
raise |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
200 |
self.error('could not pull data while processing %s: %s', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
201 |
url, exc) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
202 |
error = True |
7590
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
203 |
except Exception, exc: |
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
204 |
if raise_on_error: |
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
205 |
raise |
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
206 |
self.exception('error while processing %s: %s', |
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
207 |
url, exc) |
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
208 |
error = True |
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
209 |
return error |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
210 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
211 |
def before_entity_insertion(self, session, lid, etype, eid, sourceparams): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
212 |
"""called by the repository when an eid has been attributed for an |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
213 |
entity stored here but the entity has not been inserted in the system |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
214 |
table yet. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
215 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
216 |
This method must return the an Entity instance representation of this |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
217 |
entity. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
218 |
""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
219 |
entity = super(DataFeedSource, self).before_entity_insertion( |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
220 |
session, lid, etype, eid, sourceparams) |
7731
48e78934a4e2
[datafeed] properly encode/decode external uri
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7727
diff
changeset
|
221 |
entity.cw_edited['cwuri'] = lid.decode('utf-8') |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
222 |
entity.cw_edited.set_defaults() |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
223 |
sourceparams['parser'].before_entity_copy(entity, sourceparams) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
224 |
return entity |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
225 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
226 |
def after_entity_insertion(self, session, lid, entity, sourceparams): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
227 |
"""called by the repository after an entity stored here has been |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
228 |
inserted in the system table. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
229 |
""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
230 |
if session.is_hook_category_activated('integrity'): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
231 |
entity.cw_edited.check(creation=True) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
232 |
self.repo.system_source.add_entity(session, entity) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
233 |
entity.cw_edited.saved = entity._cw_is_saved = True |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
234 |
sourceparams['parser'].after_entity_copy(entity, sourceparams) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
235 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
236 |
def source_cwuris(self, session): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
237 |
sql = ('SELECT extid, eid, type FROM entities, cw_source_relation ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
238 |
'WHERE entities.eid=cw_source_relation.eid_from ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
239 |
'AND cw_source_relation.eid_to=%s' % self.eid) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
240 |
return dict((b64decode(uri), (eid, type)) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
241 |
for uri, eid, type in session.system_sql(sql)) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
242 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
243 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
244 |
class DataFeedParser(AppObject): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
245 |
__registry__ = 'parsers' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
246 |
|
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
247 |
def __init__(self, session, source, sourceuris=None, **kwargs): |
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
248 |
super(DataFeedParser, self).__init__(session, **kwargs) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
249 |
self.source = source |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
250 |
self.sourceuris = sourceuris |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
251 |
self.stats = {'created': set(), |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
252 |
'updated': set()} |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
253 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
254 |
def add_schema_config(self, schemacfg, checkonly=False): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
255 |
"""added CWSourceSchemaConfig, modify mapping accordingly""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
256 |
msg = schemacfg._cw._("this parser doesn't use a mapping") |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
257 |
raise ValidationError(schemacfg.eid, {None: msg}) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
258 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
259 |
def del_schema_config(self, schemacfg, checkonly=False): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
260 |
"""deleted CWSourceSchemaConfig, modify mapping accordingly""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
261 |
msg = schemacfg._cw._("this parser doesn't use a mapping") |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
262 |
raise ValidationError(schemacfg.eid, {None: msg}) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
263 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
264 |
def extid2entity(self, uri, etype, **sourceparams): |
7399
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
265 |
"""return an entity for the given uri. May return None if it should be |
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
266 |
skipped |
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
267 |
""" |
7697
ef50074a0314
[repo api] deprecates source.extid2eid, only the system source should implement it,
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7642
diff
changeset
|
268 |
session = self._cw |
7534
d58a9d96aad8
[datafeed, cw.xml] xml now carry entity's source information, interpreted at the other end so that for instance when an entity from elo is seen when importing cwo, it's properly marked as coming from elo source if one exists
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7533
diff
changeset
|
269 |
# if cwsource is specified and repository has a source with the same |
d58a9d96aad8
[datafeed, cw.xml] xml now carry entity's source information, interpreted at the other end so that for instance when an entity from elo is seen when importing cwo, it's properly marked as coming from elo source if one exists
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7533
diff
changeset
|
270 |
# name, call extid2eid on that source so entity will be properly seen as |
d58a9d96aad8
[datafeed, cw.xml] xml now carry entity's source information, interpreted at the other end so that for instance when an entity from elo is seen when importing cwo, it's properly marked as coming from elo source if one exists
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7533
diff
changeset
|
271 |
# coming from this source |
7699
d07cde311630
[datafeed] properly take care to cwsource=system in imported xml. Closes #1877017
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7698
diff
changeset
|
272 |
source_uri = sourceparams.pop('cwsource', None) |
d07cde311630
[datafeed] properly take care to cwsource=system in imported xml. Closes #1877017
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7698
diff
changeset
|
273 |
if source_uri is not None and source_uri != 'system': |
d07cde311630
[datafeed] properly take care to cwsource=system in imported xml. Closes #1877017
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7698
diff
changeset
|
274 |
source = session.repo.sources_by_uri.get(source_uri, self.source) |
d07cde311630
[datafeed] properly take care to cwsource=system in imported xml. Closes #1877017
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7698
diff
changeset
|
275 |
else: |
d07cde311630
[datafeed] properly take care to cwsource=system in imported xml. Closes #1877017
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7698
diff
changeset
|
276 |
source = self.source |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
277 |
sourceparams['parser'] = self |
7731
48e78934a4e2
[datafeed] properly encode/decode external uri
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7727
diff
changeset
|
278 |
if isinstance(uri, unicode): |
48e78934a4e2
[datafeed] properly encode/decode external uri
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7727
diff
changeset
|
279 |
uri = uri.encode('utf-8') |
7590
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
280 |
try: |
7697
ef50074a0314
[repo api] deprecates source.extid2eid, only the system source should implement it,
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7642
diff
changeset
|
281 |
eid = session.repo.extid2eid(source, str(uri), etype, session, |
7708
45be3a9debe6
[datafeed] for datafeed source, we don't want commit in extid2eid but explicitly handled by the source. Also, we should use 'safe' internal session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7699
diff
changeset
|
282 |
complete=False, commit=False, |
7698
1c7411535c2d
[datafeed / fti] rather control a 'complete' parameter than setting empty attribute values
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7697
diff
changeset
|
283 |
sourceparams=sourceparams) |
7590
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
284 |
except ValidationError, ex: |
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
285 |
self.source.error('error while creating %s: %s', etype, ex) |
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
286 |
return None |
7399
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
287 |
if eid < 0: |
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
288 |
# entity has been moved away from its original source |
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
289 |
# |
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
290 |
# Don't give etype to entity_from_eid so we get UnknownEid if the |
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
291 |
# entity has been removed |
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
292 |
try: |
7697
ef50074a0314
[repo api] deprecates source.extid2eid, only the system source should implement it,
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7642
diff
changeset
|
293 |
entity = session.entity_from_eid(-eid) |
7399
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
294 |
except UnknownEid: |
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
295 |
return None |
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
296 |
self.notify_updated(entity) # avoid later update from the source's data |
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
297 |
return entity |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
298 |
if self.sourceuris is not None: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
299 |
self.sourceuris.pop(str(uri), None) |
7697
ef50074a0314
[repo api] deprecates source.extid2eid, only the system source should implement it,
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7642
diff
changeset
|
300 |
return session.entity_from_eid(eid, etype) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
301 |
|
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
302 |
def process(self, url, partialcommit=True): |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
303 |
"""main callback: process the url""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
304 |
raise NotImplementedError |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
305 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
306 |
def before_entity_copy(self, entity, sourceparams): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
307 |
raise NotImplementedError |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
308 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
309 |
def after_entity_copy(self, entity, sourceparams): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
310 |
self.stats['created'].add(entity.eid) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
311 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
312 |
def created_during_pull(self, entity): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
313 |
return entity.eid in self.stats['created'] |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
314 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
315 |
def updated_during_pull(self, entity): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
316 |
return entity.eid in self.stats['updated'] |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
317 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
318 |
def notify_updated(self, entity): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
319 |
return self.stats['updated'].add(entity.eid) |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
320 |
|
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
321 |
|
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
322 |
class DataFeedXMLParser(DataFeedParser): |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
323 |
|
7385
29f050e39b09
[datafeed] propagate raise_on_error to parser's process method
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7379
diff
changeset
|
324 |
def process(self, url, raise_on_error=False, partialcommit=True): |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
325 |
"""IDataFeedParser main entry point""" |
7447
d5705c9bbe82
don't crash if we can't fetch data or if xml is malformed
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7446
diff
changeset
|
326 |
try: |
d5705c9bbe82
don't crash if we can't fetch data or if xml is malformed
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7446
diff
changeset
|
327 |
parsed = self.parse(url) |
d5705c9bbe82
don't crash if we can't fetch data or if xml is malformed
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7446
diff
changeset
|
328 |
except Exception, ex: |
7533
43835fbdf97d
[datafeed] actually raise on error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7527
diff
changeset
|
329 |
if raise_on_error: |
43835fbdf97d
[datafeed] actually raise on error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7527
diff
changeset
|
330 |
raise |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
331 |
self.source.error(str(ex)) |
7447
d5705c9bbe82
don't crash if we can't fetch data or if xml is malformed
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7446
diff
changeset
|
332 |
return True |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
333 |
error = False |
7447
d5705c9bbe82
don't crash if we can't fetch data or if xml is malformed
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7446
diff
changeset
|
334 |
for args in parsed: |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
335 |
try: |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
336 |
self.process_item(*args) |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
337 |
if partialcommit: |
7398
26695dd703d8
[repository api] definitly kill usage of word 'pool' to refer to connections set used by a session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7385
diff
changeset
|
338 |
# commit+set_cnxset instead of commit(free_cnxset=False) to let |
26695dd703d8
[repository api] definitly kill usage of word 'pool' to refer to connections set used by a session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7385
diff
changeset
|
339 |
# other a chance to get our connections set |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
340 |
self._cw.commit() |
7398
26695dd703d8
[repository api] definitly kill usage of word 'pool' to refer to connections set used by a session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7385
diff
changeset
|
341 |
self._cw.set_cnxset() |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
342 |
except ValidationError, exc: |
7385
29f050e39b09
[datafeed] propagate raise_on_error to parser's process method
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7379
diff
changeset
|
343 |
if raise_on_error: |
29f050e39b09
[datafeed] propagate raise_on_error to parser's process method
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7379
diff
changeset
|
344 |
raise |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
345 |
if partialcommit: |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
346 |
self.source.error('Skipping %s because of validation error %s' % (args, exc)) |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
347 |
self._cw.rollback() |
7398
26695dd703d8
[repository api] definitly kill usage of word 'pool' to refer to connections set used by a session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7385
diff
changeset
|
348 |
self._cw.set_cnxset() |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
349 |
error = True |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
350 |
else: |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
351 |
raise |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
352 |
return error |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
353 |
|
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
354 |
def parse(self, url): |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
355 |
if url.startswith('http'): |
7727
70ea754d3e04
[server] host mapping is better named url mapping (closes #1892461)
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
7708
diff
changeset
|
356 |
from cubicweb.sobjects.parsers import URL_MAPPING |
70ea754d3e04
[server] host mapping is better named url mapping (closes #1892461)
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
7708
diff
changeset
|
357 |
for mappedurl in URL_MAPPING: |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
358 |
if url.startswith(mappedurl): |
7727
70ea754d3e04
[server] host mapping is better named url mapping (closes #1892461)
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
7708
diff
changeset
|
359 |
url = url.replace(mappedurl, URL_MAPPING[mappedurl], 1) |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
360 |
break |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
361 |
self.source.info('GET %s', url) |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
362 |
stream = _OPENER.open(url) |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
363 |
elif url.startswith('file://'): |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
364 |
stream = open(url[7:]) |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
365 |
else: |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
366 |
stream = StringIO.StringIO(url) |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
367 |
return self.parse_etree(etree.parse(stream).getroot()) |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
368 |
|
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
369 |
def parse_etree(self, document): |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
370 |
return [(document,)] |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
371 |
|
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
372 |
def process_item(self, *args): |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
373 |
raise NotImplementedError |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
374 |
|
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
375 |
# use a cookie enabled opener to use session cookie if any |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
376 |
_OPENER = urllib2.build_opener() |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
377 |
try: |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
378 |
from logilab.common import urllib2ext |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
379 |
_OPENER.add_handler(urllib2ext.HTTPGssapiAuthHandler()) |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
380 |
except ImportError: # python-kerberos not available |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
381 |
pass |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
382 |
_OPENER.add_handler(urllib2.HTTPCookieProcessor(CookieJar())) |