author | Samuel Trégouët <samuel.tregouet@logilab.fr> |
Fri, 11 Sep 2015 15:52:18 +0200 | |
changeset 10592 | dfa1dcf4d7f1 |
parent 10581 | 7846d26ff91d |
child 10603 | 65ad6980976e |
permissions | -rw-r--r-- |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
1 |
# copyright 2010-2014 LOGILAB S.A. (Paris, FRANCE), all rights reserved. |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
2 |
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
3 |
# |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
4 |
# This file is part of CubicWeb. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
5 |
# |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
6 |
# CubicWeb is free software: you can redistribute it and/or modify it under the |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
7 |
# terms of the GNU Lesser General Public License as published by the Free |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
8 |
# Software Foundation, either version 2.1 of the License, or (at your option) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
9 |
# any later version. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
10 |
# |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
11 |
# CubicWeb is distributed in the hope that it will be useful, but WITHOUT |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
12 |
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
13 |
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
14 |
# details. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
15 |
# |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
16 |
# You should have received a copy of the GNU Lesser General Public License along |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
17 |
# with CubicWeb. If not, see <http://www.gnu.org/licenses/>. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
18 |
"""datafeed sources: copy data from an external data stream into the system |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
19 |
database |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
20 |
""" |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
21 |
|
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
22 |
import urllib2 |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
23 |
import StringIO |
8187
981f6e487788
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8069
diff
changeset
|
24 |
from os.path import exists |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
25 |
from datetime import datetime, timedelta |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
26 |
from cookielib import CookieJar |
10516
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
27 |
import urlparse |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
28 |
from lxml import etree |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
29 |
|
7399
972ed1843bd8
[multi-sources] support for moving an entity from an external source (closes #343818)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7398
diff
changeset
|
30 |
from cubicweb import RegistryNotFound, ObjectNotFound, ValidationError, UnknownEid |
8547
f23ac525ddd1
[datafeed] properly call hooks for inlined relations on entity creation. Closes #2481156
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8529
diff
changeset
|
31 |
from cubicweb.server.repository import preprocess_inlined_relations |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
32 |
from cubicweb.server.sources import AbstractSource |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
33 |
from cubicweb.appobject import AppObject |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
34 |
|
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
35 |
|
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
36 |
class DataFeedSource(AbstractSource): |
7552
82dde8276a5b
[datafeed, entities] url for entities from a datafeed source should be on their origin site. Closes #1769391
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7548
diff
changeset
|
37 |
use_cwuri_as_url = True |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
38 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
39 |
options = ( |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
40 |
('synchronize', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
41 |
{'type' : 'yn', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
42 |
'default': True, |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
43 |
'help': ('Is the repository responsible to automatically import ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
44 |
'content from this source? ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
45 |
'You should say yes unless you don\'t want this behaviour ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
46 |
'or if you use a multiple repositories setup, in which ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
47 |
'case you should say yes on one repository, no on others.'), |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
48 |
'group': 'datafeed-source', 'level': 2, |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
49 |
}), |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
50 |
('synchronization-interval', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
51 |
{'type' : 'time', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
52 |
'default': '5min', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
53 |
'help': ('Interval in seconds between synchronization with the ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
54 |
'external source (default to 5 minutes, must be >= 1 min).'), |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
55 |
'group': 'datafeed-source', 'level': 2, |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
56 |
}), |
7921
a93e2ed5877a
[datafeed] add max-lifetime for concurrent synchronization lock (closes #1908676)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7731
diff
changeset
|
57 |
('max-lock-lifetime', |
a93e2ed5877a
[datafeed] add max-lifetime for concurrent synchronization lock (closes #1908676)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7731
diff
changeset
|
58 |
{'type' : 'time', |
a93e2ed5877a
[datafeed] add max-lifetime for concurrent synchronization lock (closes #1908676)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7731
diff
changeset
|
59 |
'default': '1h', |
a93e2ed5877a
[datafeed] add max-lifetime for concurrent synchronization lock (closes #1908676)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7731
diff
changeset
|
60 |
'help': ('Maximum time allowed for a synchronization to be run. ' |
a93e2ed5877a
[datafeed] add max-lifetime for concurrent synchronization lock (closes #1908676)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7731
diff
changeset
|
61 |
'Exceeded that time, the synchronization will be considered ' |
a93e2ed5877a
[datafeed] add max-lifetime for concurrent synchronization lock (closes #1908676)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7731
diff
changeset
|
62 |
'as having failed and not properly released the lock, hence ' |
a93e2ed5877a
[datafeed] add max-lifetime for concurrent synchronization lock (closes #1908676)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7731
diff
changeset
|
63 |
'it won\'t be considered'), |
a93e2ed5877a
[datafeed] add max-lifetime for concurrent synchronization lock (closes #1908676)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7731
diff
changeset
|
64 |
'group': 'datafeed-source', 'level': 2, |
a93e2ed5877a
[datafeed] add max-lifetime for concurrent synchronization lock (closes #1908676)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7731
diff
changeset
|
65 |
}), |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
66 |
('delete-entities', |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
67 |
{'type' : 'yn', |
8430
5bee87a14bb1
fix ldap removal handling in ldapfeed (closes #2376625 and #2385133)
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
8429
diff
changeset
|
68 |
'default': False, |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
69 |
'help': ('Should already imported entities not found anymore on the ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
70 |
'external source be deleted?'), |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
71 |
'group': 'datafeed-source', 'level': 2, |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
72 |
}), |
7995
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
73 |
('logs-lifetime', |
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
74 |
{'type': 'time', |
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
75 |
'default': '10d', |
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
76 |
'help': ('Time before logs from datafeed imports are deleted.'), |
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
77 |
'group': 'datafeed-source', 'level': 2, |
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
78 |
}), |
9182
75493f6ca586
[datafeed] add a timeout config option (closes #2745677)
David Douard <david.douard@logilab.fr>
parents:
8695
diff
changeset
|
79 |
('http-timeout', |
75493f6ca586
[datafeed] add a timeout config option (closes #2745677)
David Douard <david.douard@logilab.fr>
parents:
8695
diff
changeset
|
80 |
{'type': 'time', |
75493f6ca586
[datafeed] add a timeout config option (closes #2745677)
David Douard <david.douard@logilab.fr>
parents:
8695
diff
changeset
|
81 |
'default': '1min', |
75493f6ca586
[datafeed] add a timeout config option (closes #2745677)
David Douard <david.douard@logilab.fr>
parents:
8695
diff
changeset
|
82 |
'help': ('Timeout of HTTP GET requests, when synchronizing a source.'), |
75493f6ca586
[datafeed] add a timeout config option (closes #2745677)
David Douard <david.douard@logilab.fr>
parents:
8695
diff
changeset
|
83 |
'group': 'datafeed-source', 'level': 2, |
75493f6ca586
[datafeed] add a timeout config option (closes #2745677)
David Douard <david.douard@logilab.fr>
parents:
8695
diff
changeset
|
84 |
}), |
9822
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
85 |
('use-cwuri-as-url', |
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
86 |
{'type': 'yn', |
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
87 |
'default': None, # explicitly unset |
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
88 |
'help': ('Use cwuri (i.e. external URL) for link to the entity ' |
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
89 |
'instead of its local URL.'), |
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
90 |
'group': 'datafeed-source', 'level': 1, |
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
91 |
}), |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
92 |
) |
8430
5bee87a14bb1
fix ldap removal handling in ldapfeed (closes #2376625 and #2385133)
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
8429
diff
changeset
|
93 |
|
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
94 |
def check_config(self, source_entity): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
95 |
"""check configuration of source entity""" |
8674
001c1592060a
[repo sources] move handling of source's url into abstract source as this becomes shared by most sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8573
diff
changeset
|
96 |
typed_config = super(DataFeedSource, self).check_config(source_entity) |
001c1592060a
[repo sources] move handling of source's url into abstract source as this becomes shared by most sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8573
diff
changeset
|
97 |
if typed_config['synchronization-interval'] < 60: |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
98 |
_ = source_entity._cw._ |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
99 |
msg = _('synchronization-interval must be greater than 1 minute') |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
100 |
raise ValidationError(source_entity.eid, {'config': msg}) |
8674
001c1592060a
[repo sources] move handling of source's url into abstract source as this becomes shared by most sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8573
diff
changeset
|
101 |
return typed_config |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
102 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
103 |
def _entity_update(self, source_entity): |
8674
001c1592060a
[repo sources] move handling of source's url into abstract source as this becomes shared by most sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8573
diff
changeset
|
104 |
super(DataFeedSource, self)._entity_update(source_entity) |
7527
ef1e9bc38137
[datafeed] renaming parser attribute to parser_id makes things clearer
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7461
diff
changeset
|
105 |
self.parser_id = source_entity.parser |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
106 |
self.latest_retrieval = source_entity.latest_retrieval |
8188
1867e252e487
[repository] ldap-feed source. Closes #2086984
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8187
diff
changeset
|
107 |
|
8674
001c1592060a
[repo sources] move handling of source's url into abstract source as this becomes shared by most sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8573
diff
changeset
|
108 |
def update_config(self, source_entity, typed_config): |
001c1592060a
[repo sources] move handling of source's url into abstract source as this becomes shared by most sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8573
diff
changeset
|
109 |
"""update configuration from source entity. `typed_config` is config |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
110 |
properly typed with defaults set |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
111 |
""" |
8674
001c1592060a
[repo sources] move handling of source's url into abstract source as this becomes shared by most sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8573
diff
changeset
|
112 |
super(DataFeedSource, self).update_config(source_entity, typed_config) |
001c1592060a
[repo sources] move handling of source's url into abstract source as this becomes shared by most sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8573
diff
changeset
|
113 |
self.synchro_interval = timedelta(seconds=typed_config['synchronization-interval']) |
001c1592060a
[repo sources] move handling of source's url into abstract source as this becomes shared by most sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8573
diff
changeset
|
114 |
self.max_lock_lifetime = timedelta(seconds=typed_config['max-lock-lifetime']) |
9182
75493f6ca586
[datafeed] add a timeout config option (closes #2745677)
David Douard <david.douard@logilab.fr>
parents:
8695
diff
changeset
|
115 |
self.http_timeout = typed_config['http-timeout'] |
9822
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
116 |
# if typed_config['use-cwuri-as-url'] is set, we have to update |
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
117 |
# use_cwuri_as_url attribute and public configuration dictionary |
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
118 |
# accordingly |
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
119 |
if typed_config['use-cwuri-as-url'] is not None: |
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
120 |
self.use_cwuri_as_url = typed_config['use-cwuri-as-url'] |
4a118bfd6ab4
[datafeed] Allow to override use_cwuri_as_url in configuration of a datafeed source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9746
diff
changeset
|
121 |
self.public_config['use-cwuri-as-url'] = self.use_cwuri_as_url |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
122 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
123 |
def init(self, activated, source_entity): |
8674
001c1592060a
[repo sources] move handling of source's url into abstract source as this becomes shared by most sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8573
diff
changeset
|
124 |
super(DataFeedSource, self).init(activated, source_entity) |
7527
ef1e9bc38137
[datafeed] renaming parser attribute to parser_id makes things clearer
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7461
diff
changeset
|
125 |
self.parser_id = source_entity.parser |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
126 |
self.load_mapping(source_entity._cw) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
127 |
|
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
128 |
def _get_parser(self, cnx, **kwargs): |
10454
20f45a9b385c
[datafeed] give an error message if a source is missing a parser id
Julien Cristau <julien.cristau@logilab.fr>
parents:
10143
diff
changeset
|
129 |
if self.parser_id is None: |
20f45a9b385c
[datafeed] give an error message if a source is missing a parser id
Julien Cristau <julien.cristau@logilab.fr>
parents:
10143
diff
changeset
|
130 |
self.warning('No parser defined on source %r', self) |
20f45a9b385c
[datafeed] give an error message if a source is missing a parser id
Julien Cristau <julien.cristau@logilab.fr>
parents:
10143
diff
changeset
|
131 |
raise ObjectNotFound() |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
132 |
return self.repo.vreg['parsers'].select( |
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
133 |
self.parser_id, cnx, source=self, **kwargs) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
134 |
|
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
135 |
def load_mapping(self, cnx): |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
136 |
self.mapping = {} |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
137 |
self.mapping_idx = {} |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
138 |
try: |
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
139 |
parser = self._get_parser(cnx) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
140 |
except (RegistryNotFound, ObjectNotFound): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
141 |
return # no parser yet, don't go further |
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
142 |
self._load_mapping(cnx, parser=parser) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
143 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
144 |
def add_schema_config(self, schemacfg, checkonly=False, parser=None): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
145 |
"""added CWSourceSchemaConfig, modify mapping accordingly""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
146 |
if parser is None: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
147 |
parser = self._get_parser(schemacfg._cw) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
148 |
parser.add_schema_config(schemacfg, checkonly) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
149 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
150 |
def del_schema_config(self, schemacfg, checkonly=False, parser=None): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
151 |
"""deleted CWSourceSchemaConfig, modify mapping accordingly""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
152 |
if parser is None: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
153 |
parser = self._get_parser(schemacfg._cw) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
154 |
parser.del_schema_config(schemacfg, checkonly) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
155 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
156 |
def fresh(self): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
157 |
if self.latest_retrieval is None: |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
158 |
return False |
7443
c10164464afc
[datafeed] we should use utc timestamp to avoid pb with local times
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
159 |
return datetime.utcnow() < (self.latest_retrieval + self.synchro_interval) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
160 |
|
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
161 |
def update_latest_retrieval(self, cnx): |
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
162 |
self.latest_retrieval = datetime.utcnow() |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
163 |
cnx.execute('SET X latest_retrieval %(date)s WHERE X eid %(x)s', |
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
164 |
{'x': self.eid, 'date': self.latest_retrieval}) |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
165 |
cnx.commit() |
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
166 |
|
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
167 |
def acquire_synchronization_lock(self, cnx): |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
168 |
# XXX race condition until WHERE of SET queries is executed using |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
169 |
# 'SELECT FOR UPDATE' |
7921
a93e2ed5877a
[datafeed] add max-lifetime for concurrent synchronization lock (closes #1908676)
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7731
diff
changeset
|
170 |
now = datetime.utcnow() |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
171 |
if not cnx.execute( |
7933
b25dda2214a2
[datafeed] remove remaining uses of 'synchronizing' attribute replaced by 'in_synchronization' in 3.13.8. Closes #1989131
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7921
diff
changeset
|
172 |
'SET X in_synchronization %(now)s WHERE X eid %(x)s, ' |
b25dda2214a2
[datafeed] remove remaining uses of 'synchronizing' attribute replaced by 'in_synchronization' in 3.13.8. Closes #1989131
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7921
diff
changeset
|
173 |
'X in_synchronization NULL OR X in_synchronization < %(maxdt)s', |
b25dda2214a2
[datafeed] remove remaining uses of 'synchronizing' attribute replaced by 'in_synchronization' in 3.13.8. Closes #1989131
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7921
diff
changeset
|
174 |
{'x': self.eid, 'now': now, 'maxdt': now - self.max_lock_lifetime}): |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
175 |
self.error('concurrent synchronization detected, skip pull') |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
176 |
cnx.commit() |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
177 |
return False |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
178 |
cnx.commit() |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
179 |
return True |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
180 |
|
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
181 |
def release_synchronization_lock(self, cnx): |
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
182 |
cnx.execute('SET X in_synchronization NULL WHERE X eid %(x)s', |
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
183 |
{'x': self.eid}) |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
184 |
cnx.commit() |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
185 |
|
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
186 |
def pull_data(self, cnx, force=False, raise_on_error=False): |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
187 |
"""Launch synchronization of the source if needed. |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
188 |
|
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
189 |
This method is responsible to handle commit/rollback on the given |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
190 |
connection. |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
191 |
""" |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
192 |
if not force and self.fresh(): |
6972
12aa5cd81ce5
[datafeed] return empty dict when source is fresh avoid crash in the looping task because None returned
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
6957
diff
changeset
|
193 |
return {} |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
194 |
if not self.acquire_synchronization_lock(cnx): |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
195 |
return {} |
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
196 |
try: |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
197 |
return self._pull_data(cnx, force, raise_on_error) |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
198 |
finally: |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
199 |
cnx.rollback() # rollback first in case there is some dirty |
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
200 |
# transaction remaining |
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
201 |
self.release_synchronization_lock(cnx) |
7456
c54038622fc9
[datafeed] use a boolean flag on CWSource to ensure we don't have concurrent synchronizations. Closes #1725690
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7447
diff
changeset
|
202 |
|
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
203 |
def _pull_data(self, cnx, force=False, raise_on_error=False): |
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
204 |
importlog = self.init_import_log(cnx) |
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
205 |
myuris = self.source_cwuris(cnx) |
10454
20f45a9b385c
[datafeed] give an error message if a source is missing a parser id
Julien Cristau <julien.cristau@logilab.fr>
parents:
10143
diff
changeset
|
206 |
try: |
20f45a9b385c
[datafeed] give an error message if a source is missing a parser id
Julien Cristau <julien.cristau@logilab.fr>
parents:
10143
diff
changeset
|
207 |
parser = self._get_parser(cnx, sourceuris=myuris, import_log=importlog) |
20f45a9b385c
[datafeed] give an error message if a source is missing a parser id
Julien Cristau <julien.cristau@logilab.fr>
parents:
10143
diff
changeset
|
208 |
except ObjectNotFound: |
20f45a9b385c
[datafeed] give an error message if a source is missing a parser id
Julien Cristau <julien.cristau@logilab.fr>
parents:
10143
diff
changeset
|
209 |
return {} |
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
210 |
if self.process_urls(parser, self.urls, raise_on_error): |
9517
3338b2205ea3
Typo in comments and error messages
Dimitri Papadopoulos <dimitri.papadopoulos@cea.fr>
parents:
9224
diff
changeset
|
211 |
self.warning("some error occurred, don't attempt to delete entities") |
8430
5bee87a14bb1
fix ldap removal handling in ldapfeed (closes #2376625 and #2385133)
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
8429
diff
changeset
|
212 |
else: |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
213 |
parser.handle_deletion(self.config, cnx, myuris) |
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
214 |
self.update_latest_retrieval(cnx) |
7995
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
215 |
stats = parser.stats |
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
216 |
if stats.get('created'): |
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
217 |
importlog.record_info('added %s entities' % len(stats['created'])) |
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
218 |
if stats.get('updated'): |
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
219 |
importlog.record_info('updated %s entities' % len(stats['updated'])) |
9746
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
220 |
importlog.write_log(cnx, end_timestamp=self.latest_retrieval) |
81b56897a377
[datafeed] update datafeed internals to use connection instead of session
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9665
diff
changeset
|
221 |
cnx.commit() |
7995
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
222 |
return stats |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
223 |
|
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
224 |
def process_urls(self, parser, urls, raise_on_error=False): |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
225 |
error = False |
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
226 |
for url in urls: |
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
227 |
self.info('pulling data from %s', url) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
228 |
try: |
7385
29f050e39b09
[datafeed] propagate raise_on_error to parser's process method
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7379
diff
changeset
|
229 |
if parser.process(url, raise_on_error): |
7001
c53aa19640b2
[sobjects/parsers] on validationerror, skip entity and continue processing feed
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
6972
diff
changeset
|
230 |
error = True |
8695
358d8bed9626
[toward-py3k] rewrite to "except AnException as exc:" (part of #2711624)
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
8694
diff
changeset
|
231 |
except IOError as exc: |
7351
ed66f236715d
fix _set_relation when no target eids, update datafeed source pull_data arguments to raise on error during tests
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7001
diff
changeset
|
232 |
if raise_on_error: |
ed66f236715d
fix _set_relation when no target eids, update datafeed source pull_data arguments to raise on error during tests
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7001
diff
changeset
|
233 |
raise |
8069
4341fb713b14
[datafeed log] properly log errors catched at the source level
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8068
diff
changeset
|
234 |
parser.import_log.record_error( |
4341fb713b14
[datafeed log] properly log errors catched at the source level
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8068
diff
changeset
|
235 |
'could not pull data while processing %s: %s' |
4341fb713b14
[datafeed log] properly log errors catched at the source level
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8068
diff
changeset
|
236 |
% (url, exc)) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
237 |
error = True |
8695
358d8bed9626
[toward-py3k] rewrite to "except AnException as exc:" (part of #2711624)
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
8694
diff
changeset
|
238 |
except Exception as exc: |
7590
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
239 |
if raise_on_error: |
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
240 |
raise |
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
241 |
self.exception('error while processing %s: %s', |
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
242 |
url, exc) |
a9aad6c25836
closes #1793991: [datafeed] error handling: log unexpected exceptions / don't stop on first validation error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7552
diff
changeset
|
243 |
error = True |
7446
6fba86efdd09
[datafeed] extract some methods from pull_data to ease writing custom datafeed sources
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7444
diff
changeset
|
244 |
return error |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
245 |
|
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
246 |
def before_entity_insertion(self, cnx, lid, etype, eid, sourceparams): |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
247 |
"""called by the repository when an eid has been attributed for an |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
248 |
entity stored here but the entity has not been inserted in the system |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
249 |
table yet. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
250 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
251 |
This method must return an Entity instance representation of this |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
252 |
entity. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
253 |
""" |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
254 |
entity = super(DataFeedSource, self).before_entity_insertion( |
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
255 |
cnx, lid, etype, eid, sourceparams) |
7731
48e78934a4e2
[datafeed] properly encode/decode external uri
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7727
diff
changeset
|
256 |
entity.cw_edited['cwuri'] = lid.decode('utf-8') |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
257 |
entity.cw_edited.set_defaults() |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
258 |
sourceparams['parser'].before_entity_copy(entity, sourceparams) |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
259 |
return entity |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
260 |
|
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
261 |
def after_entity_insertion(self, cnx, lid, entity, sourceparams): |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
262 |
"""called by the repository after an entity stored here has been |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
263 |
inserted in the system table. |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
264 |
""" |
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
265 |
relations = preprocess_inlined_relations(cnx, entity) |
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
266 |
if cnx.is_hook_category_activated('integrity'): |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
267 |
entity.cw_edited.check(creation=True) |
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
268 |
self.repo.system_source.add_entity(cnx, entity) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
269 |
entity.cw_edited.saved = entity._cw_is_saved = True |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
270 |
sourceparams['parser'].after_entity_copy(entity, sourceparams) |
8547
f23ac525ddd1
[datafeed] properly call hooks for inlined relations on entity creation. Closes #2481156
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8529
diff
changeset
|
271 |
# call hooks for inlined relations |
f23ac525ddd1
[datafeed] properly call hooks for inlined relations on entity creation. Closes #2481156
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8529
diff
changeset
|
272 |
call_hooks = self.repo.hm.call_hooks |
f23ac525ddd1
[datafeed] properly call hooks for inlined relations on entity creation. Closes #2481156
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8529
diff
changeset
|
273 |
if self.should_call_hooks: |
f23ac525ddd1
[datafeed] properly call hooks for inlined relations on entity creation. Closes #2481156
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8529
diff
changeset
|
274 |
for attr, value in relations: |
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
275 |
call_hooks('before_add_relation', cnx, |
8547
f23ac525ddd1
[datafeed] properly call hooks for inlined relations on entity creation. Closes #2481156
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8529
diff
changeset
|
276 |
eidfrom=entity.eid, rtype=attr, eidto=value) |
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
277 |
call_hooks('after_add_relation', cnx, |
8547
f23ac525ddd1
[datafeed] properly call hooks for inlined relations on entity creation. Closes #2481156
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8529
diff
changeset
|
278 |
eidfrom=entity.eid, rtype=attr, eidto=value) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
279 |
|
9860
e24bf60428d3
[tests/datafeed] use the new connection api (a small leftover)
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9746
diff
changeset
|
280 |
def source_cwuris(self, cnx): |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
281 |
sql = ('SELECT extid, eid, type FROM entities, cw_source_relation ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
282 |
'WHERE entities.eid=cw_source_relation.eid_from ' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
283 |
'AND cw_source_relation.eid_to=%s' % self.eid) |
10581
7846d26ff91d
[server/sources] use decode_extid in datafeed
Julien Cristau <julien.cristau@logilab.fr>
parents:
10551
diff
changeset
|
284 |
return dict((self.decode_extid(uri), (eid, type)) |
9860
e24bf60428d3
[tests/datafeed] use the new connection api (a small leftover)
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9746
diff
changeset
|
285 |
for uri, eid, type in cnx.system_sql(sql).fetchall()) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
286 |
|
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
287 |
def init_import_log(self, cnx, **kwargs): |
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
288 |
dataimport = cnx.create_entity('CWDataImport', cw_import_of=self, |
7995
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
289 |
start_timestamp=datetime.utcnow(), |
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
290 |
**kwargs) |
8068
72210779ff6d
[data import log] log on the source so we properly have source name information
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7995
diff
changeset
|
291 |
dataimport.init() |
7995
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
292 |
return dataimport |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
293 |
|
8187
981f6e487788
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8069
diff
changeset
|
294 |
|
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
295 |
class DataFeedParser(AppObject): |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
296 |
__registry__ = 'parsers' |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
297 |
|
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
298 |
def __init__(self, cnx, source, sourceuris=None, import_log=None, **kwargs): |
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
299 |
super(DataFeedParser, self).__init__(cnx, **kwargs) |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
300 |
self.source = source |
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
301 |
self.sourceuris = sourceuris |
7995
9a9f35ef418c
Record a log of datafeed source imports (closes #2026097)
Julien Cristau <julien.cristau@logilab.fr>
parents:
7950
diff
changeset
|
302 |
self.import_log = import_log |
8435
5064b6e0d6f4
[datafeed] correctly distinguish checked/updated
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
8434
diff
changeset
|
303 |
self.stats = {'created': set(), 'updated': set(), 'checked': set()} |
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
304 |
|
8187
981f6e487788
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8069
diff
changeset
|
305 |
def normalize_url(self, url):
    """Return `url` with its prefix rewritten according to
    `cubicweb.sobjects.URL_MAPPING`.

    Each key of that dictionary is a url prefix and the associated value is
    what to substitute for it. This makes it possible to redirect from one
    host to another, e.g. for a test instance using production data when
    you neither want to hit the external source nor to hack your
    `/etc/hosts` file.

    The first prefix found to match is applied; the url is returned
    unchanged when no prefix matches.
    """
    # local import mandatory, it's available after registration
    from cubicweb.sobjects import URL_MAPPING
    for prefix in URL_MAPPING:
        if not url.startswith(prefix):
            continue
        # the prefix sits at the very beginning of the url, hence swapping
        # the leading characters is the same as replacing its first
        # occurrence
        return URL_MAPPING[prefix] + url[len(prefix):]
    return url
981f6e487788
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8069
diff
changeset
|
320 |
|
10516
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
321 |
def retrieve_url(self, url):
    """Return stream linked by the given url:

    * file:// URLs are opened from the local filesystem, without
      normalization
    * any other url is normalized first (see :meth:`normalize_url`)
    * http and https URLs are fetched using `self.source.http_timeout` as
      timeout
    * other will be considered as plain content, useful for testing purpose

    Before falling back to the regular HTTP opener, it will try to find a
    cwclientlib config entry matching the normalized url (if cwclientlib
    is available) and use it as requester.

    Note the URL scheme is read from the url as given, i.e. before
    normalization.
    """
    purl = urlparse.urlparse(url)
    if purl.scheme == 'file':
        # strip the leading 'file://' to get the filesystem path
        return URLLibResponseAdapter(open(url[7:]), url)

    url = self.normalize_url(url)

    # first, try to use cwclientlib if it's available and if the
    # url matches a configuration entry in ~/.config/cwclientlibrc
    try:
        from cwclientlib import cwproxy_for
        # the proxy is looked up using the normalized url
        cnx = cwproxy_for(url)
        cnx.timeout = self.source.http_timeout
        # let the logger do the formatting, as for the 'GET' message below
        self.source.info('Using cwclientlib for %s', url)
        resp = cnx.get(url)
        resp.raise_for_status()
        return URLLibResponseAdapter(StringIO.StringIO(resp.text), url)
    except (ImportError, ValueError, EnvironmentError) as exc:
        # ImportError: not available
        # ValueError: no config entry found
        # EnvironmentError: no cwclientlib config file found
        self.source.debug(str(exc))

    # no chance with cwclientlib, fall back to former implementation
    if purl.scheme in ('http', 'https'):
        self.source.info('GET %s', url)
        req = urllib2.Request(url)
        return _OPENER.open(req, timeout=self.source.http_timeout)

    # url is probably plain content
    return URLLibResponseAdapter(StringIO.StringIO(url), url)
9823
258d2f9f7d39
[datafeed parser] factor out retrieve_url method from DataFeedXMLParser.parse
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9822
diff
changeset
|
361 |
|
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
362 |
def add_schema_config(self, schemacfg, checkonly=False):
    """Hook called when a CWSourceSchemaConfig is added.

    This parser has no mapping to update, hence the addition is always
    rejected with a ValidationError attached to the configuration entity.
    """
    errors = {None: schemacfg._cw._("this parser doesn't use a mapping")}
    raise ValidationError(schemacfg.eid, errors)
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
366 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
367 |
def del_schema_config(self, schemacfg, checkonly=False):
    """Hook called when a CWSourceSchemaConfig is deleted.

    This parser has no mapping to update, hence the deletion is always
    rejected with a ValidationError attached to the configuration entity.
    """
    errors = {None: schemacfg._cw._("this parser doesn't use a mapping")}
    raise ValidationError(schemacfg.eid, errors)
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
371 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
372 |
def extid2entity(self, uri, etype, **sourceparams):
    """Return an entity for the given uri. May return None if it should be
    skipped.

    Two keyword parameters are consumed here, the remaining ones being
    given as `sourceparams` to the repository's `extid2eid` method (along
    with this parser, under the 'parser' key):

    * `raise_on_error`: if true, a ValidationError raised while getting the
      eid is propagated; otherwise (the default) the error is logged on
      the source, recorded in the import log if there is one, and None is
      returned.

    * `cwsource`: uri of the source the entity should be attached to.

    None is also returned when the entity has been moved away from its
    original source and doesn't exist anymore.
    """
    raise_on_error = sourceparams.pop('raise_on_error', False)
    cnx = self._cw
    # if cwsource is specified and repository has a source with the same
    # name, call extid2eid on that source so entity will be properly seen as
    # coming from this source
    source_uri = sourceparams.pop('cwsource', None)
    if source_uri is not None and source_uri != 'system':
        source = cnx.repo.sources_by_uri.get(source_uri, self.source)
    else:
        source = self.source
    sourceparams['parser'] = self
    if isinstance(uri, unicode):
        uri = uri.encode('utf-8')
    try:
        eid = cnx.repo.extid2eid(source, str(uri), etype, cnx,
                                 sourceparams=sourceparams)
    except ValidationError as ex:
        if raise_on_error:
            raise
        self.source.critical('error while creating %s: %s', etype, ex)
        # import_log is optional (it defaults to None): don't turn the
        # validation error being handled into an AttributeError
        if self.import_log is not None:
            self.import_log.record_error('error while creating %s: %s'
                                         % (etype, ex))
        return None
    if eid < 0:
        # entity has been moved away from its original source
        #
        # Don't give etype to entity_from_eid so we get UnknownEid if the
        # entity has been removed
        try:
            entity = cnx.entity_from_eid(-eid)
        except UnknownEid:
            return None
        self.notify_updated(entity) # avoid later update from the source's data
        return entity
    if self.sourceuris is not None:
        # this uri has been seen during the pull
        self.sourceuris.pop(str(uri), None)
    return cnx.entity_from_eid(eid, etype)
6957
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
416 |
|
8409
79534887943e
[datafeed] fix/finish cleanup started by auc in 8393:c25b96ae4f8a: parser.process prototytpe is (url, raise_on_error=False). Drop partialcommit argument which were never specified
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8408
diff
changeset
|
417 |
def process(self, url, raise_on_error=False):
    """Main callback, in charge of processing the given url.

    Nothing is done at this level: NotImplementedError is raised
    unconditionally.
    """
    raise NotImplementedError()
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
420 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
421 |
def before_entity_copy(self, entity, sourceparams):
    """Callback taking an entity and the source parameters.

    Nothing is done at this level: NotImplementedError is raised
    unconditionally.
    """
    raise NotImplementedError()
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
423 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
424 |
def after_entity_copy(self, entity, sourceparams):
    """Record the entity's eid among the ones created during the pull."""
    created_eids = self.stats['created']
    created_eids.add(entity.eid)
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
426 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
427 |
def created_during_pull(self, entity):
    """Tell whether the entity's eid has been recorded as created."""
    created_eids = self.stats['created']
    return entity.eid in created_eids
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
429 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
430 |
def updated_during_pull(self, entity):
    """Tell whether the entity's eid has been recorded as updated."""
    updated_eids = self.stats['updated']
    return entity.eid in updated_eids
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
432 |
|
ffda12be2e9f
[repository] #1460066: backport datafeed cube as cubicweb source
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
diff
changeset
|
433 |
def notify_updated(self, entity):
    """Record the entity's eid among the updated ones.

    The value returned by the underlying `add` call is handed back.
    """
    updated_eids = self.stats['updated']
    return updated_eids.add(entity.eid)
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
435 |
|
8435
5064b6e0d6f4
[datafeed] correctly distinguish checked/updated
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
8434
diff
changeset
|
436 |
def notify_checked(self, entity):
    """Record the entity's eid among the checked ones.

    The value returned by the underlying `add` call is handed back.
    """
    checked_eids = self.stats['checked']
    return checked_eids.add(entity.eid)
5064b6e0d6f4
[datafeed] correctly distinguish checked/updated
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
8434
diff
changeset
|
438 |
|
8187
981f6e487788
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8069
diff
changeset
|
439 |
def is_deleted(self, extid, etype, eid):
    """Tell whether the entity of given external id, entity type and eid
    is actually deleted.

    This default implementation always answers True; sub-classes are
    expected to put more sensible stuff here.
    """
    return True
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
445 |
|
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
446 |
def handle_deletion(self, config, cnx, myuris):
    """Delete entities whose external uri is listed in `myuris`.

    Nothing is done unless the 'delete-entities' option of `config` is set
    and `myuris` isn't empty. `myuris` maps an external id to an
    (eid, entity type) pair; only the entities for which `is_deleted`
    answers true are deleted, using one query per entity type, then the
    connection is committed.
    """
    if not (config['delete-entities'] and myuris):
        return
    # group eids (as strings, ready to be joined in the query) by type
    eids_by_etype = {}
    for extid, (eid, etype) in myuris.iteritems():
        if not self.is_deleted(extid, etype, eid):
            continue
        eids_by_etype.setdefault(etype, []).append(str(eid))
    for etype, eids in eids_by_etype.iteritems():
        self.warning('delete %s %s entities', len(eids), etype)
        rql = 'DELETE %s X WHERE X eid IN (%s)' % (etype, ','.join(eids))
        cnx.execute(rql)
    # single commit once every deletion has been issued
    cnx.commit()
8430
5bee87a14bb1
fix ldap removal handling in ldapfeed (closes #2376625 and #2385133)
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
8429
diff
changeset
|
457 |
|
8188
1867e252e487
[repository] ldap-feed source. Closes #2086984
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8187
diff
changeset
|
458 |
def update_if_necessary(self, entity, attrs): |
1867e252e487
[repository] ldap-feed source. Closes #2086984
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8187
diff
changeset
|
459 |
entity.complete(tuple(attrs)) |
1867e252e487
[repository] ldap-feed source. Closes #2086984
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8187
diff
changeset
|
460 |
# check modification date and compare attribute values to only update |
1867e252e487
[repository] ldap-feed source. Closes #2086984
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8187
diff
changeset
|
461 |
# what's actually needed |
8435
5064b6e0d6f4
[datafeed] correctly distinguish checked/updated
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
8434
diff
changeset
|
462 |
self.notify_checked(entity) |
8188
1867e252e487
[repository] ldap-feed source. Closes #2086984
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8187
diff
changeset
|
463 |
mdate = attrs.get('modification_date') |
1867e252e487
[repository] ldap-feed source. Closes #2086984
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8187
diff
changeset
|
464 |
if not mdate or mdate > entity.modification_date: |
1867e252e487
[repository] ldap-feed source. Closes #2086984
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8187
diff
changeset
|
465 |
attrs = dict( (k, v) for k, v in attrs.iteritems() |
1867e252e487
[repository] ldap-feed source. Closes #2086984
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8187
diff
changeset
|
466 |
if v != getattr(entity, k)) |
1867e252e487
[repository] ldap-feed source. Closes #2086984
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8187
diff
changeset
|
467 |
if attrs: |
8483
4ba11607d84a
[entity api] unify set_attributes / set_relations into a cw_set method. Closes #2423719
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8435
diff
changeset
|
468 |
entity.cw_set(**attrs) |
8434
39c5bb4dcc59
[ldapfeed] do not crash on ldap user deletion + pull + already deactivated users, cleanups (closes #2392933)
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
8430
diff
changeset
|
469 |
self.notify_updated(entity) |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
470 |
|
8547
f23ac525ddd1
[datafeed] properly call hooks for inlined relations on entity creation. Closes #2481156
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8529
diff
changeset
|
471 |
|
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
472 |
class DataFeedXMLParser(DataFeedParser): |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
473 |
|
8409
79534887943e
[datafeed] fix/finish cleanup started by auc in 8393:c25b96ae4f8a: parser.process prototytpe is (url, raise_on_error=False). Drop partialcommit argument which were never specified
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8408
diff
changeset
|
474 |
def process(self, url, raise_on_error=False): |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
475 |
"""IDataFeedParser main entry point""" |
7447
d5705c9bbe82
don't crash if we can't fetch data or if xml is malformed
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7446
diff
changeset
|
476 |
try: |
d5705c9bbe82
don't crash if we can't fetch data or if xml is malformed
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7446
diff
changeset
|
477 |
parsed = self.parse(url) |
8695
358d8bed9626
[toward-py3k] rewrite to "except AnException as exc:" (part of #2711624)
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
8694
diff
changeset
|
478 |
except Exception as ex: |
7533
43835fbdf97d
[datafeed] actually raise on error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7527
diff
changeset
|
479 |
if raise_on_error: |
43835fbdf97d
[datafeed] actually raise on error
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7527
diff
changeset
|
480 |
raise |
8069
4341fb713b14
[datafeed log] properly log errors catched at the source level
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8068
diff
changeset
|
481 |
self.import_log.record_error(str(ex)) |
7447
d5705c9bbe82
don't crash if we can't fetch data or if xml is malformed
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7446
diff
changeset
|
482 |
return True |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
483 |
error = False |
9879
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
484 |
commit = self._cw.commit |
21278eb03bbf
[datafeed sources] finish the session -> cnx switch
Aurelien Campeas <aurelien.campeas@logilab.fr>
parents:
9860
diff
changeset
|
485 |
rollback = self._cw.rollback |
7447
d5705c9bbe82
don't crash if we can't fetch data or if xml is malformed
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7446
diff
changeset
|
486 |
for args in parsed: |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
487 |
try: |
10089
6346f53c85f1
[datafeed] Add a raise_on_error parameter to DataFeedSource.extid2entity
Denis Laxalde <denis.laxalde@logilab.fr>
parents:
9990
diff
changeset
|
488 |
self.process_item(*args, raise_on_error=raise_on_error) |
8409
79534887943e
[datafeed] fix/finish cleanup started by auc in 8393:c25b96ae4f8a: parser.process prototytpe is (url, raise_on_error=False). Drop partialcommit argument which were never specified
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8408
diff
changeset
|
489 |
# commit+set_cnxset instead of commit(free_cnxset=False) to let |
79534887943e
[datafeed] fix/finish cleanup started by auc in 8393:c25b96ae4f8a: parser.process prototytpe is (url, raise_on_error=False). Drop partialcommit argument which were never specified
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8408
diff
changeset
|
490 |
# other a chance to get our connections set |
79534887943e
[datafeed] fix/finish cleanup started by auc in 8393:c25b96ae4f8a: parser.process prototytpe is (url, raise_on_error=False). Drop partialcommit argument which were never specified
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8408
diff
changeset
|
491 |
commit() |
8695
358d8bed9626
[toward-py3k] rewrite to "except AnException as exc:" (part of #2711624)
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
8694
diff
changeset
|
492 |
except ValidationError as exc: |
7385
29f050e39b09
[datafeed] propagate raise_on_error to parser's process method
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7379
diff
changeset
|
493 |
if raise_on_error: |
29f050e39b09
[datafeed] propagate raise_on_error to parser's process method
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7379
diff
changeset
|
494 |
raise |
8409
79534887943e
[datafeed] fix/finish cleanup started by auc in 8393:c25b96ae4f8a: parser.process prototytpe is (url, raise_on_error=False). Drop partialcommit argument which were never specified
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8408
diff
changeset
|
495 |
self.source.error('Skipping %s because of validation error %s' |
79534887943e
[datafeed] fix/finish cleanup started by auc in 8393:c25b96ae4f8a: parser.process prototytpe is (url, raise_on_error=False). Drop partialcommit argument which were never specified
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8408
diff
changeset
|
496 |
% (args, exc)) |
79534887943e
[datafeed] fix/finish cleanup started by auc in 8393:c25b96ae4f8a: parser.process prototytpe is (url, raise_on_error=False). Drop partialcommit argument which were never specified
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8408
diff
changeset
|
497 |
rollback() |
79534887943e
[datafeed] fix/finish cleanup started by auc in 8393:c25b96ae4f8a: parser.process prototytpe is (url, raise_on_error=False). Drop partialcommit argument which were never specified
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8408
diff
changeset
|
498 |
error = True |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
499 |
return error |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
500 |
|
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
501 |
def parse(self, url): |
9823
258d2f9f7d39
[datafeed parser] factor out retrieve_url method from DataFeedXMLParser.parse
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9822
diff
changeset
|
502 |
stream = self.retrieve_url(url) |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
503 |
return self.parse_etree(etree.parse(stream).getroot()) |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
504 |
|
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
505 |
def parse_etree(self, document): |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
506 |
return [(document,)] |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
507 |
|
10089
6346f53c85f1
[datafeed] Add a raise_on_error parameter to DataFeedSource.extid2entity
Denis Laxalde <denis.laxalde@logilab.fr>
parents:
9990
diff
changeset
|
508 |
def process_item(self, *args, **kwargs): |
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
509 |
raise NotImplementedError |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
510 |
|
8187
981f6e487788
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8069
diff
changeset
|
511 |
def is_deleted(self, extid, etype, eid): |
10551
1182f5f16a3d
[datafeed] fix typo in DataFeedXMLParser.is_deleted (closes #5729755)
David Douard <david.douard@logilab.fr>
parents:
10532
diff
changeset
|
512 |
if extid.startswith('file://'): |
1182f5f16a3d
[datafeed] fix typo in DataFeedXMLParser.is_deleted (closes #5729755)
David Douard <david.douard@logilab.fr>
parents:
10532
diff
changeset
|
513 |
return exists(extid[7:]) |
10516
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
514 |
|
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
515 |
url = self.normalize_url(extid) |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
516 |
# first, try to use cwclientlib if it's available and if the |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
517 |
# url matches a configuration entry in ~/.config/cwclientlibrc |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
518 |
try: |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
519 |
from cwclientlib import cwproxy_for |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
520 |
# parse url again since it has been normalized |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
521 |
cnx = cwproxy_for(url) |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
522 |
cnx.timeout = self.source.http_timeout |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
523 |
self.source.info('Using cwclientlib for checking %s' % url) |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
524 |
return cnx.get(url).status_code == 404 |
10532
2cc74c688eb9
[datafeed] also catch EnvironmentError when trying to load the cwclientlib config file
David Douard <david.douard@logilab.fr>
parents:
10522
diff
changeset
|
525 |
except (ImportError, ValueError, EnvironmentError) as exc: |
10516
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
526 |
# ImportError: not available |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
527 |
# ValueError: no config entry found |
10532
2cc74c688eb9
[datafeed] also catch EnvironmentError when trying to load the cwclientlib config file
David Douard <david.douard@logilab.fr>
parents:
10522
diff
changeset
|
528 |
# EnvironmentError: no cwclientlib config file found |
10516
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
529 |
self.source.debug(str(exc)) |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
530 |
|
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
531 |
# no chance with cwclientlib, fall back to former implementation |
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
532 |
if urlparse.urlparse(url).scheme in ('http', 'https'): |
8187
981f6e487788
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8069
diff
changeset
|
533 |
try: |
10516
4c59409220b6
[datafeed] allow to use cwclientlib for datafeed's queries (closes #5456849)
David Douard <david.douard@logilab.fr>
parents:
10143
diff
changeset
|
534 |
_OPENER.open(url, timeout=self.source.http_timeout) |
8695
358d8bed9626
[toward-py3k] rewrite to "except AnException as exc:" (part of #2711624)
Nicolas Chauvat <nicolas.chauvat@logilab.fr>
parents:
8694
diff
changeset
|
535 |
except urllib2.HTTPError as ex: |
8187
981f6e487788
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8069
diff
changeset
|
536 |
if ex.code == 404: |
981f6e487788
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8069
diff
changeset
|
537 |
return True |
981f6e487788
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8069
diff
changeset
|
538 |
return False |
981f6e487788
[datafeed] set delete-entities=yes is now safer, by checking each entity actually seems deleted. Closes #2165381
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
8069
diff
changeset
|
539 |
|
9824
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
540 |
|
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
541 |
class URLLibResponseAdapter(object): |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
542 |
"""Thin wrapper to be used to fake a value returned by urllib2.urlopen""" |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
543 |
def __init__(self, stream, url, code=200): |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
544 |
self._stream = stream |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
545 |
self._url = url |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
546 |
self.code = code |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
547 |
|
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
548 |
def read(self, *args): |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
549 |
return self._stream.read(*args) |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
550 |
|
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
551 |
def geturl(self): |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
552 |
return self._url |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
553 |
|
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
554 |
def getcode(self): |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
555 |
return self.code |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
556 |
|
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
557 |
def info(self): |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
558 |
from mimetools import Message |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
559 |
return Message(StringIO.StringIO()) |
30183ecf5c61
[datafeed parser] fix retrieve_url to always return urllib2.urlopen compatible output
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
9823
diff
changeset
|
560 |
|
7378
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
561 |
# use a cookie enabled opener to use session cookie if any |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
562 |
_OPENER = urllib2.build_opener() |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
563 |
try: |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
564 |
from logilab.common import urllib2ext |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
565 |
_OPENER.add_handler(urllib2ext.HTTPGssapiAuthHandler()) |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
566 |
except ImportError: # python-kerberos not available |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
567 |
pass |
86a1ae289f05
[datafeed] extract a generic DataFeedXMLParser from CWEntityXMLParser
Sylvain Thénault <sylvain.thenault@logilab.fr>
parents:
7351
diff
changeset
|
568 |
_OPENER.add_handler(urllib2.HTTPCookieProcessor(CookieJar())) |