|
1 """a query preprocessor to handle quick search shortcuts for cubicweb |
|
2 |
|
3 |
|
4 :organization: Logilab |
|
5 :copyright: 2001-2008 LOGILAB S.A. (Paris, FRANCE), all rights reserved. |
|
6 :contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr |
|
7 """ |
|
8 |
|
9 __docformat__ = "restructuredtext en" |
|
10 |
|
11 import re |
|
12 from logging import getLogger |
|
13 |
|
14 from rql import RQLSyntaxError, BadRQLQuery, parse |
|
15 from rql.nodes import Relation |
|
16 |
|
17 from cubicweb import Unauthorized |
|
18 from cubicweb.common.appobject import Component, SingletonComponent |
|
19 |
|
20 LOGGER = getLogger('cubicweb.magicsearch') |
|
21 |
|
def _get_approriate_translation(translations_found, eschema):
    """Pick a translation that is a subject relation of the given entity type.

    :param translations_found: set of candidate relation types
    :param eschema: entity schema; only `subject_relations()` is used

    :return: one of the candidates that is also a subject relation of
      `eschema` (there should be only one), or None if there is none
    """
    # keep only candidates that this kind of entity actually has
    candidates = translations_found & set(eschema.subject_relations())
    if not candidates:
        return None
    return candidates.pop()
|
32 |
|
33 |
|
def translate_rql_tree(rqlst, translations, schema):
    """Try to translate each relation in the RQL syntax tree, in place.

    For `is` relations the right-hand value (an entity type name) is
    translated and the type of the left-hand variable is memorized. For
    other relations the relation type is replaced when it has exactly one
    translation; relations with several candidate translations are handed
    to `resolve_ambiguities` once the whole tree has been walked.

    :type rqlst: `rql.stmts.Statement`
    :param rqlst: the RQL syntax tree

    :type translations: dict
    :param translations: the reverted l10n dict

    :type schema: `cubicweb.schema.Schema`
    :param schema: the application's schema
    """
    # map: variable name -> entity type
    vartypes = {}
    # map: relation node -> (variable name, available translations)
    ambiguous_nodes = {}
    for relation in rqlst.get_nodes(Relation):
        rtype = relation.r_type
        lhs, rhs = relation.get_variable_parts()
        if rtype == 'is':
            try:
                etype = translations[rhs.value]
            except KeyError:
                # no translation found, leave the entity type as is
                etype = rhs.value
            else:
                rhs.value = etype
            # memorize the variable's type for later disambiguation
            vartypes[lhs.name] = etype
            continue
        try:
            translation_set = translations[rtype]
        except KeyError:
            # no translation found, leave the relation type as is
            continue
        if len(translation_set) == 1:
            # only one possible translation, no ambiguity: unpack the single
            # element (works the same on every Python version, no second
            # dictionary lookup needed)
            (relation.r_type,) = translation_set
        else:
            # more than one possible translation: resolve it later
            ambiguous_nodes[relation] = (lhs.name, translation_set)
    if ambiguous_nodes:
        resolve_ambiguities(vartypes, ambiguous_nodes, schema)
|
79 |
|
80 |
|
def resolve_ambiguities(var_types, ambiguous_nodes, schema):
    """Try to resolve the remaining translation ambiguities, in place.

    An ambiguity occurs when two different strings are localized with the
    same string. A simple example:

    - 'name' in a company context will be localized as 'nom' in French
    - but 'surname' will also be localized as 'nom'

    Relations whose variable has no known type, or for which no candidate
    fits the variable's entity type, are left untouched.

    :type var_types: dict
    :param var_types: a map : var_name / var_type

    :type ambiguous_nodes: dict
    :param ambiguous_nodes: a map : relation_node / (var_name, available_translations)

    :type schema: `cubicweb.schema.Schema`
    :param schema: the application's schema
    """
    for relation, (var_name, candidates) in ambiguous_nodes.items():
        if var_name not in var_types:
            # type of the variable is unknown, nothing to rely on
            continue
        # use the schema of the variable's entity type to choose a candidate
        eschema = schema.eschema(var_types[var_name])
        chosen = _get_approriate_translation(candidates, eschema)
        if chosen is not None:
            relation.r_type = chosen
|
110 |
|
111 |
|
112 |
|
# matches an optional unquoted prefix followed by a quoted ('...' or "...") part
QUOTED_SRE = re.compile(r'(.*?)(["\'])(.+?)\2')

# cache of the translation maps built by `trmap`, keyed by language
TRANSLATION_MAPS = {}


def trmap(config, schema, lang):
    """Return the reverted translation map for `lang`, building it on first use.

    The map associates:

    - capitalized entity type names (translated and original) to the entity
      type name
    - lower-cased translated relation names, and original relation names, to
      the set of relation types they may stand for

    The result is cached in `TRANSLATION_MAPS` under `lang` only.

    :param config: object whose `translations` maps a language to a
      translation callable
    :param schema: object providing `entities()` and `relations()`
    :param lang: the language to build the map for
    """
    if lang in TRANSLATION_MAPS:
        return TRANSLATION_MAPS[lang]
    assert lang in config.translations, '%s %s' % (lang, config.translations)
    translate = config.translations[lang]
    langmap = {}
    for etype in map(str, schema.entities()):
        langmap[translate(etype).capitalize()] = etype
        langmap[etype.capitalize()] = etype
    for rtype in map(str, schema.relations()):
        # several relation types may share a key, hence the sets
        for key in (translate(rtype).lower(), rtype):
            langmap.setdefault(key, set()).add(rtype)
    TRANSLATION_MAPS[lang] = langmap
    return langmap
|
133 |
|
134 |
|
class BaseQueryProcessor(Component):
    """Abstract base class of the magic search query processors.

    Concrete classes implement `preprocess_query`, which returns the tuple
    of arguments given to `req.execute`.
    """
    __abstract__ = True
    id = 'magicsearch_processor'
    # set something if you want explicit component search facility for the
    # component
    name = None

    def process_query(self, uquery, req):
        """Preprocess `uquery`, execute it and return the execution result.

        The connection is rolled back whether the execution succeeds or not.
        """
        execute_args = self.preprocess_query(uquery, req)
        try:
            result = req.execute(*execute_args)
        finally:
            # rollback necessary to avoid leaving the connection in a bad state
            req.cnx.rollback()
        return result

    def preprocess_query(self, uquery, req):
        """Return the arguments for `req.execute`; to be overridden."""
        raise NotImplementedError()
|
152 |
|
153 |
|
154 |
|
155 |
|
class DoNotPreprocess(BaseQueryProcessor):
    """Processor handing the query over untouched.

    It returns the raw query and should be placed in first position of the
    chain.
    """
    name = 'rql'
    priority = 0

    def preprocess_query(self, uquery, req):
        """Return the raw query as the only execution argument."""
        return (uquery,)
|
164 |
|
165 |
|
class QueryTranslator(BaseQueryProcessor):
    """Parse the query as RQL and translate localized entity names and
    attributes into their schema names.
    """
    priority = 2

    def preprocess_query(self, uquery, req):
        """Return a 1-tuple holding the translated RQL query.

        If `uquery` can't be parsed as RQL, it is returned unchanged.
        """
        try:
            rqlst = parse(uquery, print_errors=False)
        except (RQLSyntaxError, BadRQLQuery):
            # not RQL: nothing to translate
            return uquery,
        schema = self.vreg.schema
        # rql syntax tree will be modified in place if necessary
        translate_rql_tree(rqlst, trmap(self.config, schema, req.lang), schema)
        return rqlst.as_string(),
|
180 |
|
181 |
|
class QSPreProcessor(BaseQueryProcessor):
    """Quick search preprocessor

    preprocessing query in shortcut form to their RQL form
    """
    priority = 4

    def preprocess_query(self, uquery, req):
        """Return the `req.execute` arguments for the shortcut query `uquery`.

        The query is first handled as if it had a quoted part; when that
        fails with BadRQLQuery, it is split on whitespace and dispatched
        according to its number of words. The request is stored on
        `self.req` for the helper methods.
        """
        self.req = req
        try:
            # process as if there was a quoted part
            return self._quoted_words_query(uquery)
        except BadRQLQuery:
            # no usable quoted part
            words = uquery.split()
            if len(words) == 1:
                return self._one_word_query(*words)
            if len(words) == 2:
                return self._two_words_query(*words)
            if len(words) == 3:
                return self._three_words_query(*words)
            return self._multiple_words_query(words)

    def _get_entity_type(self, word):
        """check if the given word is matching an entity type, return it if
        it's the case or raise BadRQLQuery if not
        """
        etype = word.capitalize()
        try:
            return trmap(self.config, self.vreg.schema, self.req.lang)[etype]
        except KeyError:
            raise BadRQLQuery('%s is not a valid entity name' % etype)

    def _get_attribute_name(self, word, eschema):
        """check if the given word is matching an attribute of the given
        entity type, return the matching relation type if found or raise
        BadRQLQuery if not
        """
        rtype = word.lower()
        # find the candidate relation types for this (possibly localized) name
        translations = trmap(self.config, self.vreg.schema, self.req.lang)
        try:
            translations = translations[rtype]
        except KeyError:
            raise BadRQLQuery('%s is not a valid attribute for %s entity type'
                              % (word, eschema))
        # keep the candidate which fits the entity type
        rtype = _get_approriate_translation(translations, eschema)
        if rtype is None:
            raise BadRQLQuery('%s is not a valid attribute for %s entity type'
                              % (word, eschema))
        return rtype

    def _one_word_query(self, word):
        """Specific process for one word query (case (1) of preprocess_rql)
        """
        try:
            eid = int(word)
        except ValueError:
            # not an integer: the word has to be an entity type
            etype = self._get_entity_type(word)
            return '%s %s' % (etype, etype[0]),
        # if this is an integer, then directly go to eid
        return 'Any X WHERE X eid %(x)s', {'x': eid}, 'x'

    def _complete_rql(self, searchstr, etype, rtype=None, var=None, searchattr=None):
        """Return the restriction '<var> <attr> [LIKE ]%(text)s' for a search.

        `has_text` is used as attribute unless `searchstr` contains '%' and
        either `searchattr` is given or there is a single possible entity
        type, in which case `searchattr` (or that type's main attribute) is
        used with the LIKE operator. `var` defaults to the first letter of
        `etype`.
        """
        searchop = ''
        if '%' in searchstr:
            if rtype:
                possible_etypes = self.schema.rschema(rtype).objects(etype)
            else:
                possible_etypes = [self.schema.eschema(etype)]
            if searchattr or len(possible_etypes) == 1:
                searchattr = searchattr or possible_etypes[0].main_attribute()
                searchop = 'LIKE '
        searchattr = searchattr or 'has_text'
        if var is None:
            var = etype[0]
        return '%s %s %s%%(text)s' % (var, searchattr, searchop)

    def _two_words_query(self, word1, word2):
        """Specific process for two words query (case (2) of preprocess_rql)
        """
        etype = self._get_entity_type(word1)
        # a single upper-case letter is taken as a variable name, so this is
        # already a valid RQL query (e.g. "Person X")
        if len(word2) == 1 and word2.isupper():
            return '%s %s' % (etype, word2),
        # else, suppose it's a shortcut like : Person Smith
        rql = '%s %s WHERE %s' % (etype, etype[0], self._complete_rql(word2, etype))
        return rql, {'text': word2}

    def _three_words_query(self, word1, word2, word3):
        """Specific process for three words query (case (3) of preprocess_rql)
        """
        etype = self._get_entity_type(word1)
        eschema = self.schema.eschema(etype)
        rtype = self._get_attribute_name(word2, eschema)
        # expand shortcut if rtype is a non final relation
        if not self.schema.rschema(rtype).is_final():
            return self._expand_shortcut(etype, rtype, word3)
        # the search operator is chosen by _complete_rql
        rql = '%s %s WHERE %s' % (etype, etype[0],
                                  self._complete_rql(word3, etype, searchattr=rtype))
        return rql, {'text': word3}

    def _multiple_words_query(self, words):
        """specific process for more than 3 words query"""
        return ' '.join(words),

    def _expand_shortcut(self, etype, rtype, searchstr):
        """Expands shortcut queries on a non final relation to use has_text or
        the main attribute (according to possible entity type) if '%' is used
        in the search word

        For instance, with etype 'Person', rtype 'worksAt' and search string
        'IBM', returns
        'Person P WHERE P worksAt P1, P1 has_text %(text)s'
        together with the arguments {'text': 'IBM'}
        """
        mainvar = etype[0]
        searchvar = mainvar + '1'
        rql = '%s %s WHERE %s %s %s, %s' % (etype, mainvar,  # Person P
                                            mainvar, rtype, searchvar,  # P worksAt P1
                                            self._complete_rql(searchstr, etype,
                                                               rtype=rtype, var=searchvar))
        return rql, {'text': searchstr}

    def _quoted_words_query(self, ori_rql):
        """Specific process when there's a "quoted" part

        Raises BadRQLQuery when there is no quoted part or when the number
        of words before it is neither 1 nor 2.
        """
        m = QUOTED_SRE.match(ori_rql)
        # if there's no quoted part, then no special pre-processing to do
        if m is None:
            raise BadRQLQuery("unable to handle request %r" % ori_rql)
        left_words = m.group(1).split()
        quoted_part = m.group(3)
        # Case (1) : Company "My own company"
        if len(left_words) == 1:
            try:
                word1 = left_words[0]
                return self._two_words_query(word1, quoted_part)
            except BadRQLQuery:
                raise BadRQLQuery("unable to handle request %r" % ori_rql)
        # Case (2) : Company name "My own company";
        elif len(left_words) == 2:
            word1, word2 = left_words
            return self._three_words_query(word1, word2, quoted_part)
        raise BadRQLQuery("unable to handle request %r" % ori_rql)
|
339 |
|
340 |
|
341 |
|
class FullTextTranslator(BaseQueryProcessor):
    """Processor turning the whole query into a `has_text` search."""
    priority = 10
    name = 'text'

    def preprocess_query(self, uquery, req):
        """suppose it's a plain text query"""
        rql = 'Any X WHERE X has_text %(text)s'
        rql_args = {'text': uquery}
        return rql, rql_args
|
349 |
|
350 |
|
351 |
|
class MagicSearchComponent(SingletonComponent):
    """Component running a query through the registered
    'magicsearch_processor' components.
    """
    id = 'magicsearch'

    def __init__(self, req, rset=None):
        """Instantiate every registered processor.

        Processors are kept sorted by priority in `self.processors`; those
        having a name are also indexed by lower-cased name in `self.by_name`.
        """
        super(MagicSearchComponent, self).__init__(req, rset)
        processors = []
        self.by_name = {}
        for processorcls in self.vreg.registry_objects('components',
                                                       'magicsearch_processor'):
            # instantiation needed
            processor = processorcls()
            processors.append(processor)
            if processor.name is not None:
                # check the very key used for the index, so names differing
                # only by case are detected as duplicates as well
                key = processor.name.lower()
                assert key not in self.by_name
                self.by_name[key] = processor
        self.processors = sorted(processors, key=lambda x: x.priority)

    def process_query(self, uquery, req):
        """Execute `uquery` and return the result of the processor handling it.

        If the query looks like '<name>: <query>' where <name> is the name of
        a registered processor, that processor is used and its exceptions
        propagate. Otherwise each processor is tried in priority order until
        one succeeds.

        :raise Unauthorized: if no processor succeeded and one of them raised
          Unauthorized (the last one raised is re-raised)
        :raise BadRQLQuery: if no processor succeeded
        """
        assert isinstance(uquery, unicode)
        try:
            # ValueError: no ':' in the query; KeyError: unknown processor name
            procname, query = uquery.split(':', 1)
            proc = self.by_name[procname.strip().lower()]
        except (ValueError, KeyError):
            pass
        else:
            # explicit processor: let exception propagate
            return proc.process_query(query.strip(), req)
        # use processor chain
        unauthorized = None
        for proc in self.processors:
            try:
                return proc.process_query(uquery, req)
            # FIXME : we don't want to catch any exception type here !
            except (RQLSyntaxError, BadRQLQuery):
                continue
            except Unauthorized as ex:
                unauthorized = ex
                continue
            except Exception as ex:
                LOGGER.debug('%s: %s', ex.__class__.__name__, ex)
                continue
        if unauthorized is not None:
            raise unauthorized
        raise BadRQLQuery(req._('sorry, the server is unable to handle this query'))
|
395 |
|
396 |
|
# NlpTools and Pyro are optional: the NLP processor is only defined when both
# can be imported and RQLClient('ivan') doesn't raise NamingError
try:
    from NlpTools.rqltools.client import RQLClient
except ImportError:
    LOGGER.info('could not import RQLClient (NlpTools)')
else:
    try:
        from Pyro.errors import NamingError
    except ImportError:
        LOGGER.warning("pyro is not installed, can't try to connect to nlp server")
    else:
        try:
            class NLPProcessor(BaseQueryProcessor):
                """Processor asking the NLP agent for a translation of the
                query.
                """
                priority = 8
                nlp_agent = RQLClient('ivan')

                def preprocess_query(self, uquery, req):
                    """Return the agent's translation of `uquery`.

                    When the agent gives no answer or fails, the error is
                    logged and the query is returned unchanged.
                    """
                    try:
                        answer = self.nlp_agent.get_translation(uquery)
                        if not answer:
                            raise BadRQLQuery(uquery)
                    except Exception as ex:
                        LOGGER.exception(str(ex))
                        return (uquery,)
                    return (answer,)

        except NamingError:  # NlpTools available but no server registered
            LOGGER.warning('could not find any RQLServer object named "ivan"')
|
424 |