web/views/magicsearch.py
changeset 0 b97547f5f1fa
child 661 4f61eb8a96b7
equal deleted inserted replaced
-1:000000000000 0:b97547f5f1fa
       
     1 """a query preprocesser to handle quick search shortcuts for cubicweb
       
     2 
       
     3 
       
     4 :organization: Logilab
       
     5 :copyright: 2001-2008 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
       
     6 :contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
       
     7 """
       
     8 
       
     9 __docformat__ = "restructuredtext en"
       
    10 
       
    11 import re
       
    12 from logging import getLogger
       
    13 
       
    14 from rql import RQLSyntaxError, BadRQLQuery, parse
       
    15 from rql.nodes import Relation
       
    16 
       
    17 from cubicweb import Unauthorized
       
    18 from cubicweb.common.appobject import Component, SingletonComponent
       
    19 
       
# module-level logger, named 'cubicweb.magicsearch', used by the code of this
# module to report import problems and unexpected processor errors
LOGGER = getLogger('cubicweb.magicsearch')
       
    21 
       
    22 def _get_approriate_translation(translations_found, eschema):
       
    23     """return the first (should be the only one) possible translation according
       
    24     to the given entity type
       
    25     """
       
    26     # get the list of all attributes / relations for this kind of entity
       
    27     existing_relations = set(eschema.subject_relations())
       
    28     consistent_translations = translations_found & existing_relations
       
    29     if len(consistent_translations) == 0:
       
    30         return None
       
    31     return consistent_translations.pop()
       
    32 
       
    33 
       
    34 def translate_rql_tree(rqlst, translations, schema):
       
    35     """Try to translate each relation in the RQL syntax tree
       
    36 
       
    37     :type rqlst: `rql.stmts.Statement`
       
    38     :param rqlst: the RQL syntax tree
       
    39 
       
    40     :type translations: dict
       
    41     :param translations: the reverted l10n dict
       
    42 
       
    43     :type schema: `cubicweb.schema.Schema`
       
    44     :param schema: the application's schema    
       
    45     """
       
    46     # var_types is used as a map : var_name / var_type
       
    47     vartypes = {}
       
    48     # ambiguous_nodes is used as a map : relation_node / (var_name, available_translations)
       
    49     ambiguous_nodes = {}
       
    50     # For each relation node, check if it's a localized relation name
       
    51     # If it's a localized name, then use the original relation name, else
       
    52     # keep the existing relation name
       
    53     for relation in rqlst.get_nodes(Relation):
       
    54         rtype = relation.r_type
       
    55         lhs, rhs = relation.get_variable_parts()
       
    56         if rtype == 'is':
       
    57             try:
       
    58                 etype = translations[rhs.value]
       
    59                 rhs.value = etype
       
    60             except KeyError:
       
    61                 # If no translation found, leave the entity type as is
       
    62                 etype = rhs.value
       
    63             # Memorize variable's type
       
    64             vartypes[lhs.name] = etype
       
    65         else:
       
    66             try:
       
    67                 translation_set = translations[rtype]
       
    68             except KeyError:
       
    69                 pass # If no translation found, leave the relation type as is
       
    70             else:
       
    71                 # Only one possible translation, no ambiguity
       
    72                 if len(translation_set) == 1:
       
    73                     relation.r_type = iter(translations[rtype]).next()
       
    74                 # More than 1 possible translation => resolve it later
       
    75                 else:
       
    76                     ambiguous_nodes[relation] = (lhs.name, translation_set)
       
    77     if ambiguous_nodes:
       
    78         resolve_ambiguities(vartypes, ambiguous_nodes, schema)
       
    79 
       
    80 
       
    81 def resolve_ambiguities(var_types, ambiguous_nodes, schema):
       
    82     """Tries to resolve remaining ambiguities for translation
       
    83     /!\ An ambiguity is when two different string can be localized with
       
    84         the same string
       
    85     A simple example:
       
    86       - 'name' in a company context will be localized as 'nom' in French
       
    87       - but ... 'surname' will also be localized as 'nom'
       
    88 
       
    89     :type var_types: dict
       
    90     :param var_types: a map : var_name / var_type
       
    91 
       
    92     :type ambiguous_nodes: dict
       
    93     :param ambiguous_nodes: a map : relation_node / (var_name, available_translations)
       
    94 
       
    95     :type schema: `cubicweb.schema.Schema`
       
    96     :param schema: the application's schema
       
    97     """
       
    98     # Now, try to resolve ambiguous translations
       
    99     for relation, (var_name, translations_found) in ambiguous_nodes.items():
       
   100         try:
       
   101             vartype = var_types[var_name]
       
   102         except KeyError:
       
   103             continue
       
   104         # Get schema for this entity type
       
   105         eschema = schema.eschema(vartype)
       
   106         rtype = _get_approriate_translation(translations_found, eschema)
       
   107         if rtype is None:
       
   108             continue
       
   109         relation.r_type = rtype
       
   110     
       
   111 
       
   112 
       
# matches a query containing a quoted part: group 1 is the (non-greedy) text
# found before the first quote, group 2 the quote character itself (" or ') and
# group 3 the non-empty text up to the next occurrence of that same quote
QUOTED_SRE = re.compile(r'(.*?)(["\'])(.+?)\2')
       
   114 
       
   115 TRANSLATION_MAPS = {}
       
   116 def trmap(config, schema, lang):
       
   117     try:
       
   118         return TRANSLATION_MAPS[lang]
       
   119     except KeyError:
       
   120         assert lang in config.translations, '%s %s' % (lang, config.translations)
       
   121         tr = config.translations[lang]
       
   122         langmap = {}
       
   123         for etype in schema.entities():
       
   124             etype = str(etype)
       
   125             langmap[tr(etype).capitalize()] = etype
       
   126             langmap[etype.capitalize()] = etype
       
   127         for rtype in schema.relations():
       
   128             rtype = str(rtype)
       
   129             langmap.setdefault(tr(rtype).lower(), set()).add(rtype)
       
   130             langmap.setdefault(rtype, set()).add(rtype)
       
   131         TRANSLATION_MAPS[lang] = langmap
       
   132         return langmap
       
   133 
       
   134 
       
   135 class BaseQueryProcessor(Component):
       
   136     __abstract__ = True
       
   137     id = 'magicsearch_processor'
       
   138     # set something if you want explicit component search facility for the
       
   139     # component
       
   140     name = None
       
   141 
       
   142     def process_query(self, uquery, req):
       
   143         args = self.preprocess_query(uquery, req)
       
   144         try:
       
   145             return req.execute(*args)
       
   146         finally:
       
   147             # rollback necessary to avoid leaving the connection in a bad state
       
   148             req.cnx.rollback() 
       
   149 
       
   150     def preprocess_query(self, uquery, req):
       
   151         raise NotImplementedError()
       
   152 
       
   153 
       
   154 
       
   155 
       
   156 class DoNotPreprocess(BaseQueryProcessor):
       
   157     """this one returns the raw query and should be placed in first position
       
   158     of the chain
       
   159     """
       
   160     name = 'rql'
       
   161     priority = 0
       
   162     def preprocess_query(self, uquery, req):
       
   163         return uquery,
       
   164     
       
   165 
       
   166 class QueryTranslator(BaseQueryProcessor):
       
   167     """ parses through rql and translates into schema language entity names 
       
   168     and attributes
       
   169     """
       
   170     priority = 2
       
   171     def preprocess_query(self, uquery, req):
       
   172         try:
       
   173             rqlst = parse(uquery, print_errors=False)
       
   174         except (RQLSyntaxError, BadRQLQuery), err:
       
   175             return uquery,
       
   176         schema = self.vreg.schema
       
   177         # rql syntax tree will be modified in place if necessary
       
   178         translate_rql_tree(rqlst, trmap(self.config, schema, req.lang), schema)
       
   179         return rqlst.as_string(),
       
   180 
       
   181 
       
   182 class QSPreProcessor(BaseQueryProcessor):
       
   183     """Quick search preprocessor
       
   184 
       
   185     preprocessing query in shortcut form to their RQL form
       
   186     """
       
   187     priority = 4
       
   188     
       
   189     def preprocess_query(self, uquery, req):
       
   190         """"""
       
   191         args = None
       
   192         self.req = req
       
   193         try:
       
   194             # Process as if there was a quoted part
       
   195             args = self._quoted_words_query(uquery)
       
   196         ## No quoted part  
       
   197         except BadRQLQuery:
       
   198             words = uquery.split()
       
   199             if len(words) == 1:
       
   200                 args = self._one_word_query(*words)
       
   201             elif len(words) == 2:
       
   202                 args = self._two_words_query(*words)
       
   203             elif len(words) == 3:
       
   204                 args = self._three_words_query(*words)
       
   205             else:
       
   206                 args = self._multiple_words_query(words)
       
   207         return args
       
   208     
       
   209     def _get_entity_type(self, word):
       
   210         """check if the given word is matching an entity type, return it if
       
   211         it's the case or raise BadRQLQuery if not
       
   212         """
       
   213         etype = word.capitalize()
       
   214         try:
       
   215             return trmap(self.config, self.vreg.schema, self.req.lang)[etype]
       
   216         except KeyError:
       
   217             raise BadRQLQuery('%s is not a valid entity name' % etype)        
       
   218 
       
   219     def _get_attribute_name(self, word, eschema):
       
   220         """check if the given word is matching an attribute of the given entity type,
       
   221         return it normalized if found or return it untransformed else
       
   222         """
       
   223         """Returns the attributes's name as stored in the DB"""
       
   224         # Need to convert from unicode to string (could be whatever)
       
   225         rtype = word.lower()
       
   226         # Find the entity name as stored in the DB
       
   227         translations = trmap(self.config, self.vreg.schema, self.req.lang)
       
   228         try:
       
   229             translations = translations[rtype]
       
   230         except KeyError:
       
   231             raise BadRQLQuery('%s is not a valid attribute for %s entity type'
       
   232                               % (word, eschema))
       
   233         rtype = _get_approriate_translation(translations, eschema)
       
   234         if rtype is None:
       
   235             raise BadRQLQuery('%s is not a valid attribute for %s entity type'
       
   236                               % (word, eschema))
       
   237         return rtype
       
   238 
       
   239     def _one_word_query(self, word):
       
   240         """Specific process for one word query (case (1) of preprocess_rql)
       
   241         """
       
   242         # if this is an integer, then directly go to eid
       
   243         try:
       
   244             eid = int(word)
       
   245             return 'Any X WHERE X eid %(x)s', {'x': eid}, 'x'
       
   246         except ValueError:
       
   247             etype = self._get_entity_type(word)
       
   248             return '%s %s' % (etype, etype[0]),
       
   249 
       
   250     def _complete_rql(self, searchstr, etype, rtype=None, var=None, searchattr=None):
       
   251         searchop = ''
       
   252         if '%' in searchstr:
       
   253             if rtype:
       
   254                 possible_etypes = self.schema.rschema(rtype).objects(etype)
       
   255             else:
       
   256                 possible_etypes = [self.schema.eschema(etype)]
       
   257             if searchattr or len(possible_etypes) == 1:
       
   258                 searchattr = searchattr or possible_etypes[0].main_attribute()
       
   259                 searchop = 'LIKE '
       
   260         searchattr = searchattr or 'has_text'
       
   261         if var is None:
       
   262             var = etype[0]
       
   263         return '%s %s %s%%(text)s' % (var, searchattr, searchop)
       
   264         
       
   265     def _two_words_query(self, word1, word2):
       
   266         """Specific process for two words query (case (2) of preprocess_rql)
       
   267         """
       
   268         etype = self._get_entity_type(word1)
       
   269         # this is a valid RQL query : ("Person X", or "Person TMP1")
       
   270         if len(word2) == 1 and word2.isupper():
       
   271             return '%s %s' % (etype, word2),
       
   272         # else, suppose it's a shortcut like : Person Smith
       
   273         rql = '%s %s WHERE %s' % (etype, etype[0], self._complete_rql(word2, etype))
       
   274         return rql, {'text': word2}
       
   275            
       
   276     def _three_words_query(self, word1, word2, word3):
       
   277         """Specific process for three words query (case (3) of preprocess_rql)
       
   278         """
       
   279         etype = self._get_entity_type(word1)
       
   280         eschema = self.schema.eschema(etype)
       
   281         rtype = self._get_attribute_name(word2, eschema)
       
   282         # expand shortcut if rtype is a non final relation
       
   283         if not self.schema.rschema(rtype).is_final():
       
   284             return self._expand_shortcut(etype, rtype, word3)
       
   285         if '%' in word3:
       
   286             searchop = 'LIKE '
       
   287         else:
       
   288             searchop = ''
       
   289         rql = '%s %s WHERE %s' % (etype, etype[0],
       
   290                                   self._complete_rql(word3, etype, searchattr=rtype))
       
   291         return rql, {'text': word3}
       
   292 
       
   293     def _multiple_words_query(self, words):
       
   294         """specific process for more than 3 words query"""
       
   295         return ' '.join(words),
       
   296 
       
   297 
       
   298     def _expand_shortcut(self, etype, rtype, searchstr):
       
   299         """Expands shortcut queries on a non final relation to use has_text or
       
   300         the main attribute (according to possible entity type) if '%' is used in the
       
   301         search word
       
   302 
       
   303         Transforms : 'person worksat IBM' into
       
   304         'Personne P WHERE P worksAt C, C has_text "IBM"'
       
   305         """
       
   306         # check out all possilbe entity types for the relation represented
       
   307         # by 'rtype'
       
   308         mainvar = etype[0]
       
   309         searchvar = mainvar  + '1'
       
   310         rql =  '%s %s WHERE %s %s %s, %s' % (etype, mainvar,  # Person P
       
   311                                              mainvar, rtype, searchvar, # P worksAt C
       
   312                                              self._complete_rql(searchstr, etype,
       
   313                                                                 rtype=rtype, var=searchvar))
       
   314         return rql, {'text': searchstr}
       
   315 
       
   316 
       
   317     def _quoted_words_query(self, ori_rql):
       
   318         """Specific process when there's a "quoted" part
       
   319         """
       
   320         m = QUOTED_SRE.match(ori_rql)
       
   321         # if there's no quoted part, then no special pre-processing to do
       
   322         if m is None:
       
   323             raise BadRQLQuery("unable to handle request %r" % ori_rql)
       
   324         left_words = m.group(1).split()
       
   325         quoted_part = m.group(3)
       
   326         # Case (1) : Company "My own company"
       
   327         if len(left_words) == 1:
       
   328             try:
       
   329                 word1 = left_words[0]
       
   330                 return self._two_words_query(word1, quoted_part)
       
   331             except BadRQLQuery, error:
       
   332                 raise BadRQLQuery("unable to handle request %r" % ori_rql)
       
   333         # Case (2) : Company name "My own company";
       
   334         elif len(left_words) == 2:
       
   335             word1, word2 = left_words
       
   336             return self._three_words_query(word1, word2, quoted_part)
       
   337             # return ori_rql
       
   338         raise BadRQLQuery("unable to handle request %r" % ori_rql)
       
   339     
       
   340 
       
   341  
       
   342 class FullTextTranslator(BaseQueryProcessor):
       
   343     priority = 10
       
   344     name = 'text'
       
   345     
       
   346     def preprocess_query(self, uquery, req):
       
   347         """suppose it's a plain text query"""
       
   348         return 'Any X WHERE X has_text %(text)s', {'text': uquery}
       
   349 
       
   350 
       
   351 
       
   352 class MagicSearchComponent(SingletonComponent):
       
   353     id  = 'magicsearch'
       
   354     def __init__(self, req, rset=None):
       
   355         super(MagicSearchComponent, self).__init__(req, rset)
       
   356         processors = []
       
   357         self.by_name = {}
       
   358         for processorcls in self.vreg.registry_objects('components',
       
   359                                                        'magicsearch_processor'):
       
   360             # instantiation needed
       
   361             processor = processorcls()
       
   362             processors.append(processor)
       
   363             if processor.name is not None:
       
   364                 assert not processor.name in self.by_name
       
   365                 self.by_name[processor.name.lower()] = processor
       
   366         self.processors = sorted(processors, key=lambda x: x.priority)
       
   367 
       
   368     def process_query(self, uquery, req):
       
   369         assert isinstance(uquery, unicode)
       
   370         try:
       
   371             procname, query = uquery.split(':', 1)
       
   372             proc = self.by_name[procname.strip().lower()]
       
   373             uquery = query.strip()
       
   374         except:
       
   375             # use processor chain
       
   376             unauthorized = None
       
   377             for proc in self.processors:
       
   378                 try:
       
   379                     return proc.process_query(uquery, req)
       
   380                 # FIXME : we don't want to catch any exception type here !
       
   381                 except (RQLSyntaxError, BadRQLQuery):
       
   382                     pass
       
   383                 except Unauthorized, ex:
       
   384                     unauthorized = ex
       
   385                     continue
       
   386                 except Exception, ex:
       
   387                     LOGGER.debug('%s: %s', ex.__class__.__name__, ex)
       
   388                     continue
       
   389             if unauthorized:
       
   390                 raise unauthorized
       
   391         else:
       
   392             # let exception propagate
       
   393             return proc.process_query(uquery, req)
       
   394         raise BadRQLQuery(req._('sorry, the server is unable to handle this query'))
       
   395 
       
   396 
       
# Do not make a strong dependency on NlpTools: the NLPProcessor class is only
# defined when both NlpTools and Pyro can be imported and the RQLClient can be
# created without NamingError
try:
    from NlpTools.rqltools.client import RQLClient
except ImportError:
    LOGGER.info('could not import RQLClient (NlpTools)')
else:
    try:
        from Pyro.errors import NamingError
    except ImportError:
        LOGGER.warning("pyro is not installed, can't try to connect to nlp server")
    else:
        try:
            class NLPProcessor(BaseQueryProcessor):
                """Processor asking a NlpTools RQLClient for a translation of
                the query.
                """
                priority = 8
                # created once, when the class body is executed: a NamingError
                # raised here is handled by the enclosing try
                nlp_agent = RQLClient('ivan')
                def preprocess_query(self, uquery, req):
                    """Return the translation given by the agent, or the
                    query unchanged if anything went wrong.
                    """
                    try:
                        answer = self.nlp_agent.get_translation(uquery)
                        # NOTE(review): this BadRQLQuery looks like it is
                        # caught right away by the `except Exception` below
                        # (assuming BadRQLQuery derives from Exception), so an
                        # empty answer ends up logged and the raw query
                        # returned -- confirm this is intended
                        if not answer:
                            raise BadRQLQuery(uquery)
                        # answer is necessarily true here, `or uquery` is
                        # never used
                        return answer or uquery,
                    except Exception, ex:
                        # best effort: log the error and fall back on the
                        # untouched query
                        LOGGER.exception(str(ex))
                        return uquery,

        except NamingError: # NlpTools available but no server registered
            LOGGER.warning('could not find any RQLServer object named "ivan"')
       
   424