diff -r 000000000000 -r b97547f5f1fa web/views/magicsearch.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/web/views/magicsearch.py Wed Nov 05 15:52:50 2008 +0100 @@ -0,0 +1,424 @@ +"""a query preprocesser to handle quick search shortcuts for cubicweb + + +:organization: Logilab +:copyright: 2001-2008 LOGILAB S.A. (Paris, FRANCE), all rights reserved. +:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr +""" + +__docformat__ = "restructuredtext en" + +import re +from logging import getLogger + +from rql import RQLSyntaxError, BadRQLQuery, parse +from rql.nodes import Relation + +from cubicweb import Unauthorized +from cubicweb.common.appobject import Component, SingletonComponent + +LOGGER = getLogger('cubicweb.magicsearch') + +def _get_approriate_translation(translations_found, eschema): + """return the first (should be the only one) possible translation according + to the given entity type + """ + # get the list of all attributes / relations for this kind of entity + existing_relations = set(eschema.subject_relations()) + consistent_translations = translations_found & existing_relations + if len(consistent_translations) == 0: + return None + return consistent_translations.pop() + + +def translate_rql_tree(rqlst, translations, schema): + """Try to translate each relation in the RQL syntax tree + + :type rqlst: `rql.stmts.Statement` + :param rqlst: the RQL syntax tree + + :type translations: dict + :param translations: the reverted l10n dict + + :type schema: `cubicweb.schema.Schema` + :param schema: the application's schema + """ + # var_types is used as a map : var_name / var_type + vartypes = {} + # ambiguous_nodes is used as a map : relation_node / (var_name, available_translations) + ambiguous_nodes = {} + # For each relation node, check if it's a localized relation name + # If it's a localized name, then use the original relation name, else + # keep the existing relation name + for relation in rqlst.get_nodes(Relation): + rtype = relation.r_type + lhs, rhs = relation.get_variable_parts() + if rtype == 'is': + try: + etype = translations[rhs.value] + rhs.value = etype + except KeyError: + # If no translation found, leave the entity type as is + etype = rhs.value + # Memorize variable's type + vartypes[lhs.name] = etype + else: + try: + translation_set = translations[rtype] + except KeyError: + pass # If no translation found, leave the relation type as is + else: + # Only one possible translation, no ambiguity + if len(translation_set) == 1: + relation.r_type = iter(translations[rtype]).next() + # More than 1 possible translation => resolve it later + else: + ambiguous_nodes[relation] = (lhs.name, translation_set) + if ambiguous_nodes: + resolve_ambiguities(vartypes, ambiguous_nodes, schema) + + +def resolve_ambiguities(var_types, ambiguous_nodes, schema): + """Tries to resolve remaining ambiguities for translation + /!\ An ambiguity is when two different string can be localized with + the same string + A simple example: + - 'name' in a company context will be localized as 'nom' in French + - but ... 'surname' will also be localized as 'nom' + + :type var_types: dict + :param var_types: a map : var_name / var_type + + :type ambiguous_nodes: dict + :param ambiguous_nodes: a map : relation_node / (var_name, available_translations) + + :type schema: `cubicweb.schema.Schema` + :param schema: the application's schema + """ + # Now, try to resolve ambiguous translations + for relation, (var_name, translations_found) in ambiguous_nodes.items(): + try: + vartype = var_types[var_name] + except KeyError: + continue + # Get schema for this entity type + eschema = schema.eschema(vartype) + rtype = _get_approriate_translation(translations_found, eschema) + if rtype is None: + continue + relation.r_type = rtype + + + +QUOTED_SRE = re.compile(r'(.*?)(["\'])(.+?)\2') + +TRANSLATION_MAPS = {} +def trmap(config, schema, lang): + try: + return TRANSLATION_MAPS[lang] + except KeyError: + assert lang in config.translations, '%s %s' % (lang, config.translations) + tr = config.translations[lang] + langmap = {} + for etype in schema.entities(): + etype = str(etype) + langmap[tr(etype).capitalize()] = etype + langmap[etype.capitalize()] = etype + for rtype in schema.relations(): + rtype = str(rtype) + langmap.setdefault(tr(rtype).lower(), set()).add(rtype) + langmap.setdefault(rtype, set()).add(rtype) + TRANSLATION_MAPS[lang] = langmap + return langmap + + +class BaseQueryProcessor(Component): + __abstract__ = True + id = 'magicsearch_processor' + # set something if you want explicit component search facility for the + # component + name = None + + def process_query(self, uquery, req): + args = self.preprocess_query(uquery, req) + try: + return req.execute(*args) + finally: + # rollback necessary to avoid leaving the connection in a bad state + req.cnx.rollback() + + def preprocess_query(self, uquery, req): + raise NotImplementedError() + + + + +class DoNotPreprocess(BaseQueryProcessor): + """this one returns the raw query and should be placed in first position + of the chain + """ + name = 'rql' + priority = 0 + def preprocess_query(self, uquery, req): + return uquery, + + +class QueryTranslator(BaseQueryProcessor): + """ parses through rql and translates into schema language entity names + and attributes + """ + priority = 2 + def preprocess_query(self, uquery, req): + try: + rqlst = parse(uquery, print_errors=False) + except (RQLSyntaxError, BadRQLQuery), err: + return uquery, + schema = self.vreg.schema + # rql syntax tree will be modified in place if necessary + translate_rql_tree(rqlst, trmap(self.config, schema, req.lang), schema) + return rqlst.as_string(), + + +class QSPreProcessor(BaseQueryProcessor): + """Quick search preprocessor + + preprocessing query in shortcut form to their RQL form + """ + priority = 4 + + def preprocess_query(self, uquery, req): + """""" + args = None + self.req = req + try: + # Process as if there was a quoted part + args = self._quoted_words_query(uquery) + ## No quoted part + except BadRQLQuery: + words = uquery.split() + if len(words) == 1: + args = self._one_word_query(*words) + elif len(words) == 2: + args = self._two_words_query(*words) + elif len(words) == 3: + args = self._three_words_query(*words) + else: + args = self._multiple_words_query(words) + return args + + def _get_entity_type(self, word): + """check if the given word is matching an entity type, return it if + it's the case or raise BadRQLQuery if not + """ + etype = word.capitalize() + try: + return trmap(self.config, self.vreg.schema, self.req.lang)[etype] + except KeyError: + raise BadRQLQuery('%s is not a valid entity name' % etype) + + def _get_attribute_name(self, word, eschema): + """check if the given word is matching an attribute of the given entity type, + return it normalized if found or return it untransformed else + """ + """Returns the attributes's name as stored in the DB""" + # Need to convert from unicode to string (could be whatever) + rtype = word.lower() + # Find the entity name as stored in the DB + translations = trmap(self.config, self.vreg.schema, self.req.lang) + try: + translations = translations[rtype] + except KeyError: + raise BadRQLQuery('%s is not a valid attribute for %s entity type' + % (word, eschema)) + rtype = _get_approriate_translation(translations, eschema) + if rtype is None: + raise BadRQLQuery('%s is not a valid attribute for %s entity type' + % (word, eschema)) + return rtype + + def _one_word_query(self, word): + """Specific process for one word query (case (1) of preprocess_rql) + """ + # if this is an integer, then directly go to eid + try: + eid = int(word) + return 'Any X WHERE X eid %(x)s', {'x': eid}, 'x' + except ValueError: + etype = self._get_entity_type(word) + return '%s %s' % (etype, etype[0]), + + def _complete_rql(self, searchstr, etype, rtype=None, var=None, searchattr=None): + searchop = '' + if '%' in searchstr: + if rtype: + possible_etypes = self.schema.rschema(rtype).objects(etype) + else: + possible_etypes = [self.schema.eschema(etype)] + if searchattr or len(possible_etypes) == 1: + searchattr = searchattr or possible_etypes[0].main_attribute() + searchop = 'LIKE ' + searchattr = searchattr or 'has_text' + if var is None: + var = etype[0] + return '%s %s %s%%(text)s' % (var, searchattr, searchop) + + def _two_words_query(self, word1, word2): + """Specific process for two words query (case (2) of preprocess_rql) + """ + etype = self._get_entity_type(word1) + # this is a valid RQL query : ("Person X", or "Person TMP1") + if len(word2) == 1 and word2.isupper(): + return '%s %s' % (etype, word2), + # else, suppose it's a shortcut like : Person Smith + rql = '%s %s WHERE %s' % (etype, etype[0], self._complete_rql(word2, etype)) + return rql, {'text': word2} + + def _three_words_query(self, word1, word2, word3): + """Specific process for three words query (case (3) of preprocess_rql) + """ + etype = self._get_entity_type(word1) + eschema = self.schema.eschema(etype) + rtype = self._get_attribute_name(word2, eschema) + # expand shortcut if rtype is a non final relation + if not self.schema.rschema(rtype).is_final(): + return self._expand_shortcut(etype, rtype, word3) + if '%' in word3: + searchop = 'LIKE ' + else: + searchop = '' + rql = '%s %s WHERE %s' % (etype, etype[0], + self._complete_rql(word3, etype, searchattr=rtype)) + return rql, {'text': word3} + + def _multiple_words_query(self, words): + """specific process for more than 3 words query""" + return ' '.join(words), + + + def _expand_shortcut(self, etype, rtype, searchstr): + """Expands shortcut queries on a non final relation to use has_text or + the main attribute (according to possible entity type) if '%' is used in the + search word + + Transforms : 'person worksat IBM' into + 'Personne P WHERE P worksAt C, C has_text "IBM"' + """ + # check out all possilbe entity types for the relation represented + # by 'rtype' + mainvar = etype[0] + searchvar = mainvar + '1' + rql = '%s %s WHERE %s %s %s, %s' % (etype, mainvar, # Person P + mainvar, rtype, searchvar, # P worksAt C + self._complete_rql(searchstr, etype, + rtype=rtype, var=searchvar)) + return rql, {'text': searchstr} + + + def _quoted_words_query(self, ori_rql): + """Specific process when there's a "quoted" part + """ + m = QUOTED_SRE.match(ori_rql) + # if there's no quoted part, then no special pre-processing to do + if m is None: + raise BadRQLQuery("unable to handle request %r" % ori_rql) + left_words = m.group(1).split() + quoted_part = m.group(3) + # Case (1) : Company "My own company" + if len(left_words) == 1: + try: + word1 = left_words[0] + return self._two_words_query(word1, quoted_part) + except BadRQLQuery, error: + raise BadRQLQuery("unable to handle request %r" % ori_rql) + # Case (2) : Company name "My own company"; + elif len(left_words) == 2: + word1, word2 = left_words + return self._three_words_query(word1, word2, quoted_part) + # return ori_rql + raise BadRQLQuery("unable to handle request %r" % ori_rql) + + + +class FullTextTranslator(BaseQueryProcessor): + priority = 10 + name = 'text' + + def preprocess_query(self, uquery, req): + """suppose it's a plain text query""" + return 'Any X WHERE X has_text %(text)s', {'text': uquery} + + + +class MagicSearchComponent(SingletonComponent): + id = 'magicsearch' + def __init__(self, req, rset=None): + super(MagicSearchComponent, self).__init__(req, rset) + processors = [] + self.by_name = {} + for processorcls in self.vreg.registry_objects('components', + 'magicsearch_processor'): + # instantiation needed + processor = processorcls() + processors.append(processor) + if processor.name is not None: + assert not processor.name in self.by_name + self.by_name[processor.name.lower()] = processor + self.processors = sorted(processors, key=lambda x: x.priority) + + def process_query(self, uquery, req): + assert isinstance(uquery, unicode) + try: + procname, query = uquery.split(':', 1) + proc = self.by_name[procname.strip().lower()] + uquery = query.strip() + except: + # use processor chain + unauthorized = None + for proc in self.processors: + try: + return proc.process_query(uquery, req) + # FIXME : we don't want to catch any exception type here ! + except (RQLSyntaxError, BadRQLQuery): + pass + except Unauthorized, ex: + unauthorized = ex + continue + except Exception, ex: + LOGGER.debug('%s: %s', ex.__class__.__name__, ex) + continue + if unauthorized: + raise unauthorized + else: + # let exception propagate + return proc.process_query(uquery, req) + raise BadRQLQuery(req._('sorry, the server is unable to handle this query')) + + +# Do not make a strong dependency on NlpTools +try: + from NlpTools.rqltools.client import RQLClient +except ImportError: + LOGGER.info('could not import RQLClient (NlpTools)') +else: + try: + from Pyro.errors import NamingError + except ImportError: + LOGGER.warning("pyro is not installed, can't try to connect to nlp server") + else: + try: + class NLPProcessor(BaseQueryProcessor): + priority = 8 + nlp_agent = RQLClient('ivan') + def preprocess_query(self, uquery, req): + try: + answer = self.nlp_agent.get_translation(uquery) + if not answer: + raise BadRQLQuery(uquery) + return answer or uquery, + except Exception, ex: + LOGGER.exception(str(ex)) + return uquery, + + except NamingError: # NlpTools available but no server registered + LOGGER.warning('could not find any RQLServer object named "ivan"') +