cubicweb/server/rqlannotation.py
author Nicolas Chauvat <nicolas.chauvat@logilab.fr>
Wed, 13 Mar 2019 10:26:15 +0100
changeset 12885 194e9ae964ed
parent 12567 26744ad37953
permissions -rw-r--r--
[server.rqlannotation] rename SQLGenAnnotator to RQLAnnotator This class is in charge of annotating the RQL syntax tree with various bits of information, like (in)variance, use of full-text-index, etc. It is a needed step before the generation of SQL, but does not touch SQL directly. Hence RQLAnnotator seems a better name.

# copyright 2003-2016 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of CubicWeb.
#
# CubicWeb is free software: you can redistribute it and/or modify it under the
# terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
"""Functions to add additional annotations on a rql syntax tree to ease later
code generation.
"""

from rql import BadRQLQuery
from rql.nodes import Relation, VariableRef, Constant, Variable, Or
from rql.utils import common_parent


class CantSelectPrincipal(Exception):
    """Signal that no relation is suitable as 'principal' for a variable.

    Raised by `_select_principal` once every candidate relation has been
    rejected.
    """


def _select_principal(scope, relations, _sort=lambda x: x):
    """Select the relation which will represent an invariant variable.

    Given a collection of rqlst relations, pick the one whose extremity will
    be used in place of the variable's type table.

    :param scope: scope node of the variable; a relation living in this very
      scope is preferred over any other
    :param relations: sized collection of ``(relation, role)`` pairs, where
      role is either 'subject' or 'object'
    :param _sort: hook applied to each collection before iterating on it; it
      is only there so tests can get a predictable order
    :return: the selected relation
    :raise CantSelectPrincipal: if no candidate relation is acceptable
    """
    ored_rels = set()
    diffscope_rels = set()
    for rel, role in _sort(relations):
        # note: only eid and has_text among all final relations may be there
        if rel.r_type in ('eid', 'identity'):
            continue
        # when there is some alternative, don't use an optional relation
        # through its optional extremity
        if rel.optional is not None and len(relations) > 1:
            if role == 'subject' and rel.optional == 'right':
                continue
            if role == 'object' and rel.optional == 'left':
                continue
        if rel.ored(traverse_scope=True):
            # branches of an OR are considered later, only if nothing better
            # has been found
            ored_rels.add(rel)
        elif rel.scope is scope:
            return rel
        elif not rel.neged(traverse_scope=True):
            diffscope_rels.add(rel)
    if len(ored_rels) > 1:
        # discard relations which are alternatives of the same Or node
        ored_rels_copy = tuple(ored_rels)
        for rel1 in ored_rels_copy:
            for rel2 in ored_rels_copy:
                if rel1 is rel2:
                    continue
                if isinstance(common_parent(rel1, rel2), Or):
                    ored_rels.discard(rel1)
                    ored_rels.discard(rel2)
    for rel in _sort(ored_rels):
        if rel.scope is scope:
            return rel
        diffscope_rels.add(rel)
    # if DISTINCT query, can use variable from a different scope as principal
    # since introduced duplicates will be removed
    if scope.stmt.distinct and diffscope_rels:
        return next(iter(_sort(diffscope_rels)))
    # XXX could use a relation from a different scope if it can't generate
    # duplicates, so we should have to check cardinality
    raise CantSelectPrincipal()


def _select_main_var(relations):
    """Pick, among `relations`, the main relation for the rhs variable.

    Only non-negated '=' / 'IS' relations whose right hand side is a variable
    reference are candidates. A non-optional candidate whose scope is its
    statement is returned right away; otherwise the last non-optional
    candidate wins, then the first optional one.

    :raise BadRQLQuery: if there is no candidate at all
    """
    fallback = None
    optional_candidates = []
    # deterministic iteration order, for test predictability
    ordered = sorted(relations, key=lambda r: (r.children[0].name, r.r_type))
    for candidate in ordered:
        is_equality = candidate.operator() in ('=', 'IS')
        if not (is_equality
                and isinstance(candidate.children[1].children[0], VariableRef)
                and not candidate.neged(strict=True)):
            # only equality relation with a variable as rhs may be principal
            continue
        if candidate.optional:
            optional_candidates.append(candidate)
        elif candidate.scope is candidate.stmt:
            return candidate
        else:
            fallback = candidate
    if fallback is not None:
        return fallback
    if optional_candidates:
        return optional_candidates[0]
    raise BadRQLQuery('unable to find principal in %s' % ', '.join(
        r.as_string() for r in relations))


def set_qdata(getrschema, union, noinvariant):
    """Recursively set the ``_q_invariant`` flag on variables of the tree.

    For each select of `union` (sub-queries being handled first), a variable
    is flagged as invariant when its ``stinfo['invariant']`` is true and it
    doesn't belong to `noinvariant`.

    `getrschema` is only handed over to the recursive calls.
    """
    for select in union.children:
        for subquery in select.with_:
            set_qdata(getrschema, subquery.query, noinvariant)
        for var in select.defined_vars.values():
            var._q_invariant = bool(var.stinfo['invariant']) and var not in noinvariant


class RQLAnnotator(object):
    """Annotate the RQL abstract syntax tree to inform the SQL generation.

    Annotations are stored in the ``stinfo`` dictionary of each variable
    ('invariant' and 'principal' keys) and as a ``has_text_query`` attribute
    on the annotated tree.
    """

    def __init__(self, schema):
        """Keep a reference to `schema` and pre-compute the set of non final
        entity type names (`nfdomain`).
        """
        self.schema = schema
        self.nfdomain = frozenset(eschema.type for eschema in schema.entities()
                                  if not eschema.final)

    def annotate(self, rqlst):
        """add information to the rql syntax tree to help sources to do their
        job (read sql generation)

        a variable is tagged as invariant if:
        * it is a non final variable
        * it is not used as lhs in any final or inlined relation
        * there is no type restriction on this variable (either explicit in the
          syntax tree or because a solution for this variable has been removed
          due to security filtering)

        The `has_text_query` attribute of `rqlst` is set as a side effect.
        """
        # assert rqlst.TYPE == 'select', rqlst
        rqlst.has_text_query = self._annotate_union(rqlst)

    def _annotate_union(self, union):
        """Annotate every select of `union` and return True if at least one of
        them has been reported as a full-text query.
        """
        has_text_query = False
        # no short-circuit here: every select has to be annotated
        for select in union.children:
            if self._annotate_select(select):
                has_text_query = True
        return has_text_query

    def _annotate_select(self, rqlst):
        """Annotate variables defined by the select node `rqlst`.

        Sub-queries are annotated first, then ``stinfo['invariant']`` (and
        ``stinfo['principal']`` in most cases) is set for each variable defined
        in `rqlst`.

        Return True if a variable or a column alias of this select, or one of
        its sub-queries, has a non-empty 'ftirels' entry in its stinfo.
        """
        has_text_query = False
        for subquery in rqlst.with_:
            if self._annotate_union(subquery.query):
                has_text_query = True
        getrschema = self.schema.rschema
        for var in rqlst.defined_vars.values():
            stinfo = var.stinfo
            if stinfo.get('ftirels'):
                has_text_query = True
            if stinfo['attrvar']:
                # attribute variable: never invariant, its principal is chosen
                # among relations where it appears as rhs
                stinfo['invariant'] = False
                stinfo['principal'] = _select_main_var(stinfo['rhsrelations'])
                continue
            if stinfo['typerel'] is None:
                # those particular queries should be executed using the system
                # entities table unless there is some type restriction
                if not stinfo['relations']:
                    # Any X, Any MAX(X)...
                    stinfo['invariant'] = True
                    stinfo['principal'] = None
                    continue
                if (any(rel for rel in stinfo['relations']
                        if rel.r_type == 'eid' and rel.operator() != '=')
                        and not any(r for r in var.stinfo['relations'] - var.stinfo['rhsrelations']
                                    if r.r_type != 'eid'
                                    and (getrschema(r.r_type).inlined
                                         or getrschema(r.r_type).final))):
                    # Any X WHERE X eid > 2
                    stinfo['invariant'] = True
                    stinfo['principal'] = None
                    continue
            if stinfo['selected'] and var.valuable_references() == 1 + bool(stinfo['constnode']):
                # "Any X", "Any X, Y WHERE X attr Y"
                # (note that 'principal' is left untouched in this case)
                stinfo['invariant'] = False
                continue
            # candidate (relation, role) pairs for the principal relation
            joins = set()
            invariant = False
            # inspect each reference to the variable: any `break` below means
            # the variable can't be invariant, and skips the `else` clause
            for ref in var.references():
                rel = ref.relation()
                if rel is None or rel.is_types_restriction():
                    continue
                lhs, rhs = rel.get_parts()
                onlhs = ref is lhs
                role = 'subject' if onlhs else 'object'
                if rel.r_type == 'eid':
                    if not (onlhs and len(stinfo['relations']) > 1):
                        break
                    if not stinfo['constnode']:
                        joins.add((rel, role))
                    continue
                elif rel.r_type == 'identity':
                    # identity can't be used as principal, so check other relation are used
                    # XXX explain rhs.operator == '='
                    if rhs.operator != '=' or len(stinfo['relations']) <= 1:
                        break
                    joins.add((rel, role))
                    continue
                rschema = getrschema(rel.r_type)
                if rel.optional:
                    if rel in stinfo.get('optrelations', ()):
                        # optional variable can't be invariant if this is the lhs
                        # variable of an inlined relation
                        if rel not in stinfo['rhsrelations'] and rschema.inlined:
                            break
                    # variable used as main variable of an optional relation can't
                    # be invariant, unless we can use some other relation as
                    # reference for the outer join
                    elif not stinfo['constnode']:
                        break
                    elif len(stinfo['relations']) == 2:
                        # look at the variable on the other side of the relation
                        if onlhs:
                            ostinfo = rhs.children[0].variable.stinfo
                        else:
                            ostinfo = lhs.variable.stinfo
                        if not (ostinfo.get('optcomparisons')
                                or any(orel for orel in ostinfo['relations']
                                       if orel.optional and orel is not rel)):
                            break
                if rschema.final or (onlhs and rschema.inlined):
                    if rschema.type != 'has_text':
                        # need join anyway if the variable appears in a final or
                        # inlined relation
                        break
                    joins.add((rel, role))
                    continue
                if not stinfo['constnode']:
                    if rschema.inlined and rel.neged(strict=True):
                        # if relation is inlined, can't be invariant if that
                        # variable is used anywhere else.
                        # see 'Any P WHERE NOT N ecrit_par P, N eid 512':
                        # sql for 'NOT N ecrit_par P' is 'N.ecrit_par is NULL' so P
                        # can use N.ecrit_par as principal
                        if (stinfo['selected'] or len(stinfo['relations']) > 1):
                            break
                joins.add((rel, role))
            else:
                # if there is at least one ambiguous relation and no other to
                # restrict types, can't be invariant since we need to filter out
                # other types
                if not self.is_ambiguous(var):
                    invariant = True
            stinfo['invariant'] = invariant
            if invariant and joins:
                # remember rqlst/solutions analyze information
                # we have to select a kind of "main" relation which will "extrajoins"
                # the other
                # priority should be given to relation which are not in inner queries
                # (eg exists)
                try:
                    stinfo['principal'] = principal = _select_principal(var.scope, joins)
                    if getrschema(principal.r_type).inlined:
                        # the scope of the lhs variable must be equal or outer to the
                        # rhs variable's scope (since it's retrieved from lhs's table)
                        sstinfo = principal.children[0].variable.stinfo
                        sstinfo['scope'] = common_parent(sstinfo['scope'], stinfo['scope']).scope
                except CantSelectPrincipal:
                    # no usable principal: fall back to a non-invariant variable
                    stinfo['invariant'] = False
        # see unittest_rqlannotation. test_has_text_security_cache_bug
        # XXX probably more to do, but yet that work without more...
        for col_alias in rqlst.aliases.values():
            if col_alias.stinfo.get('ftirels'):
                has_text_query = True
        return has_text_query

    def is_ambiguous(self, var):
        """Return True if `var` is considered ambiguous.

        The `IsAmbData` computation is done once per statement and cached as
        the `_deamb_data` attribute of ``var.stmt``.
        """
        # ignore has_text relation when we know it will be used as principal.
        # This is expected by the rql2sql generator which will use the `entities`
        # table to filter out by type if necessary, This optimisation is very
        # interesting in multi-sources cases, as it may avoid a costly query
        # on sources to get all entities of a given type to achieve this, while
        # we have all the necessary information.
        root = var.stmt.root  # Union node
        # rel.scope -> Select or Exists node, so add .parent to get Union from
        # Select node
        rels = [rel for rel in var.stinfo['relations'] if rel.scope.parent is root]
        if len(rels) == 1 and rels[0].r_type == 'has_text':
            return False
        try:
            data = var.stmt._deamb_data
        except AttributeError:
            # first call for this statement: compute and cache the data
            data = var.stmt._deamb_data = IsAmbData(self.schema, self.nfdomain)
            data.compute(var.stmt)
        return data.is_ambiguous(var)


class IsAmbData(object):
    """Working data used to find out which variables of a statement are
    ambiguous.

    Every variable gets a domain of possible types (`varsols`). A variable
    starting with the whole non final domain is ambiguous until relation
    constraints restrict its domain down to its ``stinfo['possibletypes']``.
    """

    def __init__(self, schema, nfdomain):
        """
        :param schema: the schema, providing `rschema` and `eschema` accessors
        :param nfdomain: names of the non final entity types
        """
        self.schema = schema
        # shortcuts
        self.rschema = schema.rschema
        self.eschema = schema.eschema
        # domain for non final variables
        self.nfdomain = nfdomain
        # {var: possible solutions set}
        self.varsols = {}
        # set of ambiguous variables
        self.ambiguousvars = set()
        # remember if a variable has been deambiguified by another to avoid
        # doing the opposite
        self.deambification_map = {}
        # not invariant variables (access to final.inlined relation)
        self.not_invariants = set()
        # {var: relations which may be used to deambiguify it}, reset by
        # `compute`; set here so `set_rel_constraint` never hits a missing
        # attribute
        self.maydeambrels = {}

    def is_ambiguous(self, var):
        """Return True if `var` is currently known as ambiguous."""
        return var in self.ambiguousvars

    def restrict(self, var, restricted_domain):
        """Intersect the domain of `var` with `restricted_domain`, and stop
        considering `var` as ambiguous once its domain equals its possible
        types.
        """
        self.varsols[var] &= restricted_domain
        if var in self.ambiguousvars and self.varsols[var] == var.stinfo['possibletypes']:
            self.ambiguousvars.remove(var)

    def compute(self, rqlst):
        """Compute domains and ambiguous variables for the statement `rqlst`."""
        # set domains for each variable
        for varname, var in rqlst.defined_vars.items():
            if (var.stinfo['uidrel'] is not None
                    or self.eschema(rqlst.solutions[0][varname]).final):
                ptypes = var.stinfo['possibletypes']
            else:
                ptypes = set(self.nfdomain)
                self.ambiguousvars.add(var)
            self.varsols[var] = ptypes
        if not self.ambiguousvars:
            return
        # apply relation restriction
        self.maydeambrels = maydeambrels = {}
        for rel in rqlst.iget_nodes(Relation):
            if rel.r_type == 'eid' or rel.is_types_restriction():
                continue
            lhs, rhs = rel.get_variable_parts()
            if isinstance(lhs, VariableRef) or isinstance(rhs, VariableRef):
                rschema = self.rschema(rel.r_type)
                if rschema.inlined or rschema.final:
                    self.not_invariants.add(lhs.variable)
                self.set_rel_constraint(lhs, rel, rschema.subjects)
                self.set_rel_constraint(rhs, rel, rschema.objects)
        # try to deambiguify more variables by considering other variables'type
        modified = True
        while modified and self.ambiguousvars:
            modified = False
            for var in self.ambiguousvars.copy():
                # NOTE(review): besides a missing `maydeambrels[var]` entry,
                # this also silences any KeyError raised from within
                # `deambiguifying_relation` -- confirm this is intended
                try:
                    for rel in (var.stinfo['relations'] & maydeambrels[var]):
                        if self.deambiguifying_relation(var, rel):
                            modified = True
                            break
                except KeyError:
                    # no relation to deambiguify
                    continue

    def _debug_print(self):
        """Print variable domains and ambiguous variables (debugging helper)."""
        print('varsols', {x: sorted(str(v) for v in values)
                          for x, values in self.varsols.items()})
        print('ambiguous vars', sorted(self.ambiguousvars))

    def set_rel_constraint(self, term, rel, etypes_func):
        """Restrict the domain of the variable referenced by `term` to types
        returned by `etypes_func`, and remember `rel` as a relation which may
        deambiguify it.

        Nothing is done unless `term` references an ambiguous variable which
        either has a single relation, shares its scope with `rel`, or is used
        in an 'identity' relation.
        """
        if isinstance(term, VariableRef) and self.is_ambiguous(term.variable):
            var = term.variable
            if (len(var.stinfo['relations']) == 1
                    or rel.scope is var.scope
                    or rel.r_type == 'identity'):
                self.restrict(var, frozenset(etypes_func()))
                self.maydeambrels.setdefault(var, set()).add(rel)

    def deambiguifying_relation(self, var, rel):
        """Try to deambiguify `var` using what is known about the other end of
        `rel`.

        Return True if the relation has been used to restrict `var`'s domain,
        else False.
        """
        lhs, rhs = rel.get_variable_parts()
        onlhs = var is getattr(lhs, 'variable', None)
        # conditional expression rather than `onlhs and rhs or lhs`, which
        # would select lhs whenever rhs happens to be falsy
        other = rhs if onlhs else lhs
        otheretypes = None
        deambiguifier = None
        # XXX isinstance(other.variable, Variable) to skip column alias
        if isinstance(other, VariableRef) and isinstance(other.variable, Variable):
            deambiguifier = other.variable
            if var is not self.deambification_map.get(deambiguifier):
                if var.stinfo['typerel'] is None:
                    otheretypes = deambiguifier.stinfo['possibletypes']
                elif not self.is_ambiguous(deambiguifier):
                    otheretypes = self.varsols[deambiguifier]
                elif deambiguifier in self.not_invariants:
                    # we know variable won't be invariant, try to use
                    # it to deambiguify the current variable
                    otheretypes = self.varsols[deambiguifier]
            if deambiguifier.stinfo['typerel'] is None:
                # if deambiguifier has no type restriction using 'is',
                # don't record it
                deambiguifier = None
        elif isinstance(other, Constant) and other.uidtype:
            otheretypes = (other.uidtype,)
            deambiguifier = None
        if otheretypes is None:
            return False
        # to restrict, we must check that for all type in othertypes,
        # possible types on the other end of the relation are matching
        # variable's possible types
        rschema = self.rschema(rel.r_type)
        if onlhs:
            rtypefunc = rschema.subjects
        else:
            rtypefunc = rschema.objects
        for otheretype in otheretypes:
            reltypes = frozenset(rtypefunc(otheretype))
            if var.stinfo['possibletypes'] != reltypes:
                return False
        self.restrict(var, var.stinfo['possibletypes'])
        self.deambification_map[var] = deambiguifier
        return True