server/checkintegrity.py
author Sandrine Ribeau <sandrine.ribeau@logilab.fr>
Wed, 05 Nov 2008 11:22:05 -0800
changeset 1 88d637274072
parent 0 b97547f5f1fa
child 380 06e7f2932afe
permissions -rw-r--r--
Add modules to the list and to the index so that their docstrings are integrated into the documentation. Needs to be completed. Sphinx requires us to explicitly list which modules we want to include in the documentation.

"""Check integrity of a CubicWeb repository. Hum actually only the system database
is checked.

:organization: Logilab
:copyright: 2001-2008 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
"""
__docformat__ = "restructuredtext en"

import sys

from mx.DateTime import now
from logilab.common.shellutils import ProgressBar

def has_eid(sqlcursor, eid, eids):
    """return true if the eid is a valid eid

    :param sqlcursor: cursor used to run the lookup queries
    :param eid: the entity identifier to check
    :param eids: dictionary used as a cache, mapping already checked eids to
      a boolean; it is updated in place with the result for `eid`

    An eid is considered valid when it has a row in the `entities` table and
    either belongs to a source other than 'system', or has at least one row
    in the table named after its entity type.

    Errors raised by the cursor are propagated instead of being interpreted
    as "no such entity": callers delete data for eids reported as invalid.
    """
    if eid in eids:
        return eids[eid]
    sqlcursor.execute('SELECT type, source FROM entities WHERE eid=%s' % eid)
    row = sqlcursor.fetchone()
    if row is None:
        # no row in the entities table for this eid
        eids[eid] = False
        return False
    etype, source = row
    if source and source != 'system':
        # XXX what to do...
        # entity from another source: its table is not looked up here
        eids[eid] = True
        return True
    sqlcursor.execute('SELECT * FROM %s WHERE eid=%s' % (etype, eid))
    result = sqlcursor.fetchall()
    if not result:
        eids[eid] = False
        return False
    if len(result) > 1:
        # duplicates are reported but the eid is still considered valid
        msg = '  More than one entity with eid %s exists in source !'
        print >> sys.stderr, msg % eid
        print >> sys.stderr, '  WARNING : Unable to fix this, do it yourself !'
    eids[eid] = True
    return True

# XXX move to yams?
def etype_fti_containers(eschema, _done=None):
    """yield the entity schemas acting as full-text containers for `eschema`

    If `eschema.fulltext_containers()` is empty, `eschema` itself is yielded.
    Otherwise each (rschema, target) pair is followed to the schemas on the
    other end of the relation, and the search goes on recursively from there.

    :param eschema: the entity schema to start from
    :param _done: set of schemas already visited, shared by the recursive
      calls so that each schema is explored at most once
    """
    seen = set() if _done is None else _done
    seen.add(eschema)
    links = tuple(eschema.fulltext_containers())
    if not links:
        # nothing contains this type: it is its own container
        yield eschema
        return
    for rschema, role in links:
        if role == 'object':
            candidates = rschema.objects(eschema)
        else:
            candidates = rschema.subjects(eschema)
        for candidate in candidates:
            if candidate in seen:
                continue
            seen.add(candidate)
            for found in etype_fti_containers(candidate, seen):
                yield found
    
def reindex_entities(schema, session):
    """reindex all entities in the repository"""
    # deactivate modification_date hook since we don't want them
    # to be updated due to the reindexation
    from cubicweb.server.hooks import (setmtime_before_update_entity,
                                       uniquecstrcheck_before_modification)
    from cubicweb.server.repository import FTIndexEntityOp
    repo = session.repo
    repo.hm.unregister_hook(setmtime_before_update_entity,
                            'before_update_entity', '')
    repo.hm.unregister_hook(uniquecstrcheck_before_modification,
                            'before_update_entity', '')
    etypes = set()
    for eschema in schema.entities():
        if eschema.is_final():
            continue
        indexable_attrs = tuple(eschema.indexable_attributes()) # generator
        if not indexable_attrs:
            continue
        for container in etype_fti_containers(eschema):
            etypes.add(container)
    print 'Reindexing entities of type %s' % \
          ', '.join(sorted(str(e) for e in etypes))
    pb = ProgressBar(len(etypes) + 1)
    # first monkey patch Entity.check to disable validation
    from cubicweb.common.entity import Entity
    _check = Entity.check
    Entity.check = lambda self, creation=False: True
    # clear fti table first
    session.system_sql('DELETE FROM %s' % session.repo.system_source.dbhelper.fti_table)
    pb.update()
    # reindex entities by generating rql queries which set all indexable
    # attribute to their current value
    for eschema in etypes:
        for entity in session.execute('Any X WHERE X is %s' % eschema).entities():
            FTIndexEntityOp(session, entity=entity)
        pb.update()
    # restore Entity.check
    Entity.check = _check

    
def check_schema(schema=None, session=None, eids=None, fix=1):
    """check serialized schema

    Report relations having more than one constraint of a type which is
    expected to appear at most once per relation definition.

    The signature follows the other `check_*` functions so that this check
    can be run through `check()`, which calls every check function as
    `check_func(schema, session, eids, fix=fix)`. The former call style
    `check_schema(session)` is still supported.

    :param schema: unused by this check (or the session, when called with a
      single positional argument)
    :param session: the session used to run the RQL query
    :param eids: unused by this check
    :param fix: unused by this check, nothing is corrected
    """
    if session is None:
        # former call style check_schema(session): the single positional
        # argument is the session
        session = schema
    # the single-argument print(...) form gives the same output whether
    # print is a statement or a function
    print('Checking serialized schema')
    unique_constraints = ('SizeConstraint', 'FormatConstraint',
                          'VocabularyConstraint', 'RQLConstraint',
                          'RQLVocabularyConstraint')
    # NOTE(review): the entity type is spelled 'Econstraint' in this query,
    # confirm it matches the type name used by the schema
    rql = ('Any COUNT(X),RN,EN,ECTN GROUPBY RN,EN,ECTN ORDERBY 1 '
           'WHERE X is Econstraint, R constrained_by X, '
           'R relation_type RT, R from_entity ET, RT name RN, '
           'ET name EN, X cstrtype ECT, ECT name ECTN')
    for count, rn, en, cstrname in session.execute(rql):
        if count == 1:
            continue
        if cstrname in unique_constraints:
            print("ERROR: got %s %r constraints on relation %s.%s" % (
                count, cstrname, en, rn))


    
def check_text_index(schema, session, eids, fix=1):
    """check all entities registered in the text index"""
    print 'Checking text index'
    cursor = session.system_sql('SELECT uid FROM appears;')
    for row in cursor.fetchall():
        eid = row[0]
        if not has_eid(cursor, eid, eids):
            msg = '  Entity with eid %s exists in the text index but in no source'
            print >> sys.stderr, msg % eid,
            if fix:
                session.system_sql('DELETE FROM appears WHERE uid=%s;' % eid)
                print >> sys.stderr, ' [FIXED]'
            else:
                print >> sys.stderr


def check_entities(schema, session, eids, fix=1):
    """check all entities registered in the repo system table"""
    print 'Checking entities system table'
    cursor = session.system_sql('SELECT eid FROM entities;')
    for row in cursor.fetchall():
        eid = row[0]
        if not has_eid(cursor, eid, eids):
            msg = '  Entity with eid %s exists in the system table but in no source'
            print >> sys.stderr, msg % eid,
            if fix:
                session.system_sql('DELETE FROM entities WHERE eid=%s;' % eid)
                print >> sys.stderr, ' [FIXED]'
            else:
                print >> sys.stderr
    print 'Checking entities tables'
    for eschema in schema.entities():
        if eschema.is_final():
            continue
        cursor = session.system_sql('SELECT eid FROM %s;' % eschema.type)
        for row in cursor.fetchall():
            eid = row[0]
            # eids is full since we have fetched everyting from the entities table,
            # no need to call has_eid
            if not eid in eids or not eids[eid]:
                msg = '  Entity with eid %s exists in the %s table but not in the system table'
                print >> sys.stderr, msg % (eid, eschema.type),
                if fix:
                    session.system_sql('DELETE FROM %s WHERE eid=%s;' % (eschema.type, eid))
                    print >> sys.stderr, ' [FIXED]'
                else:
                    print >> sys.stderr
                
            
def bad_related_msg(rtype, target, eid, fix):
    """report on stderr a relation pointing to an entity which doesn't exist

    :param rtype: the relation type
    :param target: the role of the missing entity, as inserted in the message
    :param eid: the eid of the missing entity
    :param fix: when true, the line is ended by a [FIXED] marker
    """
    template = '  A relation %s with %s eid %s exists but no such entity in sources'
    # the trailing comma keeps the output on the same line
    print >> sys.stderr, template % (rtype, target, eid),
    if not fix:
        print >> sys.stderr
        return
    print >> sys.stderr, ' [FIXED]'
    
    
def check_relations(schema, session, eids, fix=1):
    """check all relations registered in the repo system table"""
    print 'Checking relations'
    for rschema in schema.relations():
        if rschema.is_final():
            continue
        rtype = rschema.type
        if rtype == 'identity':
            continue
        if rschema.inlined:
            for subjtype in rschema.subjects():
                cursor = session.system_sql('SELECT %s FROM %s WHERE %s IS NOT NULL;'
                                            % (rtype, subjtype, rtype))
                for row in cursor.fetchall():
                    eid = row[0]
                    if not has_eid(cursor, eid, eids):
                        bad_related_msg(rtype, 'object', eid, fix)
                        if fix:
                            session.system_sql('UPDATE %s SET %s = NULL WHERE eid=%s;'
                                               % (subjtype, rtype, eid))
            continue
        cursor = session.system_sql('SELECT eid_from FROM %s_relation;' % rtype)
        for row in cursor.fetchall():
            eid = row[0]
            if not has_eid(cursor, eid, eids):
                bad_related_msg(rtype, 'subject', eid, fix)
                if fix:
                    session.system_sql(
                        'DELETE FROM %s_relations WHERE eid_from=%s;' % (rtype, eid))
        cursor = session.system_sql('SELECT eid_to FROM %s_relation;' % rtype)
        for row in cursor.fetchall():
            eid = row[0]
            if not has_eid(cursor, eid, eids):
                bad_related_msg(rtype, 'object', eid, fix)
                if fix:
                    session.system_sql('DELETE FROM relations WHERE eid_to=%s;' % eid)


def check_metadata(schema, session, eids, fix=1):
    """check entities has required metadata

    FIXME: rewrite using RQL queries ?
    """
    print 'Checking metadata'
    cursor = session.system_sql("SELECT DISTINCT type FROM entities;")
    for etype, in cursor.fetchall():
        for rel, default in ( ('creation_date', now()),
                              ('modification_date', now()), ):
            cursor = session.system_sql("SELECT eid FROM %s "
                                        "WHERE %s is NULL" % (etype, rel))
            for eid, in cursor.fetchall():
                msg = '  %s with eid %s has no %s'
                print >> sys.stderr, msg % (etype, eid, rel),
                if fix:
                    session.system_sql("UPDATE %s SET %s=%(default)s WHERE eid=%s ;"
                                       % (etype, rel, eid), {'default': default})
                    print >> sys.stderr, ' [FIXED]'
                else:
                    print >> sys.stderr
    cursor = session.system_sql('SELECT MIN(eid) FROM euser;')
    default_user_eid = cursor.fetchone()[0]
    assert default_user_eid is not None, 'no user defined !'
    for rel, default in ( ('owned_by', default_user_eid), ):
        cursor = session.system_sql("SELECT eid, type FROM entities "
                                    "WHERE NOT EXISTS "
                                    "(SELECT 1 FROM %s_relation WHERE eid_from=eid);"
                                    % rel)
        for eid, etype in cursor.fetchall():
            msg = '  %s with eid %s has no %s relation'
            print >> sys.stderr, msg % (etype, eid, rel),
            if fix:
                session.system_sql('INSERT INTO %s_relation VALUES (%s, %s) ;'
                                   % (rel, eid, default))
                print >> sys.stderr, ' [FIXED]'
            else:
                print >> sys.stderr


def check(repo, cnx, checks, reindex, fix):
    """check integrity of application's repository,
    using given user and password to locally connect to the repository
    (no running cubicweb server needed)
    """
    session = repo._get_session(cnx.sessionid, setpool=True)
    # yo, launch checks
    if checks:
        eids_cache = {}
        for check in checks:
            check_func = globals()['check_%s' % check]
            check_func(repo.schema, session, eids_cache, fix=fix)
        if fix:
            cnx.commit()
        else:
            print
        if not fix:
            print 'WARNING: Diagnostic run, nothing has been corrected'
    if reindex:
        cnx.rollback()
        session.set_pool()
        reindex_entities(repo.schema, session)
        cnx.commit()