server/checkintegrity.py
author Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
Tue, 10 Mar 2009 09:39:23 +0100
changeset 1022 15d3e1b3a27d
parent 381 e51deabc9b6a
child 713 5adb6d8e5fa7
child 1161 936c311010fc
permissions -rw-r--r--
copyright / license update

"""Check integrity of a CubicWeb repository. Note that only the system database
is actually checked.

:organization: Logilab
:copyright: 2001-2008 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
:contact: http://www.logilab.fr/ -- mailto:contact@logilab.fr
"""
__docformat__ = "restructuredtext en"

import sys

from mx.DateTime import now
from logilab.common.shellutils import ProgressBar

def has_eid(sqlcursor, eid, eids):
    """return true if the eid is a valid eid

    :param sqlcursor: DB-API like cursor on the system database, used to
      run the lookup queries
    :param eid: the entity identifier to check
    :param eids: dictionary used as a cache, mapping already checked eids to
      the boolean result of the check; it is updated in place

    An eid is considered valid when it is recorded in the `entities` table
    and either comes from a source other than 'system' (in which case no
    further check is done), or has at least one row in the table of its
    entity type.
    """
    if eid in eids:
        return eids[eid]
    sqlcursor.execute('SELECT type, source FROM entities WHERE eid=%s' % eid)
    row = sqlcursor.fetchone()
    if not row:
        # no row for this eid in the entities table. Only this "no result"
        # case is handled here: real errors (database failure, keyboard
        # interrupt...) are no longer silently turned into "unknown eid"
        eids[eid] = False
        return False
    etype, source = row
    if source and source != 'system':
        # XXX what to do...
        # entity coming from another source: its table isn't checked
        eids[eid] = True
        return True
    sqlcursor.execute('SELECT * FROM %s WHERE eid=%s' % (etype, eid))
    result = sqlcursor.fetchall()
    if len(result) == 0:
        # known by the entities table but missing from its own table
        eids[eid] = False
        return False
    elif len(result) > 1:
        # duplicated rows are only reported, the eid is still considered
        # as valid
        msg = '  More than one entity with eid %s exists in source !'
        print >> sys.stderr, msg % eid
        print >> sys.stderr, '  WARNING : Unable to fix this, do it yourself !'
    eids[eid] = True
    return True

# XXX move to yams?
def etype_fti_containers(eschema, _done=None):
    """yield the entity schemas acting as full-text containers for `eschema`

    The relations returned by `eschema.fulltext_containers()` are followed
    recursively. An entity schema for which that method returns nothing is
    its own container and is yielded itself.

    :param eschema: the entity schema to start from
    :param _done: set of entity schemas already visited, used internally by
      the recursion to avoid looping; callers usually omit it
    """
    if _done is None:
        _done = set()
    _done.add(eschema)
    containers = tuple(eschema.fulltext_containers())
    if not containers:
        # not contained by anything else: the type is its own container
        yield eschema
        return
    for rschema, role in containers:
        # pick the entity types found at the other end of the relation
        if role == 'object':
            candidates = rschema.objects(eschema)
        else:
            candidates = rschema.subjects(eschema)
        for candidate in candidates:
            if candidate not in _done:
                _done.add(candidate)
                for found in etype_fti_containers(candidate, _done):
                    yield found
    
def reindex_entities(schema, session):
    """reindex all entities in the repository"""
    # deactivate modification_date hook since we don't want them
    # to be updated due to the reindexation
    from cubicweb.server.hooks import (setmtime_before_update_entity,
                                       uniquecstrcheck_before_modification)
    from cubicweb.server.repository import FTIndexEntityOp
    repo = session.repo
    repo.hm.unregister_hook(setmtime_before_update_entity,
                            'before_update_entity', '')
    repo.hm.unregister_hook(uniquecstrcheck_before_modification,
                            'before_update_entity', '')
    etypes = set()
    for eschema in schema.entities():
        if eschema.is_final():
            continue
        indexable_attrs = tuple(eschema.indexable_attributes()) # generator
        if not indexable_attrs:
            continue
        for container in etype_fti_containers(eschema):
            etypes.add(container)
    print 'Reindexing entities of type %s' % \
          ', '.join(sorted(str(e) for e in etypes))
    pb = ProgressBar(len(etypes) + 1)
    # first monkey patch Entity.check to disable validation
    from cubicweb.common.entity import Entity
    _check = Entity.check
    Entity.check = lambda self, creation=False: True
    # clear fti table first
    session.system_sql('DELETE FROM %s' % session.repo.system_source.dbhelper.fti_table)
    pb.update()
    # reindex entities by generating rql queries which set all indexable
    # attribute to their current value
    for eschema in etypes:
        for entity in session.execute('Any X WHERE X is %s' % eschema).entities():
            FTIndexEntityOp(session, entity=entity)
        pb.update()
    # restore Entity.check
    Entity.check = _check

    
def check_schema(schema, session, eids, fix=1):
    """check serialized schema"""
    print 'Checking serialized schema'
    unique_constraints = ('SizeConstraint', 'FormatConstraint',
                          'VocabularyConstraint', 'RQLConstraint',
                          'RQLVocabularyConstraint')
    rql = ('Any COUNT(X),RN,EN,ECTN GROUPBY RN,EN,ECTN ORDERBY 1 '
           'WHERE X is EConstraint, R constrained_by X, '
           'R relation_type RT, R from_entity ET, RT name RN, '
           'ET name EN, X cstrtype ECT, ECT name ECTN')
    for count, rn, en, cstrname in session.execute(rql):
        if count == 1:
            continue
        if cstrname in unique_constraints:
            print "ERROR: got %s %r constraints on relation %s.%s" % (
                count, cstrname, en, rn)


    
def check_text_index(schema, session, eids, fix=1):
    """check all entities registered in the text index"""
    print 'Checking text index'
    cursor = session.system_sql('SELECT uid FROM appears;')
    for row in cursor.fetchall():
        eid = row[0]
        if not has_eid(cursor, eid, eids):
            msg = '  Entity with eid %s exists in the text index but in no source'
            print >> sys.stderr, msg % eid,
            if fix:
                session.system_sql('DELETE FROM appears WHERE uid=%s;' % eid)
                print >> sys.stderr, ' [FIXED]'
            else:
                print >> sys.stderr


def check_entities(schema, session, eids, fix=1):
    """check all entities registered in the repo system table"""
    print 'Checking entities system table'
    cursor = session.system_sql('SELECT eid FROM entities;')
    for row in cursor.fetchall():
        eid = row[0]
        if not has_eid(cursor, eid, eids):
            msg = '  Entity with eid %s exists in the system table but in no source'
            print >> sys.stderr, msg % eid,
            if fix:
                session.system_sql('DELETE FROM entities WHERE eid=%s;' % eid)
                print >> sys.stderr, ' [FIXED]'
            else:
                print >> sys.stderr
    print 'Checking entities tables'
    for eschema in schema.entities():
        if eschema.is_final():
            continue
        cursor = session.system_sql('SELECT eid FROM %s;' % eschema.type)
        for row in cursor.fetchall():
            eid = row[0]
            # eids is full since we have fetched everyting from the entities table,
            # no need to call has_eid
            if not eid in eids or not eids[eid]:
                msg = '  Entity with eid %s exists in the %s table but not in the system table'
                print >> sys.stderr, msg % (eid, eschema.type),
                if fix:
                    session.system_sql('DELETE FROM %s WHERE eid=%s;' % (eschema.type, eid))
                    print >> sys.stderr, ' [FIXED]'
                else:
                    print >> sys.stderr
                
            
def bad_related_msg(rtype, target, eid, fix):
    """report on standard error a relation pointing to a missing entity

    :param rtype: the relation type
    :param target: role of the missing entity in the relation, as displayed
      in the message ('subject' or 'object' for callers in this module)
    :param eid: eid of the missing entity
    :param fix: true if ' [FIXED]' should be appended to the message; this
      function doesn't fix anything by itself
    """
    # trailing comma: the status is printed on the same line below
    print >> sys.stderr, \
          '  A relation %s with %s eid %s exists but no such entity in sources' % (
              rtype, target, eid),
    if not fix:
        print >> sys.stderr
        return
    print >> sys.stderr, ' [FIXED]'
    
    
def check_relations(schema, session, eids, fix=1):
    """check all relations registered in the repo system table"""
    print 'Checking relations'
    for rschema in schema.relations():
        if rschema.is_final():
            continue
        rtype = rschema.type
        if rtype == 'identity':
            continue
        if rschema.inlined:
            for subjtype in rschema.subjects():
                sql = 'SELECT %s FROM %s WHERE %s IS NOT NULL;' % (
                    rtype, subjtype, rtype)
                cursor = session.system_sql(sql)
                for row in cursor.fetchall():
                    eid = row[0]
                    if not has_eid(cursor, eid, eids):
                        bad_related_msg(rtype, 'object', eid, fix)
                        if fix:
                            sql = 'UPDATE %s SET %s = NULL WHERE eid=%s;' % (
                                subjtype, rtype, eid)
                            session.system_sql(sql)
            continue
        cursor = session.system_sql('SELECT eid_from FROM %s_relation;' % rtype)
        for row in cursor.fetchall():
            eid = row[0]
            if not has_eid(cursor, eid, eids):
                bad_related_msg(rtype, 'subject', eid, fix)
                if fix:
                    sql = 'DELETE FROM %s_relation WHERE eid_from=%s;' % (
                        rtype, eid)
                    session.system_sql(sql)
        cursor = session.system_sql('SELECT eid_to FROM %s_relation;' % rtype)
        for row in cursor.fetchall():
            eid = row[0]
            if not has_eid(cursor, eid, eids):
                bad_related_msg(rtype, 'object', eid, fix)
                if fix:
                    sql = 'DELETE FROM %s_relation WHERE eid_to=%s;' % (
                        rtype, eid)
                    session.system_sql(sql)


def check_metadata(schema, session, eids, fix=1):
    """check entities has required metadata

    FIXME: rewrite using RQL queries ?
    """
    print 'Checking metadata'
    cursor = session.system_sql("SELECT DISTINCT type FROM entities;")
    for etype, in cursor.fetchall():
        for rel, default in ( ('creation_date', now()),
                              ('modification_date', now()), ):
            cursor = session.system_sql("SELECT eid FROM %s "
                                        "WHERE %s is NULL" % (etype, rel))
            for eid, in cursor.fetchall():
                msg = '  %s with eid %s has no %s'
                print >> sys.stderr, msg % (etype, eid, rel),
                if fix:
                    session.system_sql("UPDATE %s SET %s=%(default)s WHERE eid=%s ;"
                                       % (etype, rel, eid), {'default': default})
                    print >> sys.stderr, ' [FIXED]'
                else:
                    print >> sys.stderr
    cursor = session.system_sql('SELECT MIN(eid) FROM euser;')
    default_user_eid = cursor.fetchone()[0]
    assert default_user_eid is not None, 'no user defined !'
    for rel, default in ( ('owned_by', default_user_eid), ):
        cursor = session.system_sql("SELECT eid, type FROM entities "
                                    "WHERE NOT EXISTS "
                                    "(SELECT 1 FROM %s_relation WHERE eid_from=eid);"
                                    % rel)
        for eid, etype in cursor.fetchall():
            msg = '  %s with eid %s has no %s relation'
            print >> sys.stderr, msg % (etype, eid, rel),
            if fix:
                session.system_sql('INSERT INTO %s_relation VALUES (%s, %s) ;'
                                   % (rel, eid, default))
                print >> sys.stderr, ' [FIXED]'
            else:
                print >> sys.stderr


def check(repo, cnx, checks, reindex, fix):
    """check integrity of application's repository,
    using given user and password to locally connect to the repository
    (no running cubicweb server needed)
    """
    session = repo._get_session(cnx.sessionid, setpool=True)
    # yo, launch checks
    if checks:
        eids_cache = {}
        for check in checks:
            check_func = globals()['check_%s' % check]
            check_func(repo.schema, session, eids_cache, fix=fix)
        if fix:
            cnx.commit()
        else:
            print
        if not fix:
            print 'WARNING: Diagnostic run, nothing has been corrected'
    if reindex:
        cnx.rollback()
        session.set_pool()
        reindex_entities(repo.schema, session)
        cnx.commit()