# HG changeset patch
# User Christophe de Vienne
# Date 1394117733 -3600
# Node ID 166c6f7b1be45228413708881ec953e4e695136a
# Parent  28b3d39bcbc66026f0c0dd65353af79fce4c58fb
[fti] Improve big table reindexation

* Slice the reindexing in batches of 1000 entities.
* Make the output more verbose.

Closes #3621392

diff -r 28b3d39bcbc6 -r 166c6f7b1be4 entities/__init__.py
--- a/entities/__init__.py	Fri Oct 02 17:28:33 2015 +0200
+++ b/entities/__init__.py	Thu Mar 06 15:55:33 2014 +0100
@@ -26,6 +26,12 @@
 from cubicweb.entity import Entity
 
 
+def chunks(seq, step):
+    """See http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python)"""
+    return (seq[i:i+step]
+            for i in xrange(0, len(seq), step))
+
+
 class AnyEntity(Entity):
     """an entity instance has e_schema automagically set on the class and
     instances have access to their issuing cursor
@@ -44,21 +50,25 @@
 
     @classmethod
     def cw_fti_index_rql_queries(cls, req):
-        """return the list of rql queries to fetch entities to FT-index
+        """return an iterator on rql queries to fetch entities to FT-index
 
-        The default is to fetch all entities at once and to prefetch
-        indexable attributes but one could imagine iterating over
+        The default is to fetch entities 1000 per 1000 and to prefetch
+        indexable attributes, but one could imagine iterating over
         "smaller" resultsets if the table is very big or returning
         a subset of entities that match some business-logic condition.
         """
-        restrictions = ['X is %s' % cls.__regid__]
+        restrictions = []
         selected = ['X']
         for attrschema in sorted(cls.e_schema.indexable_attributes()):
             varname = attrschema.type.upper()
             restrictions.append('X %s %s' % (attrschema, varname))
             selected.append(varname)
-        return ['Any %s WHERE %s' % (', '.join(selected),
-                                     ', '.join(restrictions))]
+        rset = req.execute('Any EID WHERE X eid EID, X is %s' % cls.__regid__)
+        for rows in chunks(rset.rows, 1000):
+            q_restrictions = restrictions + [
+                'X eid IN (%s)' % ', '.join(str(r[0]) for r in rows)]
+            yield 'Any %s WHERE %s' % (', '.join(selected),
+                                       ', '.join(q_restrictions))
 
     # meta data api ###########################################################
 
diff -r 28b3d39bcbc6 -r 166c6f7b1be4 server/checkintegrity.py
--- a/server/checkintegrity.py	Fri Oct 02 17:28:33 2015 +0200
+++ b/server/checkintegrity.py	Thu Mar 06 15:55:33 2014 +0100
@@ -122,7 +122,10 @@
     source = repo.system_source
     for eschema in etypes:
         etype_class = cnx.vreg['etypes'].etype_class(str(eschema))
-        for fti_rql in etype_class.cw_fti_index_rql_queries(cnx):
+        queries = list(etype_class.cw_fti_index_rql_queries(cnx))
+        for i, fti_rql in enumerate(queries):
+            if withpb:
+                pb.text = "%s: %s%%" % (str(eschema), i * 100 / len(queries))
             rset = cnx.execute(fti_rql)
             source.fti_index_entities(cnx, rset.entities())
             # clear entity cache to avoid high memory consumption on big tables