# HG changeset patch # User Julien Cristau # Date 1447951848 -3600 # Node ID ce5403611cbe15531d978fa4af452ad56a179915 # Parent d186820c5f7ac801c17c097c213d667bf78f62ee [fti] add cw_fti_index_rql_limit method Improves on and deprecates cw_fti_index_rql_queries: execute the rql directly, so that we don't need to compute the set of eids up-front, but can just keep track of the last seen entity. Use the new method in reindex_entities. Keep calling cw_fti_index_rql_queries if it has been redefined (e.g. in cubes). Related to #3621392. diff -r d186820c5f7a -r ce5403611cbe doc/book/devrepo/fti.rst --- a/doc/book/devrepo/fti.rst Thu Nov 19 16:48:55 2015 +0100 +++ b/doc/book/devrepo/fti.rst Thu Nov 19 17:50:48 2015 +0100 @@ -94,37 +94,10 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``db-rebuild-fti`` will call the -:meth:`~cubicweb.entities.AnyEntity.cw_fti_index_rql_queries` class +:meth:`~cubicweb.entities.AnyEntity.cw_fti_index_rql_limit` class method on your entity type. -.. automethod:: cubicweb.entities.AnyEntity.cw_fti_index_rql_queries - -Now, suppose you've got a _huge_ table to index, you probably don't want to -get all entities at once. So here's a simple customized example that will -process block of 10000 entities: - -.. 
sourcecode:: python - - - class MyEntityClass(AnyEntity): - __regid__ = 'MyEntityClass' - - @classmethod - def cw_fti_index_rql_queries(cls, req): - # get the default RQL method and insert LIMIT / OFFSET instructions - base_rql = super(SearchIndex, cls).cw_fti_index_rql_queries(req)[0] - selected, restrictions = base_rql.split(' WHERE ') - rql_template = '%s ORDERBY X LIMIT %%(limit)s OFFSET %%(offset)s WHERE %s' % ( - selected, restrictions) - # count how many entities you'll have to index - count = req.execute('Any COUNT(X) WHERE X is MyEntityClass')[0][0] - # iterate by blocks of 10000 entities - chunksize = 10000 - for offset in xrange(0, count, chunksize): - print 'SENDING', rql_template % {'limit': chunksize, 'offset': offset} - yield rql_template % {'limit': chunksize, 'offset': offset} - -Since you have access to ``req``, you can more or less fetch whatever you want. +.. automethod:: cubicweb.entities.AnyEntity.cw_fti_index_rql_limit Customizing :meth:`~cubicweb.entities.adapters.IFTIndexableAdapter.get_words` diff -r d186820c5f7a -r ce5403611cbe entities/__init__.py --- a/entities/__init__.py Thu Nov 19 16:48:55 2015 +0100 +++ b/entities/__init__.py Thu Nov 19 17:50:48 2015 +0100 @@ -19,9 +19,12 @@ __docformat__ = "restructuredtext en" +from warnings import warn + from six import text_type, string_types from logilab.common.decorators import classproperty +from logilab.common.deprecation import deprecated from cubicweb import Unauthorized from cubicweb.entity import Entity @@ -44,6 +47,7 @@ return req.build_url('add/%s' % cls.__regid__, **kwargs) @classmethod + @deprecated('[3.22] use cw_fti_index_rql_limit instead') def cw_fti_index_rql_queries(cls, req): """return the list of rql queries to fetch entities to FT-index @@ -61,6 +65,37 @@ return ['Any %s WHERE %s' % (', '.join(selected), ', '.join(restrictions))] + @classmethod + def cw_fti_index_rql_limit(cls, req, limit=1000): + """generate rsets of entities to FT-index + + By default, each successive 
result set is limited to 1000 entities + """ + if cls.cw_fti_index_rql_queries.__func__ != AnyEntity.cw_fti_index_rql_queries.__func__: + warn("[3.22] cw_fti_index_rql_queries is replaced by cw_fti_index_rql_limit", + DeprecationWarning) + for rql in cls.cw_fti_index_rql_queries(req): + yield req.execute(rql) + return + restrictions = ['X is %s' % cls.__regid__] + selected = ['X'] + start = 0 + for attrschema in sorted(cls.e_schema.indexable_attributes()): + varname = attrschema.type.upper() + restrictions.append('X %s %s' % (attrschema, varname)) + selected.append(varname) + while True: + q_restrictions = restrictions + ['X eid > %s' % start] + rset = req.execute('Any %s ORDERBY X LIMIT %s WHERE %s' % + (', '.join(selected), + limit, + ', '.join(q_restrictions))) + if rset: + start = rset[-1][0] + yield rset + else: + break + # meta data api ########################################################### def dc_title(self): diff -r d186820c5f7a -r ce5403611cbe entities/test/unittest_base.py --- a/entities/test/unittest_base.py Thu Nov 19 16:48:55 2015 +0100 +++ b/entities/test/unittest_base.py Thu Nov 19 17:50:48 2015 +0100 @@ -21,6 +21,7 @@ from logilab.common.testlib import unittest_main from logilab.common.decorators import clear_cache +from logilab.common.registry import yes from cubicweb.devtools.testlib import CubicWebTC @@ -64,12 +65,31 @@ {'description_format': ('format', 'description')}) def test_fti_rql_method(self): + class EmailAddress(AnyEntity): + __regid__ = 'EmailAddress' + __select__ = AnyEntity.__select__ & yes(2) + @classmethod + def cw_fti_index_rql_queries(cls, req): + return ['EmailAddress Y'] with self.admin_access.web_request() as req: + req.create_entity('EmailAddress', address=u'foo@bar.com') eclass = self.vreg['etypes'].etype_class('EmailAddress') + # deprecated self.assertEqual(['Any X, ADDRESS, ALIAS WHERE X is EmailAddress, ' 'X address ADDRESS, X alias ALIAS'], eclass.cw_fti_index_rql_queries(req)) + self.assertEqual(['Any X, ADDRESS, 
ALIAS ORDERBY X LIMIT 1000 WHERE X is EmailAddress, ' + 'X address ADDRESS, X alias ALIAS, X eid > 0'], + [rset.rql for rset in eclass.cw_fti_index_rql_limit(req)]) + + # test backwards compatibility with custom method + with self.temporary_appobjects(EmailAddress): + self.vreg['etypes'].clear_caches() + eclass = self.vreg['etypes'].etype_class('EmailAddress') + self.assertEqual(['EmailAddress Y'], + [rset.rql for rset in eclass.cw_fti_index_rql_limit(req)]) + class EmailAddressTC(BaseEntityTC): diff -r d186820c5f7a -r ce5403611cbe server/checkintegrity.py --- a/server/checkintegrity.py Thu Nov 19 16:48:55 2015 +0100 +++ b/server/checkintegrity.py Thu Nov 19 17:50:48 2015 +0100 @@ -124,8 +124,7 @@ source = repo.system_source for eschema in etypes: etype_class = cnx.vreg['etypes'].etype_class(str(eschema)) - for fti_rql in etype_class.cw_fti_index_rql_queries(cnx): - rset = cnx.execute(fti_rql) + for rset in etype_class.cw_fti_index_rql_limit(cnx): source.fti_index_entities(cnx, rset.entities()) # clear entity cache to avoid high memory consumption on big tables cnx.drop_entity_cache()