[fti] add cw_fti_index_rql_limit method
Improve on and deprecate cw_fti_index_rql_queries: execute the RQL
directly, so that we don't need to compute the set of eids up-front but
can just keep track of the last seen entity. Use the new method in
reindex_entities.
Keep calling cw_fti_index_rql_queries if it has been redefined by a
subclass (e.g. in cubes).
Related to #3621392.
--- a/doc/book/devrepo/fti.rst Thu Nov 19 16:48:55 2015 +0100
+++ b/doc/book/devrepo/fti.rst Thu Nov 19 17:50:48 2015 +0100
@@ -94,37 +94,10 @@
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``db-rebuild-fti`` will call the
-:meth:`~cubicweb.entities.AnyEntity.cw_fti_index_rql_queries` class
+:meth:`~cubicweb.entities.AnyEntity.cw_fti_index_rql_limit` class
method on your entity type.
-.. automethod:: cubicweb.entities.AnyEntity.cw_fti_index_rql_queries
-
-Now, suppose you've got a _huge_ table to index, you probably don't want to
-get all entities at once. So here's a simple customized example that will
-process block of 10000 entities:
-
-.. sourcecode:: python
-
-
- class MyEntityClass(AnyEntity):
- __regid__ = 'MyEntityClass'
-
- @classmethod
- def cw_fti_index_rql_queries(cls, req):
- # get the default RQL method and insert LIMIT / OFFSET instructions
- base_rql = super(SearchIndex, cls).cw_fti_index_rql_queries(req)[0]
- selected, restrictions = base_rql.split(' WHERE ')
- rql_template = '%s ORDERBY X LIMIT %%(limit)s OFFSET %%(offset)s WHERE %s' % (
- selected, restrictions)
- # count how many entities you'll have to index
- count = req.execute('Any COUNT(X) WHERE X is MyEntityClass')[0][0]
- # iterate by blocks of 10000 entities
- chunksize = 10000
- for offset in xrange(0, count, chunksize):
- print 'SENDING', rql_template % {'limit': chunksize, 'offset': offset}
- yield rql_template % {'limit': chunksize, 'offset': offset}
-
-Since you have access to ``req``, you can more or less fetch whatever you want.
+.. automethod:: cubicweb.entities.AnyEntity.cw_fti_index_rql_limit
Customizing :meth:`~cubicweb.entities.adapters.IFTIndexableAdapter.get_words`
--- a/entities/__init__.py Thu Nov 19 16:48:55 2015 +0100
+++ b/entities/__init__.py Thu Nov 19 17:50:48 2015 +0100
@@ -19,9 +19,12 @@
__docformat__ = "restructuredtext en"
+from warnings import warn
+
from six import text_type, string_types
from logilab.common.decorators import classproperty
+from logilab.common.deprecation import deprecated
from cubicweb import Unauthorized
from cubicweb.entity import Entity
@@ -44,6 +47,7 @@
return req.build_url('add/%s' % cls.__regid__, **kwargs)
@classmethod
+ @deprecated('[3.22] use cw_fti_index_rql_limit instead')
def cw_fti_index_rql_queries(cls, req):
"""return the list of rql queries to fetch entities to FT-index
@@ -61,6 +65,37 @@
return ['Any %s WHERE %s' % (', '.join(selected),
', '.join(restrictions))]
+ @classmethod
+ def cw_fti_index_rql_limit(cls, req, limit=1000):
+ """generate rsets of entities to FT-index
+
+ By default, each successive result set is limited to 1000 entities
+ """
+ if cls.cw_fti_index_rql_queries.__func__ != AnyEntity.cw_fti_index_rql_queries.__func__:
+ warn("[3.22] cw_fti_index_rql_queries is replaced by cw_fti_index_rql_limit",
+ DeprecationWarning)
+ for rql in cls.cw_fti_index_rql_queries(req):
+ yield req.execute(rql)
+ return
+ restrictions = ['X is %s' % cls.__regid__]
+ selected = ['X']
+ start = 0
+ for attrschema in sorted(cls.e_schema.indexable_attributes()):
+ varname = attrschema.type.upper()
+ restrictions.append('X %s %s' % (attrschema, varname))
+ selected.append(varname)
+ while True:
+ q_restrictions = restrictions + ['X eid > %s' % start]
+ rset = req.execute('Any %s ORDERBY X LIMIT %s WHERE %s' %
+ (', '.join(selected),
+ limit,
+ ', '.join(q_restrictions)))
+ if rset:
+ start = rset[-1][0]
+ yield rset
+ else:
+ break
+
# meta data api ###########################################################
def dc_title(self):
--- a/entities/test/unittest_base.py Thu Nov 19 16:48:55 2015 +0100
+++ b/entities/test/unittest_base.py Thu Nov 19 17:50:48 2015 +0100
@@ -21,6 +21,7 @@
from logilab.common.testlib import unittest_main
from logilab.common.decorators import clear_cache
+from logilab.common.registry import yes
from cubicweb.devtools.testlib import CubicWebTC
@@ -64,12 +65,31 @@
{'description_format': ('format', 'description')})
def test_fti_rql_method(self):
+ class EmailAddress(AnyEntity):
+ __regid__ = 'EmailAddress'
+ __select__ = AnyEntity.__select__ & yes(2)
+ @classmethod
+ def cw_fti_index_rql_queries(cls, req):
+ return ['EmailAddress Y']
with self.admin_access.web_request() as req:
+ req.create_entity('EmailAddress', address=u'foo@bar.com')
eclass = self.vreg['etypes'].etype_class('EmailAddress')
+ # deprecated
self.assertEqual(['Any X, ADDRESS, ALIAS WHERE X is EmailAddress, '
'X address ADDRESS, X alias ALIAS'],
eclass.cw_fti_index_rql_queries(req))
+ self.assertEqual(['Any X, ADDRESS, ALIAS ORDERBY X LIMIT 1000 WHERE X is EmailAddress, '
+ 'X address ADDRESS, X alias ALIAS, X eid > 0'],
+ [rset.rql for rset in eclass.cw_fti_index_rql_limit(req)])
+
+ # test backwards compatibility with custom method
+ with self.temporary_appobjects(EmailAddress):
+ self.vreg['etypes'].clear_caches()
+ eclass = self.vreg['etypes'].etype_class('EmailAddress')
+ self.assertEqual(['EmailAddress Y'],
+ [rset.rql for rset in eclass.cw_fti_index_rql_limit(req)])
+
class EmailAddressTC(BaseEntityTC):
--- a/server/checkintegrity.py Thu Nov 19 16:48:55 2015 +0100
+++ b/server/checkintegrity.py Thu Nov 19 17:50:48 2015 +0100
@@ -124,8 +124,7 @@
source = repo.system_source
for eschema in etypes:
etype_class = cnx.vreg['etypes'].etype_class(str(eschema))
- for fti_rql in etype_class.cw_fti_index_rql_queries(cnx):
- rset = cnx.execute(fti_rql)
+ for rset in etype_class.cw_fti_index_rql_limit(cnx):
source.fti_index_entities(cnx, rset.entities())
# clear entity cache to avoid high memory consumption on big tables
cnx.drop_entity_cache()