[fti] add cw_fti_index_rql_limit method
author Julien Cristau <julien.cristau@logilab.fr>
Thu, 19 Nov 2015 17:50:48 +0100
changeset 10847 ce5403611cbe
parent 10846 d186820c5f7a
child 10848 e905c95ccdfb
[fti] add cw_fti_index_rql_limit method. Improves on and deprecates cw_fti_index_rql_queries: execute the rql directly, so that we don't need to compute the set of eids up-front, but can just keep track of the last seen entity. Use the new method in reindex_entities. Keep calling cw_fti_index_rql_queries if it has been redefined (e.g. in cubes). Related to #3621392.
doc/book/devrepo/fti.rst
entities/__init__.py
entities/test/unittest_base.py
server/checkintegrity.py
--- a/doc/book/devrepo/fti.rst	Thu Nov 19 16:48:55 2015 +0100
+++ b/doc/book/devrepo/fti.rst	Thu Nov 19 17:50:48 2015 +0100
@@ -94,37 +94,10 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 ``db-rebuild-fti`` will call the
-:meth:`~cubicweb.entities.AnyEntity.cw_fti_index_rql_queries` class
+:meth:`~cubicweb.entities.AnyEntity.cw_fti_index_rql_limit` class
 method on your entity type.
 
-.. automethod:: cubicweb.entities.AnyEntity.cw_fti_index_rql_queries
-
-Now, suppose you've got a _huge_ table to index, you probably don't want to
-get all entities at once. So here's a simple customized example that will
-process block of 10000 entities:
-
-.. sourcecode:: python
-
-
-    class MyEntityClass(AnyEntity):
-        __regid__ = 'MyEntityClass'
-
-    @classmethod
-    def cw_fti_index_rql_queries(cls, req):
-        # get the default RQL method and insert LIMIT / OFFSET instructions
-        base_rql = super(SearchIndex, cls).cw_fti_index_rql_queries(req)[0]
-        selected, restrictions = base_rql.split(' WHERE ')
-        rql_template = '%s ORDERBY X LIMIT %%(limit)s OFFSET %%(offset)s WHERE %s' % (
-            selected, restrictions)
-        # count how many entities you'll have to index
-        count = req.execute('Any COUNT(X) WHERE X is MyEntityClass')[0][0]
-        # iterate by blocks of 10000 entities
-        chunksize = 10000
-        for offset in xrange(0, count, chunksize):
-            print 'SENDING', rql_template % {'limit': chunksize, 'offset': offset}
-            yield rql_template % {'limit': chunksize, 'offset': offset}
-
-Since you have access to ``req``, you can more or less fetch whatever you want.
+.. automethod:: cubicweb.entities.AnyEntity.cw_fti_index_rql_limit
 
 
 Customizing :meth:`~cubicweb.entities.adapters.IFTIndexableAdapter.get_words`
--- a/entities/__init__.py	Thu Nov 19 16:48:55 2015 +0100
+++ b/entities/__init__.py	Thu Nov 19 17:50:48 2015 +0100
@@ -19,9 +19,12 @@
 
 __docformat__ = "restructuredtext en"
 
+from warnings import warn
+
 from six import text_type, string_types
 
 from logilab.common.decorators import classproperty
+from logilab.common.deprecation import deprecated
 
 from cubicweb import Unauthorized
 from cubicweb.entity import Entity
@@ -44,6 +47,7 @@
         return req.build_url('add/%s' % cls.__regid__, **kwargs)
 
     @classmethod
+    @deprecated('[3.22] use cw_fti_index_rql_limit instead')
     def cw_fti_index_rql_queries(cls, req):
         """return the list of rql queries to fetch entities to FT-index
 
@@ -61,6 +65,37 @@
         return ['Any %s WHERE %s' % (', '.join(selected),
                                      ', '.join(restrictions))]
 
+    @classmethod
+    def cw_fti_index_rql_limit(cls, req, limit=1000):
+        """generate rsets of entities to FT-index
+
+        By default, each successive result set is limited to 1000 entities
+        """
+        if cls.cw_fti_index_rql_queries.__func__ != AnyEntity.cw_fti_index_rql_queries.__func__:
+            warn("[3.22] cw_fti_index_rql_queries is replaced by cw_fti_index_rql_limit",
+                 DeprecationWarning)
+            for rql in cls.cw_fti_index_rql_queries(req):
+                yield req.execute(rql)
+            return
+        restrictions = ['X is %s' % cls.__regid__]
+        selected = ['X']
+        start = 0
+        for attrschema in sorted(cls.e_schema.indexable_attributes()):
+            varname = attrschema.type.upper()
+            restrictions.append('X %s %s' % (attrschema, varname))
+            selected.append(varname)
+        while True:
+            q_restrictions = restrictions + ['X eid > %s' % start]
+            rset = req.execute('Any %s ORDERBY X LIMIT %s WHERE %s' %
+                               (', '.join(selected),
+                                limit,
+                                ', '.join(q_restrictions)))
+            if rset:
+                start = rset[-1][0]
+                yield rset
+            else:
+                break
+
     # meta data api ###########################################################
 
     def dc_title(self):
--- a/entities/test/unittest_base.py	Thu Nov 19 16:48:55 2015 +0100
+++ b/entities/test/unittest_base.py	Thu Nov 19 17:50:48 2015 +0100
@@ -21,6 +21,7 @@
 
 from logilab.common.testlib import unittest_main
 from logilab.common.decorators import clear_cache
+from logilab.common.registry import yes
 
 from cubicweb.devtools.testlib import CubicWebTC
 
@@ -64,12 +65,31 @@
                           {'description_format': ('format', 'description')})
 
     def test_fti_rql_method(self):
+        class EmailAddress(AnyEntity):
+            __regid__ = 'EmailAddress'
+            __select__ = AnyEntity.__select__ & yes(2)
+            @classmethod
+            def cw_fti_index_rql_queries(cls, req):
+                return ['EmailAddress Y']
         with self.admin_access.web_request() as req:
+            req.create_entity('EmailAddress', address=u'foo@bar.com')
             eclass = self.vreg['etypes'].etype_class('EmailAddress')
+            # deprecated
             self.assertEqual(['Any X, ADDRESS, ALIAS WHERE X is EmailAddress, '
                               'X address ADDRESS, X alias ALIAS'],
                              eclass.cw_fti_index_rql_queries(req))
 
+            self.assertEqual(['Any X, ADDRESS, ALIAS ORDERBY X LIMIT 1000 WHERE X is EmailAddress, '
+                              'X address ADDRESS, X alias ALIAS, X eid > 0'],
+                             [rset.rql for rset in eclass.cw_fti_index_rql_limit(req)])
+
+            # test backwards compatibility with custom method
+            with self.temporary_appobjects(EmailAddress):
+                self.vreg['etypes'].clear_caches()
+                eclass = self.vreg['etypes'].etype_class('EmailAddress')
+                self.assertEqual(['EmailAddress Y'],
+                                 [rset.rql for rset in eclass.cw_fti_index_rql_limit(req)])
+
 
 class EmailAddressTC(BaseEntityTC):
 
--- a/server/checkintegrity.py	Thu Nov 19 16:48:55 2015 +0100
+++ b/server/checkintegrity.py	Thu Nov 19 17:50:48 2015 +0100
@@ -124,8 +124,7 @@
     source = repo.system_source
     for eschema in etypes:
         etype_class = cnx.vreg['etypes'].etype_class(str(eschema))
-        for fti_rql in etype_class.cw_fti_index_rql_queries(cnx):
-            rset = cnx.execute(fti_rql)
+        for rset in etype_class.cw_fti_index_rql_limit(cnx):
             source.fti_index_entities(cnx, rset.entities())
             # clear entity cache to avoid high memory consumption on big tables
             cnx.drop_entity_cache()