perf: adds a cache to know if obsmarkers might affect a revision
authorPierre-Yves David <pierre-yves.david@ens-lyon.org>
Mon, 01 May 2017 08:13:24 +0200
changeset 2296 d6584ce58030
parent 2295 017b971ba28f
child 2297 32cdcf493567
perf: adds a cache to know if obsmarkers might affect a revision

Phase information still needs to be taken into account to compute the final result, but skipping reading the obsstore for most operations is a large win. Usage of this cache arrives in the next changeset.
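For illustration, here is a minimal sketch (not part of this changeset) of how the cache could be combined with phase data to compute the obsolete set; the function name 'computeobsoleteset' and the way the cache is reached are assumptions for illustration only:

    from mercurial import phases

    def computeobsoleteset(repo):
        """return revisions that would be reported as obsolete (hypothetical helper)"""
        repo = repo.unfiltered()
        cache = repo.obsstore.obscache    # attribute attached by this changeset
        cache.update(repo)                # bring the cache up to date first
        getphase = repo._phasecache.phase
        obs = set()
        for rev in repo:
            # obsolete == mutable (non-public) and used as precursor by a marker
            if getphase(repo, rev) > phases.public and cache.get(rev):
                obs.add(rev)
        return obs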
hgext3rd/evolve/obscache.py
--- a/hgext3rd/evolve/obscache.py	Mon May 01 08:07:05 2017 +0200
+++ b/hgext3rd/evolve/obscache.py	Mon May 01 08:13:24 2017 +0200
@@ -7,6 +7,18 @@
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
+import hashlib
+import struct
+import weakref
+import errno
+
+from mercurial import (
+    localrepo,
+    obsolete,
+    phases,
+    node,
+)
+
 from . import (
     exthelper,
 )
@@ -27,6 +39,7 @@
 @eh.wrapfunction(obsstorefilecache, 'func')
 def obsstorewithcache(orig, repo):
     obsstore = orig(repo)
+    obsstore.obscache = obscache(repo.unfiltered())
 
     class cachekeyobsstore(obsstore.__class__):
 
@@ -138,3 +151,171 @@
         startidx = keyobslength
 
     return True, startrev, startidx
+
+class obscache(object):
+    """cache the "does a rev" is the precursors of some obsmarkers data
+
+    This does not directly hold the "is this revision obsolete" information,
+    because phase data also comes into play here. However, it allows computing
+    the "obsolescence" set without reading the obsstore content.
+
+    Implementation note #1:
+
+      The obsstore implements only half of the transaction logic it should.
+      It properly records the starting point of the obsstore to allow a clean
+      rollback. However, it still writes to the obsstore file directly during
+      the transaction. Instead, it should keep the data in memory and write it
+      to a '.pending' file to make the data available to hooks.
+
+      This cache does not go further than what the obsstore does, so it does
+      not have any '.pending' logic. When the obsstore gains proper '.pending'
+      support, adding it to this cache should not be too hard. As the flag
+      only ever moves from 0 to 1, we could have a second '.pending' cache file
+      to be read; if the flag is set in either of them, the value is 1. For the
+      same reason, updating the file in place should be possible.
+
+    Implementation note #2:
+
+      Instead of having one large update run at the end, we could update this
+      cache at the level where new changesets or new obsmarkers are added. More
+      on this in the update code below.
+
+    Implementation note #3:
+
+      Storage-wise, we could record a "start rev" to avoid storing useless
+      zeros. That would be especially useful for the '.pending' overlay.
+    """
+
+    _filepath = 'cache/evoext-obscache-00'
+    _headerformat = '>I20sQQ20s'
+
+    def __init__(self, repo):
+        self._vfs = repo.vfs
+        # The cache key parts are:
+        # - tip-rev,
+        # - tip-node,
+        # - obsstore-length (nb markers),
+        # - obsstore-file-size (in bytes),
+        # - obsstore "cache key"
+        self._cachekey = None
+        self._data = bytearray()
+
+    def get(self, rev):
+        """return True if "rev" is used as "precursors for any obsmarkers
+
+        Make sure the cache has been updated to match the repository content before using it"""
+        return self._data[rev]
+
+    def clear(self):
+        """invalidate the cache content"""
+        self._cachekey = None
+        self._data = bytearray()
+
+    def update(self, repo):
+        """Iteratively update the cache with new repository data"""
+        # If we do not have any data, try loading from disk
+        if self._cachekey is None:
+            self.load(repo)
+
+        valid, startrev, startidx = upgradeneeded(repo, self._cachekey)
+        if not valid:
+            self.clear()
+
+        if startrev is None and startidx is None:
+            return
+
+        # process the new changesets
+        cl = repo.changelog
+        if startrev is not None:
+            node = cl.node
+            # Note:
+            #
+            #  Newly added changesets might be affected by obsolescence markers
+            #  we already have locally, so we need some global knowledge about
+            #  the markers to answer that question. Right now this requires
+            #  parsing all markers in the obsstore. However, we could imagine
+            #  using various optimisations (e.g. a bloom filter or another
+            #  on-disk cache) to avoid this full parsing.
+            #
+            #  For now we stick with the simpler approach of paying the
+            #  performance cost on new changesets.
+            succs = repo.obsstore.successors
+            for r in cl.revs(startrev):
+                if node(r) in succs:
+                    val = 1
+                else:
+                    val = 0
+                self._data.append(val)
+        assert len(self._data) == len(cl), (len(self._data), len(cl))
+
+        # process the new obsmarkers
+        if startidx is not None:
+            rev = cl.nodemap.get
+            markers = repo.obsstore._all
+            # Note:
+            #
+            #   There is actually no need to load the full obsstore here,
+            #   since we only read the latest markers. We do it for simplicity
+            #   in this first implementation. Loading the full obsstore has a
+            #   performance cost and should eventually go away here too. We
+            #   have two simple options for that:
+            #
+            #   1) provide an API to start reading markers from a byte offset
+            #      (we have that data in the cache key),
+            #
+            #   2) directly update the cache at a lower level, in the code
+            #      responsible for adding markers.
+            #
+            #   Option 2 is probably a bit more invasive, but more solid in the long run.
+
+            for i in xrange(startidx, len(repo.obsstore)):
+                r = rev(markers[i][0])
+                # Markers affecting newly added nodes were already handled by
+                # the previous loop, so we only update revs below startrev.
+                if r is not None and (startrev is None or r < startrev):
+                    self._data[r] = 1
+
+        # XXX note that there is a race condition here, since the repo "might"
+        # have changed since the cache update above started. However, this code
+        # will mostly be running under a lock, so we ignore the issue for now.
+        #
+        # To work around this, 'upgradeneeded' should return a bounded amount
+        # of changesets and markers to read with their associated cache key.
+        # See 'upgradeneeded' for details.
+        self._cachekey = getcachekey(repo)
+
+    def save(self, repo):
+        """save the data to disk"""
+
+        # XXX it happens that the obsstore is (buggily) always up to date on disk
+        if self._cachekey is None:
+            return
+
+        with repo.vfs(self._filepath, 'w', atomictemp=True) as cachefile:
+            headerdata = struct.pack(self._headerformat, *self._cachekey)
+            cachefile.write(headerdata)
+            cachefile.write(self._data)
+
+    def load(self, repo):
+        """load data from disk"""
+        assert repo.filtername is None
+
+        data = repo.vfs.tryread(self._filepath)
+        if not data:
+            return
+
+        headersize = struct.calcsize(self._headerformat)
+        self._cachekey = struct.unpack(self._headerformat, data[:headersize])
+        self._data = bytearray(data[headersize:])
+
+@eh.reposetup
+def setupcache(ui, repo):
+
+    class obscacherepo(repo.__class__):
+
+        @localrepo.unfilteredmethod
+        def destroyed(self):
+            if 'obsstore' in vars(self):
+                self.obsstore.obscache.clear()
+
+    repo.__class__ = obscacherepo
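As a usage note, here is a minimal sketch (again not part of this changeset) of the expected load/update/save lifecycle; the 'listobsoleted' helper name and the locking discipline shown are assumptions for illustration only:

    def listobsoleted(repo):
        """return revisions used as precursor by at least one marker (hypothetical)"""
        unfi = repo.unfiltered()
        cache = unfi.obsstore.obscache
        cache.update(unfi)                       # loads from disk on first use
        revs = [r for r in unfi if cache.get(r)]
        lock = unfi.lock()
        try:
            cache.update(unfi)                   # catch up with concurrent writes
            cache.save(unfi)                     # persist for the next run
        finally:
            lock.release()
        return revs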