Patchwork [experimental] manifest: persist the manifestfulltext cache

login
register
mail settings
Submitter Martijn Pieters
Date July 16, 2018, 7:27 p.m.
Message ID <6b76d41520cded0655a6.1531769224@Martijns-MacBook-Pro.local>
Download mbox | patch
Permalink /patch/32874/
State New
Headers show

Comments

Martijn Pieters - July 16, 2018, 7:27 p.m.
# HG changeset patch
# User Martijn Pieters <mj@zopatista.com>
# Date 1531769057 -3600
#      Mon Jul 16 20:24:17 2018 +0100
# Node ID 6b76d41520cded0655a6214fdd515014a23b6b9c
# Parent  2a227782e75426a3e4408273a9a4eebe79dfdcea
# EXP-Topic persistentmanifestcache
manifest: persist the manifestfulltext cache

Reconstructing the manifest from the revlog takes time, so much so that there
already is a LRU cache to avoid having to load a manifest multiple times.

This patch persists that LRU cache in the .hg/cache directory, so we can re-use
this cache across hg commands. Commit benchmark (run on Macos 10.13 on a
2017-model Macbook Pro with Core i7 2.9GHz and flash drive), testing without
and with patch run 5 times, baseline is r2a227782e754:

* committing to an existing file, against the mozilla-central repository.
  Baseline real time average 1.9692, with patch 1.3786.

A new debugcommand "hg debugmanifestfulltextcache" lets you inspect the cache,
clear it, or add specific manifest nodeids to it. When calling
repo.updatecaches(), the manifest(s) for the working copy parents are added to
the cache.

Patch

diff -r 2a227782e754 -r 6b76d41520cd mercurial/debugcommands.py
--- a/mercurial/debugcommands.py	Wed Jul 11 16:44:33 2018 -0700
+++ b/mercurial/debugcommands.py	Mon Jul 16 20:24:17 2018 +0100
@@ -1446,6 +1446,53 @@ 
 
     return held
 
+@command('debugmanifestfulltextcache', [
+        ('', 'clear', False, _('clear the cache')),
+        ('a', 'add', '', _('add the given manifest node to the cache'),
+         _('NODE'))
+    ], '')
+def debugmanifestfulltextcache(ui, repo, add=None, **opts):
+    """show, clear or amend the contents of the manifest fulltext cache"""
+    with repo.lock():
+        r = repo.manifestlog._revlog
+        try:
+            cache = r._fulltextcache
+        except AttributeError:
+            ui.warn(_(
+                "Current revlog implementation doesn't appear to have a "
+                'manifest fulltext cache\n'))
+            return
+
+        if opts.get(r'clear'):
+            cache.clear()
+
+        if add:
+            try:
+                manifest = repo.manifestlog[r.lookup(add)]
+            except error.LookupError as e:
+                raise error.Abort(e, hint="Check your manifest node id")
+            manifest.read()  # stores revisision in cache too
+
+        if not len(cache):
+            ui.write(_('Cache empty'))
+        else:
+            ui.write(
+                _('Cache contains %d manifest entries, in order of most to '
+                  'least recent:\n') % (len(cache),))
+            totalsize = 0
+            for nodeid in cache:
+                # Use cache.get to not update the LRU order
+                data = cache.get(nodeid)
+                size = len(data)
+                totalsize += size + 24   # 20 bytes nodeid, 4 bytes size
+                ui.write(_('id: %s, size %s\n') % (
+                    hex(nodeid), util.bytecount(size)))
+            ondisk = cache._opener.stat('manifestfulltextcache').st_size
+            ui.write(
+                _('Total cache data size %s, on-disk %s\n') % (
+                    util.bytecount(totalsize), util.bytecount(ondisk))
+            )
+
 @command('debugmergestate', [], '')
 def debugmergestate(ui, repo, *args):
     """print merge state
diff -r 2a227782e754 -r 6b76d41520cd mercurial/localrepo.py
--- a/mercurial/localrepo.py	Wed Jul 11 16:44:33 2018 -0700
+++ b/mercurial/localrepo.py	Mon Jul 16 20:24:17 2018 +0100
@@ -1599,6 +1599,10 @@ 
                 rbc.branchinfo(r)
             rbc.write()
 
+            # ensure the working copy parents are in the manifestfulltextcache
+            for ctx in self['.'].parents():
+                ctx.manifest()  # accessing the manifest is enough
+
     def invalidatecaches(self):
 
         if '_tagscache' in vars(self):
diff -r 2a227782e754 -r 6b76d41520cd mercurial/manifest.py
--- a/mercurial/manifest.py	Wed Jul 11 16:44:33 2018 -0700
+++ b/mercurial/manifest.py	Mon Jul 16 20:24:17 2018 +0100
@@ -10,6 +10,7 @@ 
 import heapq
 import itertools
 import struct
+import weakref
 
 from .i18n import _
 from .node import (
@@ -1136,6 +1137,114 @@ 
             for subtree in subm.walksubtrees(matcher=matcher):
                 yield subtree
 
+class manifestfulltextcache(util.lrucachedict):
+    """File-backed LRU cache for the manifest cache
+
+    File consists of entries, up to EOF:
+
+    - 20 bytes node, 4 bytes length, <length> manifest data
+
+    These are written in reverse cache order (oldest to newest).
+
+    """
+    def __init__(self, max):
+        super(manifestfulltextcache, self).__init__(max)
+        self._dirty = False
+        self._read = False
+        self._opener = None
+
+    def read(self):
+        if self._read or self._opener is None:
+            return
+
+        try:
+            with self._opener('manifestfulltextcache') as fp:
+                set = super(manifestfulltextcache, self).__setitem__
+                # ignore trailing data, this is a cache, corruption is skipped
+                while True:
+                    node = fp.read(20)
+                    if len(node) < 20:
+                        break
+                    try:
+                        size = struct.unpack('>L', fp.read(4))[0]
+                    except struct.error:
+                        break
+                    value = bytearray(fp.read(size))
+                    if len(value) != size:
+                        break
+                    set(node, value)
+        except IOError:
+            # the file is allowed to be missing
+            pass
+
+        self._read = True
+        self._dirty = False
+
+    def write(self):
+        if not self._dirty or self._opener is None:
+            return
+        # rotate backwards to the first used node
+        with self._opener(
+                'manifestfulltextcache', 'w', atomictemp=True, checkambig=True
+            ) as fp:
+            node = self._head.prev
+            while True:
+                if node.key in self._cache:
+                    fp.write(node.key)
+                    fp.write(struct.pack('>L', len(node.value)))
+                    fp.write(node.value)
+                if node is self._head:
+                    break
+                node = node.prev
+
+    def __len__(self):
+        if not self._read:
+            self.read()
+        return super(manifestfulltextcache, self).__len__()
+
+    def __contains__(self, k):
+        if not self._read:
+            self.read()
+        return super(manifestfulltextcache, self).__contains__(k)
+
+    def __iter__(self):
+        if not self._read:
+            self.read()
+        return super(manifestfulltextcache, self).__iter__()
+
+    def __getitem__(self, k):
+        if not self._read:
+            self.read()
+        # the cache lru order can change on read
+        setdirty = self._cache.get(k) is not self._head
+        value = super(manifestfulltextcache, self).__getitem__(k)
+        if setdirty:
+            self._dirty = True
+        return value
+
+    def __setitem__(self, k, v):
+        if not self._read:
+            self.read()
+        super(manifestfulltextcache, self).__setitem__(k, v)
+        self._dirty = True
+
+    def __delitem__(self, k):
+        if not self._read:
+            self.read()
+        super(manifestfulltextcache, self).__delitem__(k)
+        self._dirty = True
+
+    def get(self, k, default=None):
+        if not self._read:
+            self.read()
+        return super(manifestfulltextcache, self).get(k, default=default)
+
+    def clear(self):
+        if not self._read:
+            self.read()
+        super(manifestfulltextcache, self).clear()
+        self._dirty = True
+
 class manifestrevlog(revlog.revlog):
     '''A revlog that stores manifest texts. This is responsible for caching the
     full-text manifest contents.
@@ -1164,7 +1273,7 @@ 
 
         self._treeondisk = optiontreemanifest or treemanifest
 
-        self._fulltextcache = util.lrucachedict(cachesize)
+        self._fulltextcache = manifestfulltextcache(cachesize)
 
         if dir:
             assert self._treeondisk, 'opts is %r' % opts
@@ -1186,6 +1295,28 @@ 
                                              checkambig=not bool(dir),
                                              mmaplargeindex=True)
 
+    def _setupmanifestcachehooks(self, repo):
+        """Persist the manifestfulltextcache on lock release"""
+        if not util.safehasattr(repo, '_lockref'):
+            return
+
+        self._fulltextcache._opener = repo.cachevfs
+        reporef = weakref.ref(repo)
+        manifestrevlogref = weakref.ref(self)
+
+        def persistmanifestcache():
+            repo = reporef()
+            self = manifestrevlogref()
+            if repo is None or self is None:
+                return
+            if repo.manifestlog._revlog is not self:
+                # there's a different manifest in play now, abort
+                return
+            self._fulltextcache.write()
+
+        if repo._currentlock(repo._lockref) is not None:
+            repo._afterlock(persistmanifestcache)
+
     @property
     def fulltextcache(self):
         return self._fulltextcache
@@ -1288,6 +1419,7 @@ 
         self._treeinmem = usetreemanifest
 
         self._revlog = repo._constructmanifest()
+        self._revlog._setupmanifestcachehooks(repo)
         self._narrowmatch = repo.narrowmatch()
 
         # A cache of the manifestctx or treemanifestctx for each directory
@@ -1421,9 +1553,12 @@ 
                 self._data = manifestdict()
             else:
                 rl = self._revlog()
-                text = rl.revision(self._node)
-                arraytext = bytearray(text)
-                rl._fulltextcache[self._node] = arraytext
+                if self._node in rl._fulltextcache:
+                    text = pycompat.bytestr(rl._fulltextcache[self._node])
+                else:
+                    text = rl.revision(self._node)
+                    arraytext = bytearray(text)
+                    rl._fulltextcache[self._node] = arraytext
                 self._data = manifestdict(text)
         return self._data
 
@@ -1523,9 +1658,12 @@ 
                 m.setnode(self._node)
                 self._data = m
             else:
-                text = rl.revision(self._node)
-                arraytext = bytearray(text)
-                rl.fulltextcache[self._node] = arraytext
+                if self._node in rl.fulltextcache:
+                    text = pycompat.bytestr(rl.fulltextcache[self._node])
+                else:
+                    text = rl.revision(self._node)
+                    arraytext = bytearray(text)
+                    rl.fulltextcache[self._node] = arraytext
                 self._data = treemanifest(dir=self._dir, text=text)
 
         return self._data
diff -r 2a227782e754 -r 6b76d41520cd tests/test-clone.t
--- a/tests/test-clone.t	Wed Jul 11 16:44:33 2018 -0700
+++ b/tests/test-clone.t	Mon Jul 16 20:24:17 2018 +0100
@@ -47,6 +47,7 @@ 
   checklink (symlink !)
   checklink-target (symlink !)
   checknoexec (execbit !)
+  manifestfulltextcache
   rbc-names-v1
   rbc-revs-v1
 
diff -r 2a227782e754 -r 6b76d41520cd tests/test-completion.t
--- a/tests/test-completion.t	Wed Jul 11 16:44:33 2018 -0700
+++ b/tests/test-completion.t	Mon Jul 16 20:24:17 2018 +0100
@@ -98,6 +98,7 @@ 
   debugknown
   debuglabelcomplete
   debuglocks
+  debugmanifestfulltextcache
   debugmergestate
   debugnamecomplete
   debugobsolete
@@ -284,6 +285,7 @@ 
   debugknown: 
   debuglabelcomplete: 
   debuglocks: force-lock, force-wlock, set-lock, set-wlock
+  debugmanifestfulltextcache: clear, add
   debugmergestate: 
   debugnamecomplete: 
   debugobsolete: flags, record-parents, rev, exclusive, index, delete, date, user, template
diff -r 2a227782e754 -r 6b76d41520cd tests/test-debugcommands.t
--- a/tests/test-debugcommands.t	Wed Jul 11 16:44:33 2018 -0700
+++ b/tests/test-debugcommands.t	Mon Jul 16 20:24:17 2018 +0100
@@ -411,6 +411,7 @@ 
   $ ls -r .hg/cache/*
   .hg/cache/rbc-revs-v1
   .hg/cache/rbc-names-v1
+  .hg/cache/manifestfulltextcache
   .hg/cache/branch2-served
 
 Test debugcolor
diff -r 2a227782e754 -r 6b76d41520cd tests/test-fncache.t
--- a/tests/test-fncache.t	Wed Jul 11 16:44:33 2018 -0700
+++ b/tests/test-fncache.t	Mon Jul 16 20:24:17 2018 +0100
@@ -88,6 +88,7 @@ 
   .hg/00manifest.i
   .hg/cache
   .hg/cache/branch2-served
+  .hg/cache/manifestfulltextcache
   .hg/cache/rbc-names-v1
   .hg/cache/rbc-revs-v1
   .hg/data
@@ -121,6 +122,7 @@ 
   .hg/00changelog.i
   .hg/cache
   .hg/cache/branch2-served
+  .hg/cache/manifestfulltextcache
   .hg/cache/rbc-names-v1
   .hg/cache/rbc-revs-v1
   .hg/dirstate
diff -r 2a227782e754 -r 6b76d41520cd tests/test-hardlinks.t
--- a/tests/test-hardlinks.t	Wed Jul 11 16:44:33 2018 -0700
+++ b/tests/test-hardlinks.t	Mon Jul 16 20:24:17 2018 +0100
@@ -241,6 +241,7 @@ 
   2 r4/.hg/cache/checkisexec (execbit !)
   ? r4/.hg/cache/checklink-target (glob) (symlink !)
   2 r4/.hg/cache/checknoexec (execbit !)
+  2 r4/.hg/cache/manifestfulltextcache
   2 r4/.hg/cache/rbc-names-v1
   2 r4/.hg/cache/rbc-revs-v1
   2 r4/.hg/dirstate
@@ -291,6 +292,7 @@ 
   2 r4/.hg/cache/checkisexec (execbit !)
   2 r4/.hg/cache/checklink-target (symlink !)
   2 r4/.hg/cache/checknoexec (execbit !)
+  2 r4/.hg/cache/manifestfulltextcache
   2 r4/.hg/cache/rbc-names-v1
   2 r4/.hg/cache/rbc-revs-v1
   1 r4/.hg/dirstate
diff -r 2a227782e754 -r 6b76d41520cd tests/test-help.t
--- a/tests/test-help.t	Wed Jul 11 16:44:33 2018 -0700
+++ b/tests/test-help.t	Mon Jul 16 20:24:17 2018 +0100
@@ -966,6 +966,9 @@ 
    debuginstall  test Mercurial installation
    debugknown    test whether node ids are known to a repo
    debuglocks    show or modify state of locks
+   debugmanifestfulltextcache
+                 show, clear or amend the contents of the manifest fulltext
+                 cache
    debugmergestate
                  print merge state
    debugnamecomplete
diff -r 2a227782e754 -r 6b76d41520cd tests/test-inherit-mode.t
--- a/tests/test-inherit-mode.t	Wed Jul 11 16:44:33 2018 -0700
+++ b/tests/test-inherit-mode.t	Mon Jul 16 20:24:17 2018 +0100
@@ -69,6 +69,7 @@ 
   00600 ./.hg/00changelog.i
   00770 ./.hg/cache/
   00660 ./.hg/cache/branch2-served
+  00660 ./.hg/cache/manifestfulltextcache
   00660 ./.hg/cache/rbc-names-v1
   00660 ./.hg/cache/rbc-revs-v1
   00660 ./.hg/dirstate
diff -r 2a227782e754 -r 6b76d41520cd tests/test-share.t
--- a/tests/test-share.t	Wed Jul 11 16:44:33 2018 -0700
+++ b/tests/test-share.t	Mon Jul 16 20:24:17 2018 +0100
@@ -32,6 +32,7 @@ 
   [1]
   $ ls -1 ../repo1/.hg/cache
   branch2-served
+  manifestfulltextcache
   rbc-names-v1
   rbc-revs-v1
   tags2-visible