Patchwork [2,of,3,v3] localrepo: persistent caching of branch names

login
register
mail settings
Submitter Mads Kiilerich
Date Oct. 16, 2014, 1:19 a.m.
Message ID <6514c63b1f0db65e2cf4.1413422387@ssl.google-analytics.com>
Download mbox | patch
Permalink /patch/6309/
State Superseded
Headers show

Comments

Mads Kiilerich - Oct. 16, 2014, 1:19 a.m.
# HG changeset patch
# User Mads Kiilerich <madski@unity3d.com>
# Date 1413422278 -7200
#      Thu Oct 16 03:17:58 2014 +0200
# Node ID 6514c63b1f0db65e2cf410c40f2420bfe8f9ef37
# Parent  1937d5ede058a1d2ad53ac1f9b6b755482fd9d8c
localrepo: persistent caching of branch names

It is expensive to retrieve the branch name. Very expensive when creating a
changectx and calling .branch() - slightly less when using
changelog.branchinfo().

Now, to really speed things up, cache the results on disk. To get efficient
lookup for revisions (constant size records) and avoid storing the same branch
name over and over, store the name of each branch once with a fixed ordering.
For each repo revision, store the node hash and the index of the branch name.
To make it 100% stable against repository mutations, always check the node hash
before using the cache content.

The code for this is kind of similar to the branchmap handling and is placed in
the same module even though the name is not completely spot on.

This new method promises to make some operations up to 20 times faster once it
is actually used.

A simpler approach that didn't store and validate node hashes for every
revision was significantly faster (x2) but could be tricked when modifying
history. The usual worst case would be that the whole cache was invalidated
when the repository history was modified, but when trying very hard it could be
tricked into not noticing changes.

Patch

diff --git a/mercurial/branchmap.py b/mercurial/branchmap.py
--- a/mercurial/branchmap.py
+++ b/mercurial/branchmap.py
@@ -9,6 +9,7 @@  from node import bin, hex, nullid, nullr
 import encoding
 import util
 import time
+import struct, array
 
 def _filename(repo):
     """name of a branch head cache file for a given repo or repoview"""
@@ -285,3 +286,95 @@  class branchheadcache(dict):
         duration = time.time() - starttime
         repo.ui.log('branchheadcache', 'updated %s branch head cache '
                     'in %.4f seconds\n', repo.filtername, duration)
+
+filename = 'cache/branchnames'
+formatversion = 2345164374
+headerfmt = '>LLL' # file header: version, start of records, length of records
+recfmt = '>20sH' # a record: node hash, branch name reference
+headersize = struct.calcsize(headerfmt)
+recsize = struct.calcsize(recfmt)
+
+class revbranchcache(object):
+    """Persistent cache mapping from revision number to branch.
+    Consistency is guaranteed by verifying the node hash."""
+
+    def __init__(self, repo):
+        self._repo = repo
+        self._loaded = False
+        self._dirty = False
+        self._names = [] # branch names referenced from recfmt records
+        self._records = array.array('c') # bytes with structs of type recfmt
+
+    def _load(self):
+        """Load cached branch names."""
+        try:
+            data = self._repo.vfs.open(filename).read()
+        except IOError:
+            data = ''
+
+        self._dirty = True
+        reporecslen = len(self._repo) * recsize
+        if len(data) >= headersize:
+            # header
+            v, recsstart, recslen = struct.unpack_from(headerfmt, data)
+            if v == formatversion and len(data) == recsstart + recslen:
+                # between header and records: \0 separated branch names
+                if recsstart != headersize:
+                    self._names = \
+                        data[headersize:recsstart].split('\0')
+                # read records, cap at repo size
+                self._records.fromstring(
+                    buffer(data, recsstart, min(recslen, reporecslen)))
+                # only dirty if too many records (after strip)
+                self._dirty = recslen > reporecslen
+            else:
+                self._repo.ui.debug('branch cache file was invalid\n')
+
+        # pad to repo size
+        if len(self._records) < reporecslen:
+            self._records.extend(
+                '\xff' * (reporecslen - len(self._records)))
+
+        self._branchnamesindex = dict((b, r)
+                                      for r, b in enumerate(self._names))
+        self._node = self._repo.changelog.node
+        self._branchinfo = self._repo.changelog.branchinfo
+        self._loaded = True
+
+    def branch(self, rev):
+        """Return branch name of rev, using and updating persistent cache."""
+        if not self._loaded:
+            self._load()
+
+        node = self._node(rev)
+        cachenode, branchidx = struct.unpack_from(recfmt, self._records,
+                                                  rev * recsize)
+        if cachenode == node and branchidx < len(self._names):
+            return self._names[branchidx]
+        b, _close = self._branchinfo(rev)
+        if b in self._branchnamesindex:
+            branchidx = self._branchnamesindex[b]
+        else:
+            branchidx = len(self._names)
+            self._names.append(b)
+            self._branchnamesindex[b] = branchidx
+        struct.pack_into(recfmt, self._records, rev * recsize,
+                         node, branchidx)
+        self._dirty = True
+        return b
+
+    def save(self):
+        """Save branch cache if it is dirty."""
+        if self._dirty:
+            self._repo.ui.debug('writing branch cache file\n')
+            try:
+                f = self._repo.vfs.open(filename, 'w', atomictemp=True)
+                s = '\0'.join(self._names)
+                f.write(struct.pack(headerfmt, formatversion,
+                                    headersize + len(s), len(self._records)))
+                f.write(s)
+                f.write(self._records)
+                f.close()
+            except IOError:
+                pass
+            self._dirty = False
diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -297,8 +297,10 @@  class localrepository(object):
         # - bookmark changes
         self.filteredrevcache = {}
 
+        self.revbranchcache = branchmap.revbranchcache(self)
+
     def close(self):
-        pass
+        self.revbranchcache.save()
 
     def _restrictcapabilities(self, caps):
         # bundle2 is not ready for prime time, drop it unless explicitly