Patchwork D6259: revset: on-disk cache for children queries

login
register
mail settings
Submitter phabricator
Date April 16, 2019, 10:43 p.m.
Message ID <differential-rev-PHID-DREV-t5ea2f73yhyt7nt2kjzz-req@phab.mercurial-scm.org>
Download mbox | patch
Permalink /patch/39671/
State New
Headers show

Comments

phabricator - April 16, 2019, 10:43 p.m.
joerg.sonnenberger created this revision.
Herald added a subscriber: mercurial-devel.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  This is a proof of concept for further discussion.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D6259

AFFECTED FILES
  mercurial/localrepo.py
  mercurial/revset.py

CHANGE DETAILS




To: joerg.sonnenberger, #hg-reviewers
Cc: mercurial-devel
phabricator - April 17, 2019, 10:54 a.m.
indygreg added a comment.


  Do you have performance numbers to share? Substantial wins would definitely pique my interest :)

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D6259

To: joerg.sonnenberger, #hg-reviewers
Cc: indygreg, mercurial-devel
phabricator - April 17, 2019, 12:54 p.m.
joerg.sonnenberger added a comment.


  For the NetBSD repository, a trivial test with the new cache:
  
    time hg log -r '1000~-400' -T {node} > /dev/null
    real	0m1.898s
    time hg log -r '1000~-400' -T {node} > /dev/null
    real	0m0.170s
    time hg log -r '1000~-1' -T {node} > /dev/null
    real	0m0.166s
    time hg log -r '440000~-1' -T {node} > /dev/null
    real	0m0.170s
  
  First one includes the time to initially warmup the cache.
  
  Without the cache:
  
    time hg log -r '1000~0' -T {node} > /dev/null
    real	0m0.196s
    time hg log -r '1000~-1' -T {node} > /dev/null
    real	0m0.825s
    time hg log -r '1000~-2' -T {node} > /dev/null
    real	0m1.288s
    time hg log -r '1000~-400' -T {node} > /dev/null
    real	3m23.201s
    time hg log -r '440000~-1' -T {node} > /dev/null
    real	0m0.182s
  
  In other words, building the cache is amortised by one or two queries for children early up in the tree. The cache still provides a good benefit nearer to tip.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D6259

To: joerg.sonnenberger, #hg-reviewers
Cc: indygreg, mercurial-devel
phabricator - April 17, 2019, 1:29 p.m.
martinvonz added a comment.


  Perhaps we'd want this to not be specifically for revsets? We could also write to it when we write to the changelog to keep it up to date.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D6259

To: joerg.sonnenberger, #hg-reviewers
Cc: martinvonz, indygreg, mercurial-devel

Patch

diff --git a/mercurial/revset.py b/mercurial/revset.py
--- a/mercurial/revset.py
+++ b/mercurial/revset.py
@@ -385,13 +385,13 @@ 
     cs = set()
     for r in getset(repo, fullreposet(repo), x):
         for i in range(n):
-            c = repo[r].children()
+            c = repo._childrencache.children(r)
             if len(c) == 0:
                 break
             if len(c) > 1:
                 raise error.RepoLookupError(
                     _("revision in set has more than one child"))
-            r = c[0].rev()
+            r = list(c)[0]
         else:
             cs.add(r)
     return subset & cs
@@ -586,18 +586,7 @@ 
 def _children(repo, subset, parentset):
     if not parentset:
         return baseset()
-    cs = set()
-    pr = repo.changelog.parentrevs
-    minrev = parentset.min()
-    nullrev = node.nullrev
-    for r in subset:
-        if r <= minrev:
-            continue
-        p1, p2 = pr(r)
-        if p1 in parentset:
-            cs.add(r)
-        if p2 != nullrev and p2 in parentset:
-            cs.add(r)
+    cs = set.union(*[set(repo._childrencache.children(c)) for c in parentset])
     return baseset(cs)
 
 @predicate('children(set)', safe=True)
diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -11,6 +11,7 @@ 
 import hashlib
 import os
 import random
+import struct
 import sys
 import time
 import weakref
@@ -994,6 +995,7 @@ 
 
         self._branchcaches = {}
         self._revbranchcache = None
+        self.__childrencache = None
         self._filterpats = {}
         self._datafilters = {}
         self._transref = self._lockref = self._wlockref = None
@@ -1084,6 +1086,8 @@ 
     def _writecaches(self):
         if self._revbranchcache:
             self._revbranchcache.write()
+        if self.__childrencache:
+            self.__childrencache.write()
 
     def _restrictcapabilities(self, caps):
         if self.ui.configbool('experimental', 'bundle2-advertise'):
@@ -1194,6 +1198,11 @@ 
         return manifest.manifestlog(self.svfs, self, rootstore,
                                     self._storenarrowmatch)
 
+    @unfilteredpropertycache
+    def _childrencache(self):
+        self.__childrencache = childrencache(self)
+        return self.__childrencache
+
     @repofilecache('dirstate')
     def dirstate(self):
         return self._makedirstate()
@@ -2087,6 +2096,8 @@ 
             # ensure the working copy parents are in the manifestfulltextcache
             for ctx in self['.'].parents():
                 ctx.manifest()  # accessing the manifest is enough
+            self._childrencache.clear()
+            self._childrencache.write()
 
     def invalidatecaches(self):
 
@@ -2097,6 +2108,8 @@ 
         self.unfiltered()._branchcaches.clear()
         self.invalidatevolatilesets()
         self._sparsesignaturecache.clear()
+        self._childrencache.clear()
+        self._childrencache.write()
 
     def invalidatevolatilesets(self):
         self.filteredrevcache.clear()
@@ -3087,3 +3100,110 @@ 
     # We may have a repoview, which intercepts __setattr__. So be sure
     # we operate at the lowest level possible.
     object.__setattr__(repo, r'__class__', poisonedrepository)
+
+class childrencache(object):
+    """Persistent cache, mapping from revision number to revision numbers of
+    the direct children. This is a low level cache, independent of filtering.
+    """
+    _filename = 'childrencache'
+    def __init__(self, repo, readonly=False):
+        assert repo.filtername is None
+        self._data = None
+        self._new_data = {}
+        self._read = False
+        self._repo = repo
+        self._readonly = readonly
+
+    def children(self, rev):
+        if not self._read:
+            self.read()
+        children = self._new_data.get(rev, set())
+        rev += 1
+        if rev >= self._lastknown:
+            return children
+        children = children.copy()
+        child_len, child_off = struct.unpack_from('>ll', self._data, 8 * rev)
+        if child_len == 1:
+            children.add(child_off)
+        elif child_len > 1:
+            start = 8 * self._lastknown + child_off * 4
+            if len(self._data) < start + child_len * 4:
+                raise error.Abort('Truncated children cache')
+            for i in xrange(child_len):
+                child, = struct.unpack_from('>l', self._data, start)
+                children.add(child)
+                start += 4
+        return children
+
+    def clear(self):
+        self._read = True
+        self._data = None
+        self._new_data = {}
+        self._lastknown = 0
+        self._update_changes(0)
+
+    def _update_changes(self, start):
+        idx = self._repo.changelog.index
+        for r in xrange(start, len(self._repo)):
+            rev = idx[r]
+            if rev[5] != nullrev:
+                self._new_data.setdefault(rev[5], set()).add(r)
+            if rev[6] != nullrev and rev[5] != rev[6]:
+                self._new_data.setdefault(rev[6], set()).add(r)
+            if rev[5] == nullrev and rev[6] == nullrev:
+                self._new_data.setdefault(nullrev, set()).add(r)
+
+    def read(self):
+        if self._read:
+            return
+        self._read = True
+        try:
+            data = self._repo.cachevfs.read(self._filename)
+        except (OSError, IOError):
+            data = ""
+        if len(data) < 24:
+            self._repo.ui.debug('children cache missing or too short, ignored')
+            self._lastknown = 0
+            self._update_changes(0)
+            return
+        tip, lastrev = struct.unpack_from('>20sl', data, 0)
+        if lastrev == 0:
+            self._repo.ui.debug('children cache empty, ignored')
+            self._lastknown = 0
+            self._update_changes(0)
+            return
+        if lastrev > len(self._repo) + 1 or self._repo.changelog.node(lastrev - 2) != tip:
+            self._lastknown = 0
+            self._update_changes(0)
+            self._repo.ui.debug('children cache does not match current repository state, ignored')
+            return
+        data = data[24:]
+        if lastrev * 8 > len(data):
+            self._lastknown = 0
+            self._update_changes(0)
+            self._repo.ui.debug('children cache was truncated, ignored')
+            return
+        self._data = data
+        self._lastknown = lastrev
+        self._update_changes(lastrev)
+
+    def write(self):
+        if not self._new_data or self._readonly:
+            return
+        lastrev = len(self._repo) + 1
+        data = [struct.pack('>20sl', self._repo.changelog.tip(), lastrev)]
+        data2 = []
+        for r in xrange(lastrev):
+            children = self.children(r - 1)
+            data.append(struct.pack('>l', len(children)))
+            if len(children) == 0:
+                data.append(struct.pack('>l', 0))
+            elif len(children) == 1:
+                children = list(children)
+                data.append(struct.pack('>l', children[0]))
+            else:
+                data.append(struct.pack('>l', len(data2)))
+                for c in children:
+                    data2.append(struct.pack('>l', c))
+        output = b''.join(data) + b''.join(data2)
+        self._repo.cachevfs.write(self._filename, output, atomictemp=True)