Patchwork [1,of,4,resend] ancestor: a new algorithm that is faster for nodes near tip

login
register
mail settings
Submitter Bryan O'Sullivan
Date April 16, 2013, 5:09 p.m.
Message ID <2f7186400a072015db1d.1366132163@australite.local>
Download mbox | patch
Permalink /patch/1348/
State Accepted
Commit 2f7186400a072015db1d962adfecaaac5dd40a24
Headers show

Comments

Bryan O'Sullivan - April 16, 2013, 5:09 p.m.
# HG changeset patch
# User Bryan O'Sullivan <bryano@fb.com>
# Date 1366132098 25200
#      Tue Apr 16 10:08:18 2013 -0700
# Node ID 2f7186400a072015db1d962adfecaaac5dd40a24
# Parent  43cb150e74f9107e22345d8af0e7c606f991d3e9
ancestor: a new algorithm that is faster for nodes near tip

Instead of walking all the way to the root of the DAG, we generate
a set of candidate GCA revs, then figure out which ones will win
the race to the root (usually without needing to traverse all the
way to the root).

In the common case of nodes that are close to each other in both
revision number and topology, this is usually a big win: it makes
"hg --time debugancestors" up to 9 times faster than the more general
ancestor function when measured on heads of the linux-2.6 hg repo.

Victory is not assured, however. The older function can still win
by a large margin if one node is much closer to the root than the
other, or by a much smaller amount if one is an ancestor of the
other.

For now, we've also got a small paranoid harness function that calls
both ancestor functions on every input and ensures that they give
equivalent answers.

Even without the checker function, the old ancestor function needs
to stay alive for the time being, as its generality is used by
context.filectx.merge.

Patch

diff --git a/mercurial/ancestor.py b/mercurial/ancestor.py
--- a/mercurial/ancestor.py
+++ b/mercurial/ancestor.py
@@ -5,10 +5,132 @@ 
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
-import heapq, util
+import error, heapq, util
 from node import nullrev
 
-def ancestor(a, b, pfunc):
+def ancestors(pfunc, *orignodes):
+    """
+    Returns the common ancestors of the revs in orignodes that are
+    furthest from a root (as measured by longest path).
+
+    pfunc must return a list of parent vertices for a given vertex.
+    """
+    if not isinstance(orignodes, set):  # *args always arrives as a tuple
+        orignodes = set(orignodes)
+    if nullrev in orignodes:
+        return set()
+    if len(orignodes) <= 1:
+        return orignodes
+
+    def candidates(nodes):  # collect the candidate GCA revs
+        allseen = (1 << len(nodes)) - 1  # one bit per input node, all set
+        seen = [0] * (max(nodes) + 1)  # seen[rev]: bits of nodes reaching rev
+        for i, n in enumerate(nodes):
+            seen[n] = 1 << i
+        poison = 1 << (i + 1)  # first bit above all per-node bits
+
+        gca = set()
+        interesting = left = len(nodes)
+        nv = len(seen) - 1
+        while nv >= 0 and interesting:  # walk revs from high to low
+            v = nv
+            nv -= 1
+            if not seen[v]:
+                continue
+            sv = seen[v]
+            if sv < poison:
+                interesting -= 1
+                if sv == allseen:
+                    gca.add(v)  # every input node reaches v
+                    sv |= poison  # ancestors of v must not be reported
+                    if v in nodes:
+                        left -= 1
+                        if left <= 1:
+                            # history is linear
+                            return set([v])
+            if sv < poison:
+                for p in pfunc(v):
+                    sp = seen[p]
+                    if p == nullrev:
+                        continue
+                    if sp == 0:
+                        seen[p] = sv
+                        interesting += 1
+                    elif sp != sv:
+                        seen[p] |= sv
+            else:
+                for p in pfunc(v):
+                    if p == nullrev:
+                        continue
+                    sp = seen[p]
+                    if sp and sp < poison:
+                        interesting -= 1
+                    seen[p] = sv  # pass the poison on to the parent
+        return gca
+
+    def deepest(nodes):  # narrow the candidates down by depth
+        interesting = {}  # bitmask -> count of revs carrying that mask
+        count = max(nodes) + 1
+        depth = [0] * count  # 0 means "not reached from any candidate"
+        seen = [0] * count
+        mapping = []  # (bit, rev) pairs, used to decode the final mask
+        for (i, n) in enumerate(sorted(nodes)):
+            depth[n] = 1
+            b = 1 << i
+            seen[n] = b
+            interesting[b] = 1
+            mapping.append((b, n))
+        nv = count - 1
+        while nv >= 0 and len(interesting) > 1:  # walk revs from high to low
+            v = nv
+            nv -= 1
+            dv = depth[v]
+            if dv == 0:
+                continue
+            sv = seen[v]
+            for p in pfunc(v):
+                if p == nullrev:
+                    continue
+                dp = depth[p]
+                nsp = sp = seen[p]
+                if dp <= dv:
+                    depth[p] = dv + 1  # longer path to p found through v
+                    if sp != sv:
+                        interesting[sv] += 1
+                        nsp = seen[p] = sv
+                        if sp:
+                            interesting[sp] -= 1
+                            if interesting[sp] == 0:
+                                del interesting[sp]
+                elif dv == dp - 1:  # equally long path: merge the masks
+                    nsp = sp | sv
+                    if nsp == sp:
+                        continue
+                    seen[p] = nsp
+                    interesting.setdefault(nsp, 0)
+                    interesting[nsp] += 1
+                    interesting[sp] -= 1
+                    if interesting[sp] == 0:
+                        del interesting[sp]
+            interesting[sv] -= 1
+            if interesting[sv] == 0:
+                del interesting[sv]
+
+        if len(interesting) != 1:
+            return []  # NB: a list here, a set on every other path
+
+        k = 0
+        for i in interesting:
+            k |= i
+        return set(n for (i, n) in mapping if k & i)
+
+    gca = candidates(orignodes)
+
+    if len(gca) <= 1:
+        return gca
+    return deepest(gca)
+
+def genericancestor(a, b, pfunc):
     """
     Returns the common ancestor of a and b that is furthest from a
     root (as measured by longest path) or None if no ancestor is
@@ -30,7 +152,7 @@  def ancestor(a, b, pfunc):
     depth = {}
     while visit:
         vertex = visit[-1]
-        pl = pfunc(vertex)
+        pl = [p for p in pfunc(vertex) if p != nullrev]
         parentcache[vertex] = pl
         if not pl:
             depth[vertex] = 0
@@ -91,6 +213,51 @@  def ancestor(a, b, pfunc):
     except StopIteration:
         return None
 
+def finddepths(nodes, pfunc):  # map nodes and their ancestors to a depth
+    visit = list(nodes)  # explicit stack, so no recursion is needed
+    rootpl = [nullrev, nullrev]
+    depth = {}
+    while visit:
+        vertex = visit[-1]
+        pl = pfunc(vertex)
+        if not pl or pl == rootpl:
+            depth[vertex] = 0  # no real parents: vertex is a root
+            visit.pop()
+        else:
+            for p in pl:
+                if p != nullrev and p not in depth:
+                    visit.append(p)
+            if visit[-1] == vertex:  # nothing pushed: parents all known
+                dp = [depth[p] for p in pl if p != nullrev]
+                if dp:
+                    depth[vertex] = max(dp) + 1  # longest path to a root
+                else:
+                    depth[vertex] = 0
+                visit.pop()
+    return depth
+
+def ancestor(a, b, pfunc):
+    """Paranoid harness: return a GCA of a and b (None if there is
+    none) after cross-checking ancestors() against genericancestor().
+    Raises error.RepoError when the two algorithms disagree."""
+    xs = ancestors(pfunc, a, b)
+    y = genericancestor(a, b, pfunc)
+    if y == -1:
+        y = None
+    if not xs or y is None:
+        if not xs and y is None:
+            return None
+        print xs, y
+        raise error.RepoError('ancestors disagree on whether a gca exists')
+    if y in xs:
+        return y
+    xds = finddepths(xs, pfunc)
+    xds = [xds[x] for x in xs]
+    yd = finddepths([y], pfunc)[y]
+    if [xd for xd in xds if xd != yd]:  # y differs: depths must all match
+        raise error.RepoError('ancestor depths do not match')
+    return xs.pop()
+
 def missingancestors(revs, bases, pfunc):
     """Return all the ancestors of revs that are not ancestors of bases.
 
diff --git a/mercurial/context.py b/mercurial/context.py
--- a/mercurial/context.py
+++ b/mercurial/context.py
@@ -756,7 +756,7 @@  class filectx(object):
             return pl
 
         a, b = (self._path, self._filenode), (fc2._path, fc2._filenode)
-        v = ancestor.ancestor(a, b, parents)
+        v = ancestor.genericancestor(a, b, parents)
         if v:
             f, n = v
             return filectx(self._repo, f, fileid=n, filelog=flcache[f])
diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -711,10 +711,7 @@  class revlog(object):
         if self.descendant(start, end):
             return self.node(start)
 
-        def parents(rev):
-            return [p for p in self.parentrevs(rev) if p != nullrev]
-
-        c = ancestor.ancestor(a, b, parents)
+        c = ancestor.ancestor(a, b, self.parentrevs)
         if c is None:
             return nullid