Patchwork [7,of,8] changegroup: fix treemanifests on merges

login
register
mail settings
Submitter Martin von Zweigbergk
Date Feb. 23, 2016, 7:52 p.m.
Message ID <1ab4ec0c1cc67b33b069.1456257152@martinvonz.mtv.corp.google.com>
Download mbox | patch
Permalink /patch/13330/
State Accepted
Headers show

Comments

Martin von Zweigbergk - Feb. 23, 2016, 7:52 p.m.
# HG changeset patch
# User Martin von Zweigbergk <martinvonz@google.com>
# Date 1455347349 28800
#      Fri Feb 12 23:09:09 2016 -0800
# Node ID 1ab4ec0c1cc67b33b069e41668bcf57215978df9
# Parent  54d19d993e13b9edabbabc518a99c47d75c85f76
changegroup: fix treemanifests on merges

The current code for generating treemanifest revisions takes the list
of files in the changeset and finds the directories from them. This
does not work for merges, since a merge may pick file A from one side
and file B from another and neither of them would appear in the
changeset's "files" list, but the manifest would still change.

Fix this by instead walking the root manifest log for all needed
revisions, storing all needed file and subdirectory revisions, then
recursively visiting the subdirectories. This also turns out to be
faster: cloning a version of hg core converted to treemanifests went
from ~28s to ~19s (timing somewhat unfair: before this patch, timed
until crash; after this patch, timed until manifests complete).

The new algorithm is used only on treemanifest repos. Although it
works equally well on flat manifests, we leave the iteration over
files in the changeset for flat manifests for now.

Patch

diff -r 54d19d993e13 -r 1ab4ec0c1cc6 mercurial/changegroup.py
--- a/mercurial/changegroup.py	Fri Feb 12 23:30:18 2016 -0800
+++ b/mercurial/changegroup.py	Fri Feb 12 23:09:09 2016 -0800
@@ -553,27 +553,6 @@ 
             return d
         return readexactly(self._fh, n)
 
-def _moddirs(files):
-    """Given a set of modified files, find the list of modified directories.
-
-    This returns a list of (path to changed dir, changed dir) tuples,
-    as that's what the one client needs anyway.
-
-    >>> _moddirs(['a/b/c.py', 'a/b/c.txt', 'a/d/e/f/g.txt', 'i.txt', ])
-    [('/', 'a/'), ('a/', 'b/'), ('a/', 'd/'), ('a/d/', 'e/'), ('a/d/e/', 'f/')]
-
-    """
-    alldirs = set()
-    for f in files:
-        path = f.split('/')[:-1]
-        for i in xrange(len(path) - 1, -1, -1):
-            dn = '/'.join(path[:i])
-            current = dn + '/', path[i] + '/'
-            if current in alldirs:
-                break
-            alldirs.add(current)
-    return sorted(alldirs)
-
 class cg1packer(object):
     deltaheader = _CHANGEGROUPV1_DELTA_HEADER
     version = '01'
@@ -755,7 +734,7 @@ 
     def generatemanifests(self, commonrevs, clrevorder, fastpathlinkrev, mfs,
                           mfchangedfiles, fnodes):
         repo = self._repo
-        ml = repo.manifest
+        dirlog = repo.manifest.dirlog
         tmfnodes = {'': mfs}
 
         # Callback for the manifest, used to collect linkrevs for filelog
@@ -766,9 +745,6 @@ 
                 assert not dir
                 return mfs.__getitem__
 
-            if dir:
-                return tmfnodes[dir].get
-
             def lookupmflinknode(x):
                 """Callback for looking up the linknode for manifests.
 
@@ -785,43 +761,34 @@ 
                 the client before you can trust the list of files and
                 treemanifests to send.
                 """
-                clnode = mfs[x]
-                # We no longer actually care about reading deltas of
-                # the manifest here, because we already know the list
-                # of changed files, so for treemanifests (which
-                # lazily-load anyway to *generate* a readdelta) we can
-                # just load them with read() and then we'll actually
-                # be able to correctly load node IDs from the
-                # submanifest entries.
+                clnode = tmfnodes[dir][x]
+                mdata = dirlog(dir).readshallowfast(x)
                 if 'treemanifest' in repo.requirements:
-                    mdata = ml.read(x)
+                    for p, n, fl in mdata.iterentries():
+                        if fl == 't': # subdirectory manifest
+                            subdir = dir + p + '/'
+                            tmfclnodes = tmfnodes.setdefault(subdir, {})
+                            tmfclnode = tmfclnodes.setdefault(n, clnode)
+                            if clrevorder[clnode] < clrevorder[tmfclnode]:
+                                tmfclnodes[n] = clnode
+                        else:
+                            f = dir + p
+                            fclnodes = fnodes.setdefault(f, {})
+                            fclnode = fclnodes.setdefault(n, clnode)
+                            if clrevorder[clnode] < clrevorder[fclnode]:
+                                fclnodes[n] = clnode
                 else:
-                    mdata = ml.readfast(x)
-                for f in mfchangedfiles[x]:
-                    try:
-                        n = mdata[f]
-                    except KeyError:
-                        continue
-                    # record the first changeset introducing this filelog
-                    # version
-                    fclnodes = fnodes.setdefault(f, {})
-                    fclnode = fclnodes.setdefault(n, clnode)
-                    if clrevorder[clnode] < clrevorder[fclnode]:
-                        fclnodes[n] = clnode
-                # gather list of changed treemanifest nodes
-                if 'treemanifest' in repo.requirements:
-                    submfs = {'/': mdata}
-                    for dn, bn in _moddirs(mfchangedfiles[x]):
+                    for f in mfchangedfiles[x]:
                         try:
-                            submf = submfs[dn]
-                            submf = submf._dirs[bn]
+                            n = mdata[f]
                         except KeyError:
-                            continue # deleted directory, so nothing to send
-                        submfs[submf.dir()] = submf
-                        tmfclnodes = tmfnodes.setdefault(submf.dir(), {})
-                        tmfclnode = tmfclnodes.setdefault(submf._node, clnode)
-                        if clrevorder[clnode] < clrevorder[tmfclnode]:
-                            tmfclnodes[n] = clnode
+                            continue
+                        # record the first changeset introducing this filelog
+                        # version
+                        fclnodes = fnodes.setdefault(f, {})
+                        fclnode = fclnodes.setdefault(n, clnode)
+                        if clrevorder[clnode] < clrevorder[fclnode]:
+                            fclnodes[n] = clnode
                 return clnode
             return lookupmflinknode
 
@@ -829,7 +796,7 @@ 
         while tmfnodes:
             dir = min(tmfnodes)
             nodes = tmfnodes[dir]
-            prunednodes = self.prune(ml.dirlog(dir), nodes, commonrevs)
+            prunednodes = self.prune(dirlog(dir), nodes, commonrevs)
             for x in self._packmanifests(dir, prunednodes,
                                          makelookupmflinknode(dir)):
                 size += len(x)
diff -r 54d19d993e13 -r 1ab4ec0c1cc6 mercurial/manifest.py
--- a/mercurial/manifest.py	Fri Feb 12 23:30:18 2016 -0800
+++ b/mercurial/manifest.py	Fri Feb 12 23:09:09 2016 -0800
@@ -959,6 +959,15 @@ 
             return self.readdelta(node)
         return self.read(node)
 
+    def readshallowfast(self, node):
+        '''like readfast(), but calls readshallowdelta() instead of readdelta()
+        '''
+        r = self.rev(node)
+        deltaparent = self.deltaparent(r)
+        if deltaparent != revlog.nullrev and deltaparent in self.parentrevs(r):
+            return self.readshallowdelta(node)
+        return self.readshallow(node)
+
     def read(self, node):
         if node == revlog.nullid:
             return self._newmanifest() # don't upset local cache
@@ -980,6 +989,13 @@ 
         self._mancache[node] = (m, arraytext)
         return m
 
+    def readshallow(self, node):
+        '''Reads the manifest in this directory. When using flat manifests,
+        this manifest will generally have files in subdirectories in it. Does
+        not cache the manifest as the callers generally do not read the same
+        version twice.'''
+        return manifestdict(self.revision(node))
+
     def find(self, node, f):
         '''look up entry for a single file efficiently.
         return (node, flags) pair if found, (None, None) if not.'''
diff -r 54d19d993e13 -r 1ab4ec0c1cc6 tests/test-treemanifest.t
--- a/tests/test-treemanifest.t	Fri Feb 12 23:30:18 2016 -0800
+++ b/tests/test-treemanifest.t	Fri Feb 12 23:09:09 2016 -0800
@@ -363,6 +363,13 @@ 
   added 11 changesets with 15 changes to 10 files (+3 heads)
   $ grep treemanifest clone/.hg/requires
   treemanifest
+  $ hg -R clone verify
+  checking changesets
+  checking manifests
+  checking directory manifests
+  crosschecking files in changesets and manifests
+  checking files
+  10 files, 11 changesets, 15 total revisions
 
 Create deeper repo with tree manifests.