Patchwork [1,of,6,cg3,v3] changegroup: avoid iterating the whole manifest

login
register
mail settings
Submitter Augie Fackler
Date Dec. 4, 2015, 7:38 p.m.
Message ID <bfdd624c667b6f72f206.1449257930@arthedain.pit.corp.google.com>
Download mbox | patch
Permalink /patch/11819/
State Accepted
Headers show

Comments

Augie Fackler - Dec. 4, 2015, 7:38 p.m.
# HG changeset patch
# User Augie Fackler <augie@google.com>
# Date 1449243298 18000
#      Fri Dec 04 10:34:58 2015 -0500
# Node ID bfdd624c667b6f72f2069a3537becad05e162c08
# Parent  30a20167ae29e1874163b59a489de0cb1d859565
# EXP-Topic cg3
changegroup: avoid iterating the whole manifest

The old code gathered the list of all files that changed anywhere in
history and then gathered changed file nodes by walking the entirety
of each manifest to be sent in order to gather changed file
nodes. That's going to be unfortunate for narrowhg, and it's probably
already inefficient for medium-to-large repositories.
Gregory Szorc - Dec. 4, 2015, 9:30 p.m.
On Fri, Dec 4, 2015 at 11:38 AM, Augie Fackler <raf@durin42.com> wrote:

> # HG changeset patch
> # User Augie Fackler <augie@google.com>
> # Date 1449243298 18000
> #      Fri Dec 04 10:34:58 2015 -0500
> # Node ID bfdd624c667b6f72f2069a3537becad05e162c08
> # Parent  30a20167ae29e1874163b59a489de0cb1d859565
> # EXP-Topic cg3
> changegroup: avoid iterating the whole manifest
>
> The old code gathered the list of all files that changed anywhere in
> history and then gathered changed file nodes by walking the entirety
> of each manifest to be sent in order to gather changed file
> nodes. That's going to be unfortunate for narrowhg, and it's probably
> already inefficient for medium-to-large repositories.
>
> diff --git a/mercurial/changegroup.py b/mercurial/changegroup.py
> --- a/mercurial/changegroup.py
> +++ b/mercurial/changegroup.py
> @@ -613,7 +613,8 @@ class cg1packer(object):
>          clrevorder = {}
>          mfs = {} # needed manifests
>          fnodes = {} # needed file nodes
> -        changedfiles = set()
> +        # maps manifest node id -> set(changed files)
> +        mfchangedfiles = {}
>
>          # Callback for the changelog, used to collect changed files and
> manifest
>          # nodes.
> @@ -621,9 +622,12 @@ class cg1packer(object):
>          def lookupcl(x):
>              c = cl.read(x)
>              clrevorder[x] = len(clrevorder)
> -            changedfiles.update(c[3])
> +            n = c[0]
>              # record the first changeset introducing this manifest version
> -            mfs.setdefault(c[0], x)
> +            mfs.setdefault(n, x)
> +            # Record a complete list of potentially-changed files in
> +            # this manifest.
> +            mfchangedfiles.setdefault(n, set()).update(c[3])
>              return x
>
>          self._verbosenote(_('uncompressed size of bundle content:\n'))
> @@ -668,8 +672,12 @@ class cg1packer(object):
>              clnode = mfs[x]
>              if not fastpathlinkrev:
>                  mdata = ml.readfast(x)
> -                for f, n in mdata.iteritems():
> -                    if f in changedfiles:
> +                for f in mfchangedfiles[x]:
> +                    if True:
> +                        try:
> +                            n = mdata[f]
> +                        except KeyError:
> +                            continue
>                          # record the first changeset introducing this
> filelog
>                          # version
>                          fclnodes = fnodes.setdefault(f, {})
> @@ -696,6 +704,9 @@ class cg1packer(object):
>                  return dict(genfilenodes())
>              return fnodes.get(fname, {})
>
> +        changedfiles = set()
> +        for x in mfchangedfiles.itervalues():
> +            changedfiles.update(x)
>          for chunk in self.generatefiles(changedfiles, linknodes,
> commonrevs,
>                                          source):
>              yield chunk
>

I haven't audited this completely, but there was an old bug in the rebase
extension (IIRC) that caused the changeset's files list to not be thorough.
(There may have been several variations of this actually.) You'd have to
diff the manifests to actually pick up all modified files.

So code like this that takes the shortcut of looking at the changeset's
file list instead of walking manifests makes me scared.
Augie Fackler - Dec. 4, 2015, 9:33 p.m.
On Fri, Dec 4, 2015 at 4:30 PM, Gregory Szorc <gregory.szorc@gmail.com> wrote:
> On Fri, Dec 4, 2015 at 11:38 AM, Augie Fackler <raf@durin42.com> wrote:
>>
>> # HG changeset patch
>> # User Augie Fackler <augie@google.com>
>> # Date 1449243298 18000
>> #      Fri Dec 04 10:34:58 2015 -0500
>> # Node ID bfdd624c667b6f72f2069a3537becad05e162c08
>> # Parent  30a20167ae29e1874163b59a489de0cb1d859565
>> # EXP-Topic cg3
>> changegroup: avoid iterating the whole manifest
>>
>> The old code gathered the list of all files that changed anywhere in
>> history and then gathered changed file nodes by walking the entirety
>> of each manifest to be sent in order to gather changed file
>> nodes. That's going to be unfortunate for narrowhg, and it's probably
>> already inefficient for medium-to-large repositories.
>>
>> diff --git a/mercurial/changegroup.py b/mercurial/changegroup.py
>> --- a/mercurial/changegroup.py
>> +++ b/mercurial/changegroup.py
>> @@ -613,7 +613,8 @@ class cg1packer(object):
>>          clrevorder = {}
>>          mfs = {} # needed manifests
>>          fnodes = {} # needed file nodes
>> -        changedfiles = set()
>> +        # maps manifest node id -> set(changed files)
>> +        mfchangedfiles = {}
>>
>>          # Callback for the changelog, used to collect changed files and
>> manifest
>>          # nodes.
>> @@ -621,9 +622,12 @@ class cg1packer(object):
>>          def lookupcl(x):
>>              c = cl.read(x)
>>              clrevorder[x] = len(clrevorder)
>> -            changedfiles.update(c[3])
>> +            n = c[0]
>>              # record the first changeset introducing this manifest
>> version
>> -            mfs.setdefault(c[0], x)
>> +            mfs.setdefault(n, x)
>> +            # Record a complete list of potentially-changed files in
>> +            # this manifest.
>> +            mfchangedfiles.setdefault(n, set()).update(c[3])
>>              return x
>>
>>          self._verbosenote(_('uncompressed size of bundle content:\n'))
>> @@ -668,8 +672,12 @@ class cg1packer(object):
>>              clnode = mfs[x]
>>              if not fastpathlinkrev:
>>                  mdata = ml.readfast(x)
>> -                for f, n in mdata.iteritems():
>> -                    if f in changedfiles:
>> +                for f in mfchangedfiles[x]:
>> +                    if True:
>> +                        try:
>> +                            n = mdata[f]
>> +                        except KeyError:
>> +                            continue
>>                          # record the first changeset introducing this
>> filelog
>>                          # version
>>                          fclnodes = fnodes.setdefault(f, {})
>> @@ -696,6 +704,9 @@ class cg1packer(object):
>>                  return dict(genfilenodes())
>>              return fnodes.get(fname, {})
>>
>> +        changedfiles = set()
>> +        for x in mfchangedfiles.itervalues():
>> +            changedfiles.update(x)
>>          for chunk in self.generatefiles(changedfiles, linknodes,
>> commonrevs,
>>                                          source):
>>              yield chunk
>
>
> I haven't audited this completely, but there was an old bug in the rebase
> extension (IIRC) that caused the changeset's files list to not be thorough.
> (There may have been several variations of this actually.) You'd have to
> diff the manifests to actually pick up all modified files.
>
> So code like this that takes the shortcut of looking at the changeset's file
> list instead of walking manifests makes me scared.

Look more closely. The old version took the union of the file lists
from all changesets and then looked at only those files in the
manifests anyway, it just did it in the dumbest way possible. The only
way the rebase bug you'd describe would be avoided by the old code and
not by the new code would be if the cg was going to include the file
anyway because it was touched in some other change in the changegroup.

Patch

diff --git a/mercurial/changegroup.py b/mercurial/changegroup.py
--- a/mercurial/changegroup.py
+++ b/mercurial/changegroup.py
@@ -613,7 +613,8 @@  class cg1packer(object):
         clrevorder = {}
         mfs = {} # needed manifests
         fnodes = {} # needed file nodes
-        changedfiles = set()
+        # maps manifest node id -> set(changed files)
+        mfchangedfiles = {}
 
         # Callback for the changelog, used to collect changed files and manifest
         # nodes.
@@ -621,9 +622,12 @@  class cg1packer(object):
         def lookupcl(x):
             c = cl.read(x)
             clrevorder[x] = len(clrevorder)
-            changedfiles.update(c[3])
+            n = c[0]
             # record the first changeset introducing this manifest version
-            mfs.setdefault(c[0], x)
+            mfs.setdefault(n, x)
+            # Record a complete list of potentially-changed files in
+            # this manifest.
+            mfchangedfiles.setdefault(n, set()).update(c[3])
             return x
 
         self._verbosenote(_('uncompressed size of bundle content:\n'))
@@ -668,8 +672,12 @@  class cg1packer(object):
             clnode = mfs[x]
             if not fastpathlinkrev:
                 mdata = ml.readfast(x)
-                for f, n in mdata.iteritems():
-                    if f in changedfiles:
+                for f in mfchangedfiles[x]:
+                    if True:
+                        try:
+                            n = mdata[f]
+                        except KeyError:
+                            continue
                         # record the first changeset introducing this filelog
                         # version
                         fclnodes = fnodes.setdefault(f, {})
@@ -696,6 +704,9 @@  class cg1packer(object):
                 return dict(genfilenodes())
             return fnodes.get(fname, {})
 
+        changedfiles = set()
+        for x in mfchangedfiles.itervalues():
+            changedfiles.update(x)
         for chunk in self.generatefiles(changedfiles, linknodes, commonrevs,
                                         source):
             yield chunk