Patchwork [1,of,2] copies: add matcher parameter to copy logic

login
register
mail settings
Submitter Durham Goode
Date April 16, 2015, 7:03 p.m.
Message ID <0270a44b46aabbaf794d.1429210987@dev2000.prn2.facebook.com>
Download mbox | patch
Permalink /patch/8720/
State Accepted
Headers show

Comments

Durham Goode - April 16, 2015, 7:03 p.m.
# HG changeset patch
# User Durham Goode <durham@fb.com>
# Date 1429208970 25200
#      Thu Apr 16 11:29:30 2015 -0700
# Node ID 0270a44b46aabbaf794dbd62b08b622313fe1b85
# Parent  c560d8c687916cb70a6d54c2c9ddcb5c9e457be2
copies: add matcher parameter to copy logic

This allows passing a matcher down the pathcopies() stack to _forwardcopies().
This will let us add logic in a later patch to avoid tracing copies when not
necessary (like when doing hg diff -r 1 -r 2 foo.txt).
Martin von Zweigbergk - April 16, 2015, 9:13 p.m.
On Thu, Apr 16, 2015 at 12:05 PM Durham Goode <durham@fb.com> wrote:

> # HG changeset patch
> # User Durham Goode <durham@fb.com>
> # Date 1429208970 25200
> #      Thu Apr 16 11:29:30 2015 -0700
> # Node ID 0270a44b46aabbaf794dbd62b08b622313fe1b85
> # Parent  c560d8c687916cb70a6d54c2c9ddcb5c9e457be2
> copies: add matcher parameter to copy logic
>
> This allows passing a matcher down the pathcopies() stack to
> _forwardcopies().
> This will let us add logic in a later patch to avoid tracing copies when
> not
> necessary (like when doing hg diff -r 1 -r 2 foo.txt).
>
> diff --git a/hgext/largefiles/overrides.py b/hgext/largefiles/overrides.py
> --- a/hgext/largefiles/overrides.py
> +++ b/hgext/largefiles/overrides.py
> @@ -547,8 +547,8 @@ def overridefilemerge(origfn, repo, myno
>          repo.wwrite(fcd.path(), fco.data(), fco.flags())
>      return 0
>
> -def copiespathcopies(orig, ctx1, ctx2):
> -    copies = orig(ctx1, ctx2)
> +def copiespathcopies(orig, ctx1, ctx2, match=None):
> +    copies = orig(ctx1, ctx2, match=match)
>      updated = {}
>
>      for k, v in copies.iteritems():
> diff --git a/mercurial/copies.py b/mercurial/copies.py
> --- a/mercurial/copies.py
> +++ b/mercurial/copies.py
> @@ -140,14 +140,16 @@ def _dirstatecopies(d):
>              del c[k]
>      return c
>
> -def _computeforwardmissing(a, b):
> +def _computeforwardmissing(a, b, match=None):
>      """Computes which files are in b but not a.
>      This is its own function so extensions can easily wrap this call to
> see what
>      files _forwardcopies is about to process.
>      """
> -    return b.manifest().filesnotin(a.manifest())
> +    for f in b.manifest().filesnotin(a.manifest()):
> +        if not match or match(f):
> +            yield f
>

I'd rather have this written as:
ma = a.manifest()
mb = b.manifest()
if match:
  ma = ma.matches(m)
  mb = mb.matches(m)
return mb.filesnotin(ma)

That avoids loading all modified subtrees when using tree manifests.
Hopefully it's not noticeably slower on flat manifests.

Also, you could do those steps in the caller. I don't remember if you need
the contexts for remotefilelog.

-def _forwardcopies(a, b):
+def _forwardcopies(a, b, match=None):
     '''find {dst@b: src@a} copy mapping where a is an ancestor of b'''

     # check for working copy
@@ -170,7 +172,7 @@ def _forwardcopies(a, b):
     # we currently don't try to find where old files went, too expensive
     # this means we can miss a case like 'hg rm b; hg cp a b'
     cm = {}
-    missing = _computeforwardmissing(a, b)
+    missing = _computeforwardmissing(a, b, match=match)
     ancestrycontext = a._repo.changelog.ancestors([b.rev()],
inclusive=True)
     for f in missing:
         fctx = b[f]
@@ -198,16 +200,17 @@ def _backwardrenames(a, b):
         r[v] = k
     return r

-def pathcopies(x, y):
+def pathcopies(x, y, match=None):
     '''find {dst@y: src@x} copy mapping for directed compare'''
     if x == y or not x or not y:
         return {}
     a = y.ancestor(x)
     if a == x:
-        return _forwardcopies(x, y)
+        return _forwardcopies(x, y, match=match)
     if a == y:
         return _backwardrenames(x, y)
-    return _chain(x, y, _backwardrenames(x, a), _forwardcopies(a, y))
+    return _chain(x, y, _backwardrenames(x, a),
+                  _forwardcopies(a, y, match=match))

 def _computenonoverlap(repo, c1, c2, addedinm1, addedinm2):
     """Computes, based on addedinm1 and addedinm2, the files exclusive to
c1
Durham Goode - April 16, 2015, 9:20 p.m.
On 4/16/15 5:13 PM, Martin von Zweigbergk wrote:
>
>
> On Thu, Apr 16, 2015 at 12:05 PM Durham Goode <durham@fb.com 
> <mailto:durham@fb.com>> wrote:
>
>     # HG changeset patch
>     # User Durham Goode <durham@fb.com <mailto:durham@fb.com>>
>     # Date 1429208970 25200
>     #      Thu Apr 16 11:29:30 2015 -0700
>     # Node ID 0270a44b46aabbaf794dbd62b08b622313fe1b85
>     # Parent  c560d8c687916cb70a6d54c2c9ddcb5c9e457be2
>     copies: add matcher parameter to copy logic
>
>     This allows passing a matcher down the pathcopies() stack to
>     _forwardcopies().
>     This will let us add logic in a later patch to avoid tracing
>     copies when not
>     necessary (like when doing hg diff -r 1 -r 2 foo.txt).
>
>     diff --git a/hgext/largefiles/overrides.py
>     b/hgext/largefiles/overrides.py
>     --- a/hgext/largefiles/overrides.py
>     +++ b/hgext/largefiles/overrides.py
>     @@ -547,8 +547,8 @@ def overridefilemerge(origfn, repo, myno
>              repo.wwrite(fcd.path(), fco.data(), fco.flags())
>          return 0
>
>     -def copiespathcopies(orig, ctx1, ctx2):
>     -    copies = orig(ctx1, ctx2)
>     +def copiespathcopies(orig, ctx1, ctx2, match=None):
>     +    copies = orig(ctx1, ctx2, match=match)
>          updated = {}
>
>          for k, v in copies.iteritems():
>     diff --git a/mercurial/copies.py b/mercurial/copies.py
>     --- a/mercurial/copies.py
>     +++ b/mercurial/copies.py
>     @@ -140,14 +140,16 @@ def _dirstatecopies(d):
>                  del c[k]
>          return c
>
>     -def _computeforwardmissing(a, b):
>     +def _computeforwardmissing(a, b, match=None):
>          """Computes which files are in b but not a.
>          This is its own function so extensions can easily wrap this
>     call to see what
>          files _forwardcopies is about to process.
>          """
>     -    return b.manifest().filesnotin(a.manifest())
>     +    for f in b.manifest().filesnotin(a.manifest()):
>     +        if not match or match(f):
>     +            yield f
>
>
> I'd rather have this written as:
> ma = a.manifest()
> mb = b.manifest()
> if match:
>   ma = ma.matches(m)
>   mb = mb.matches(m)
> return mb.filesnotin(ma)
>
> That avoids loading all modified subtrees when using tree manifests. 
> Hopefully it's not noticeably slower on flat manifests.
I can change it to that pattern.

It looks like prior to this change, the entire return value is iterated 
over by the caller, which would be bad for narrow manifests, right?  Are 
you saying that having the matcher here might have the side benefit of 
helping narrow manifests? (as long as we use this pattern you mentioned) 
Just so I'm clear on the effect.
>
> Also, you could do those steps in the caller. I don't remember if you 
> need the contexts for remotefilelog.
I need this in remotefilelog so I don't prefetch file contents that 
aren't needed.
Martin von Zweigbergk - April 16, 2015, 9:26 p.m.
On Thu, Apr 16, 2015 at 2:20 PM Durham Goode <durham@fb.com> wrote:

>
>
> On 4/16/15 5:13 PM, Martin von Zweigbergk wrote:
>
>
>
> On Thu, Apr 16, 2015 at 12:05 PM Durham Goode <durham@fb.com> wrote:
>
>> # HG changeset patch
>> # User Durham Goode <durham@fb.com>
>> # Date 1429208970 25200
>> #      Thu Apr 16 11:29:30 2015 -0700
>> # Node ID 0270a44b46aabbaf794dbd62b08b622313fe1b85
>> # Parent  c560d8c687916cb70a6d54c2c9ddcb5c9e457be2
>> copies: add matcher parameter to copy logic
>>
>> This allows passing a matcher down the pathcopies() stack to
>> _forwardcopies().
>> This will let us add logic in a later patch to avoid tracing copies when
>> not
>> necessary (like when doing hg diff -r 1 -r 2 foo.txt).
>>
>> diff --git a/hgext/largefiles/overrides.py b/hgext/largefiles/overrides.py
>> --- a/hgext/largefiles/overrides.py
>> +++ b/hgext/largefiles/overrides.py
>> @@ -547,8 +547,8 @@ def overridefilemerge(origfn, repo, myno
>>          repo.wwrite(fcd.path(), fco.data(), fco.flags())
>>      return 0
>>
>> -def copiespathcopies(orig, ctx1, ctx2):
>> -    copies = orig(ctx1, ctx2)
>> +def copiespathcopies(orig, ctx1, ctx2, match=None):
>> +    copies = orig(ctx1, ctx2, match=match)
>>      updated = {}
>>
>>      for k, v in copies.iteritems():
>> diff --git a/mercurial/copies.py b/mercurial/copies.py
>> --- a/mercurial/copies.py
>> +++ b/mercurial/copies.py
>> @@ -140,14 +140,16 @@ def _dirstatecopies(d):
>>              del c[k]
>>      return c
>>
>> -def _computeforwardmissing(a, b):
>> +def _computeforwardmissing(a, b, match=None):
>>      """Computes which files are in b but not a.
>>      This is its own function so extensions can easily wrap this call to
>> see what
>>      files _forwardcopies is about to process.
>>      """
>> -    return b.manifest().filesnotin(a.manifest())
>> +    for f in b.manifest().filesnotin(a.manifest()):
>> +        if not match or match(f):
>> +            yield f
>>
>
>  I'd rather have this written as:
>  ma = a.manifest()
> mb = b.manifest()
> if match:
>   ma = ma.matches(m)
>   mb = mb.matches(m)
> return mb.filesnotin(ma)
>
>  That avoids loading all modified subtrees when using tree manifests.
> Hopefully it's not noticeably slower on flat manifests.
>
> I can change it to that pattern.
>
> It looks like prior to this change, the entire return value is iterated
> over by the caller, which would be bad for narrow manifests, right?  Are
> you saying that having the matcher here might have the side benefit of
> helping narrow manifests? (as long as we use this pattern you mentioned)
> Just so I'm clear on the effect.
>

Correct. Once lazy loading of sub-manifests is in, it would load only the
sub-manifests that differ between a and b. After your change, it would load
even fewer. Thanks :-)

PS. There's a worse case in copies.mergecopies(), where it calls dirs() on
both contexts, which would require loading all the sub-manifests. I'm
planning on changing that at some point.


>
>
>  Also, you could do those steps in the caller. I don't remember if you
> need the contexts for remotefilelog.
>
> I need this in remotefilelog so I don't prefetch file contents that aren't
> needed.
>

Patch

diff --git a/hgext/largefiles/overrides.py b/hgext/largefiles/overrides.py
--- a/hgext/largefiles/overrides.py
+++ b/hgext/largefiles/overrides.py
@@ -547,8 +547,8 @@  def overridefilemerge(origfn, repo, myno
         repo.wwrite(fcd.path(), fco.data(), fco.flags())
     return 0
 
-def copiespathcopies(orig, ctx1, ctx2):
-    copies = orig(ctx1, ctx2)
+def copiespathcopies(orig, ctx1, ctx2, match=None):
+    copies = orig(ctx1, ctx2, match=match)
     updated = {}
 
     for k, v in copies.iteritems():
diff --git a/mercurial/copies.py b/mercurial/copies.py
--- a/mercurial/copies.py
+++ b/mercurial/copies.py
@@ -140,14 +140,16 @@  def _dirstatecopies(d):
             del c[k]
     return c
 
-def _computeforwardmissing(a, b):
+def _computeforwardmissing(a, b, match=None):
     """Computes which files are in b but not a.
     This is its own function so extensions can easily wrap this call to see what
     files _forwardcopies is about to process.
     """
-    return b.manifest().filesnotin(a.manifest())
+    for f in b.manifest().filesnotin(a.manifest()):
+        if not match or match(f):
+            yield f
 
-def _forwardcopies(a, b):
+def _forwardcopies(a, b, match=None):
     '''find {dst@b: src@a} copy mapping where a is an ancestor of b'''
 
     # check for working copy
@@ -170,7 +172,7 @@  def _forwardcopies(a, b):
     # we currently don't try to find where old files went, too expensive
     # this means we can miss a case like 'hg rm b; hg cp a b'
     cm = {}
-    missing = _computeforwardmissing(a, b)
+    missing = _computeforwardmissing(a, b, match=match)
     ancestrycontext = a._repo.changelog.ancestors([b.rev()], inclusive=True)
     for f in missing:
         fctx = b[f]
@@ -198,16 +200,17 @@  def _backwardrenames(a, b):
         r[v] = k
     return r
 
-def pathcopies(x, y):
+def pathcopies(x, y, match=None):
     '''find {dst@y: src@x} copy mapping for directed compare'''
     if x == y or not x or not y:
         return {}
     a = y.ancestor(x)
     if a == x:
-        return _forwardcopies(x, y)
+        return _forwardcopies(x, y, match=match)
     if a == y:
         return _backwardrenames(x, y)
-    return _chain(x, y, _backwardrenames(x, a), _forwardcopies(a, y))
+    return _chain(x, y, _backwardrenames(x, a),
+                  _forwardcopies(a, y, match=match))
 
 def _computenonoverlap(repo, c1, c2, addedinm1, addedinm2):
     """Computes, based on addedinm1 and addedinm2, the files exclusive to c1