Patchwork [6,of,6] manifest: rename matches to _matches

login
register
mail settings
Submitter Durham Goode
Date March 3, 2017, 7:34 p.m.
Message ID <1bed8ee65389133923b9.1488569661@dev111.prn1.facebook.com>
Download mbox | patch
Permalink /patch/18905/
State Accepted
Delegated to: Martin von Zweigbergk
Headers show

Comments

Durham Goode - March 3, 2017, 7:34 p.m.
# HG changeset patch
# User Durham Goode <durham@fb.com>
# Date 1488517595 28800
#      Thu Mar 02 21:06:35 2017 -0800
# Node ID 1bed8ee65389133923b9076909e04f66ef26f1b8
# Parent  207763e895c7d24885df22f5b9c0df5494d77daf
manifest: rename matches to _matches

Now that there are no external consumers of manifest.matches, let's rename it to
_matches so it remains internal. This means alternative manifest implementations
no longer have to implement it, and can instead implement efficient versions of
diff() and filesnotin().
via Mercurial-devel - March 7, 2017, 1:32 a.m.
On Fri, Mar 3, 2017 at 11:34 AM, Durham Goode <durham@fb.com> wrote:
> # HG changeset patch
> # User Durham Goode <durham@fb.com>
> # Date 1488517595 28800
> #      Thu Mar 02 21:06:35 2017 -0800
> # Node ID 1bed8ee65389133923b9076909e04f66ef26f1b8
> # Parent  207763e895c7d24885df22f5b9c0df5494d77daf
> manifest: rename matches to _matches
>
> Now that there are no external consumers of manifest.matches, let's rename it to
> _matches so it remains internal. This means alternative manifest implementations
> no longer have to implement it, and can instead implement efficient versions of
> diff() and filesnotin().

matches() still seems like a sensible method. Once "hg files" gains a
"-v" or "--flags" option so it can really replace "hg manifest" (or if
"hg manifest" started accepting a file pattern), it seems natural to
implement that using manifest.matches(). Would it be a problem to
queue patches 1-5 but not this one?

>
> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> --- a/mercurial/manifest.py
> +++ b/mercurial/manifest.py
> @@ -448,8 +448,8 @@ class manifestdict(object):
>      def filesnotin(self, m2, match=None):
>          '''Set of files in this manifest that are not in the other'''
>          if match:
> -            m1 = self.matches(match)
> -            m2 = m2.matches(match)
> +            m1 = self._matches(match)
> +            m2 = m2._matches(match)
>              return m1.filesnotin(m2)
>          diff = self.diff(m2)
>          files = set(filepath
> @@ -510,7 +510,7 @@ class manifestdict(object):
>              if not self.hasdir(fn):
>                  match.bad(fn, None)
>
> -    def matches(self, match):
> +    def _matches(self, match):
>          '''generate a new manifest filtered by the match argument'''
>          if match.always():
>              return self.copy()
> @@ -543,8 +543,8 @@ class manifestdict(object):
>          string.
>          '''
>          if match:
> -            m1 = self.matches(match)
> -            m2 = m2.matches(match)
> +            m1 = self._matches(match)
> +            m2 = m2._matches(match)
>              return m1.diff(m2, clean=clean)
>          return self._lm.diff(m2._lm, clean)
>
> @@ -917,8 +917,8 @@ class treemanifest(object):
>      def filesnotin(self, m2, match=None):
>          '''Set of files in this manifest that are not in the other'''
>          if match:
> -            m1 = self.matches(match)
> -            m2 = m2.matches(match)
> +            m1 = self._matches(match)
> +            m2 = m2._matches(match)
>              return m1.filesnotin(m2)
>
>          files = set()
> @@ -1002,7 +1002,7 @@ class treemanifest(object):
>                  for f in self._dirs[p]._walk(match):
>                      yield f
>
> -    def matches(self, match):
> +    def _matches(self, match):
>          '''generate a new manifest filtered by the match argument'''
>          if match.always():
>              return self.copy()
> @@ -1054,8 +1054,8 @@ class treemanifest(object):
>          string.
>          '''
>          if match:
> -            m1 = self.matches(match)
> -            m2 = m2.matches(match)
> +            m1 = self._matches(match)
> +            m2 = m2._matches(match)
>              return m1.diff(m2, clean=clean)
>          result = {}
>          emptytree = treemanifest()
> diff --git a/tests/test-manifest.py b/tests/test-manifest.py
> --- a/tests/test-manifest.py
> +++ b/tests/test-manifest.py
> @@ -233,7 +233,7 @@ class basemanifesttests(object):
>          self.assertEqual(want, m['foo'])
>          # make sure the suffix survives a copy
>          match = matchmod.match('', '', ['re:foo'])
> -        m2 = m.matches(match)
> +        m2 = m._matches(match)
>          self.assertEqual(want, m2['foo'])
>          self.assertEqual(1, len(m2))
>          m2 = m.copy()
> @@ -255,7 +255,7 @@ class basemanifesttests(object):
>                  assert False
>              return True
>          match.matchfn = filt
> -        self.assertRaises(AssertionError, m.matches, match)
> +        self.assertRaises(AssertionError, m._matches, match)
>
>      def testRemoveItem(self):
>          m = self.parsemanifest(A_SHORT_MANIFEST)
> @@ -358,7 +358,7 @@ class basemanifesttests(object):
>
>          match = matchmod.match('/', '',
>                  ['file1', 'file200', 'file300'], exact=True)
> -        m2 = m.matches(match)
> +        m2 = m._matches(match)
>
>          w = ('file1\0%sx\n'
>               'file200\0%sl\n'
> @@ -374,7 +374,7 @@ class basemanifesttests(object):
>          match = matchmod.match('/', '',
>                  ['a/b/c/bar.txt', 'a/b/d/qux.py', 'readme.txt', 'nonexistent'],
>                  exact=True)
> -        m2 = m.matches(match)
> +        m2 = m._matches(match)
>
>          self.assertEqual(
>                  ['a/b/c/bar.txt', 'a/b/d/qux.py', 'readme.txt'],
> @@ -386,7 +386,7 @@ class basemanifesttests(object):
>          m = self.parsemanifest(A_DEEPER_MANIFEST)
>
>          match = matchmod.match('/', '', ['a/f'], default='relpath')
> -        m2 = m.matches(match)
> +        m2 = m._matches(match)
>
>          self.assertEqual([], m2.keys())
>
> @@ -397,7 +397,7 @@ class basemanifesttests(object):
>
>          flist = m.keys()[80:300]
>          match = matchmod.match('/', '', flist, exact=True)
> -        m2 = m.matches(match)
> +        m2 = m._matches(match)
>
>          self.assertEqual(flist, m2.keys())
>
> @@ -406,7 +406,7 @@ class basemanifesttests(object):
>          m = self.parsemanifest(A_DEEPER_MANIFEST)
>
>          match = matchmod.match('/', '', [''])
> -        m2 = m.matches(match)
> +        m2 = m._matches(match)
>
>          self.assertEqual(m.keys(), m2.keys())
>
> @@ -416,7 +416,7 @@ class basemanifesttests(object):
>          m = self.parsemanifest(A_DEEPER_MANIFEST)
>
>          match = matchmod.match('/', '', ['a/b'], default='relpath')
> -        m2 = m.matches(match)
> +        m2 = m._matches(match)
>
>          self.assertEqual([
>              'a/b/c/bar.py', 'a/b/c/bar.txt', 'a/b/c/foo.py', 'a/b/c/foo.txt',
> @@ -430,7 +430,7 @@ class basemanifesttests(object):
>          m = self.parsemanifest(A_DEEPER_MANIFEST)
>
>          match = matchmod.match('/', '', ['a/b'], exact=True)
> -        m2 = m.matches(match)
> +        m2 = m._matches(match)
>
>          self.assertEqual([], m2.keys())
>
> @@ -440,7 +440,7 @@ class basemanifesttests(object):
>          m = self.parsemanifest(A_DEEPER_MANIFEST)
>
>          match = matchmod.match('/', 'a/b', ['.'], default='relpath')
> -        m2 = m.matches(match)
> +        m2 = m._matches(match)
>
>          self.assertEqual([
>              'a/b/c/bar.py', 'a/b/c/bar.txt', 'a/b/c/foo.py', 'a/b/c/foo.txt',
> @@ -453,7 +453,7 @@ class basemanifesttests(object):
>          m = self.parsemanifest(A_DEEPER_MANIFEST)
>
>          match = matchmod.match('/', '', ['a/b/*/*.txt'])
> -        m2 = m.matches(match)
> +        m2 = m._matches(match)
>
>          self.assertEqual(
>                  ['a/b/c/bar.txt', 'a/b/c/foo.txt', 'a/b/d/ten.txt'],
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Durham Goode - March 7, 2017, 1:40 a.m.
On 3/6/17 5:32 PM, Martin von Zweigbergk wrote:
> On Fri, Mar 3, 2017 at 11:34 AM, Durham Goode <durham@fb.com> wrote:
>> # HG changeset patch
>> # User Durham Goode <durham@fb.com>
>> # Date 1488517595 28800
>> #      Thu Mar 02 21:06:35 2017 -0800
>> # Node ID 1bed8ee65389133923b9076909e04f66ef26f1b8
>> # Parent  207763e895c7d24885df22f5b9c0df5494d77daf
>> manifest: rename matches to _matches
>>
>> Now that there are no external consumers of manifest.matches, let's rename it to
>> _matches so it remains internal. This means alternative manifest implementations
>> no longer have to implement it, and can instead implement efficient versions of
>> diff() and filesnotin().
>
> matches() still seems like a sensible method. Once "hg files" gains a
> "-v" or "--flags" option so it can really replace "hg manifest" (or if
> "hg manifest" started accepting a file pattern), it seems natural to
> implement that using manifest.matches(). Would it be a problem to
> queue patches 1-5 but not this one?

It's not a problem to queue 1-5 without 6, but given the performance 
criticalness of the manifest, I'd lean towards not giving it functions 
that could enable future bad algorithms.  If we need the ability to 
iterate over the entire manifest in the future, maybe we could add a 
manifest.walk(match=None) function that returns an iterator.  The 
'walk()' part exposes caller to the fact that it is going to be an 
expensive operation, and the iterator aspect means we don't have to 
allocate an entire dict and we don't have to worry about people abusing 
it as a way of doing naive set math using manifests.  That gets the 
manifest a little closer to resembling a dirstate as well.
via Mercurial-devel - March 7, 2017, 6:31 a.m.
On Mon, Mar 6, 2017 at 5:40 PM, Durham Goode <durham@fb.com> wrote:
>
>
> On 3/6/17 5:32 PM, Martin von Zweigbergk wrote:
>>
>> On Fri, Mar 3, 2017 at 11:34 AM, Durham Goode <durham@fb.com> wrote:
>>>
>>> # HG changeset patch
>>> # User Durham Goode <durham@fb.com>
>>> # Date 1488517595 28800
>>> #      Thu Mar 02 21:06:35 2017 -0800
>>> # Node ID 1bed8ee65389133923b9076909e04f66ef26f1b8
>>> # Parent  207763e895c7d24885df22f5b9c0df5494d77daf
>>> manifest: rename matches to _matches
>>>
>>> Now that there are no external consumers of manifest.matches, let's
>>> rename it to
>>> _matches so it remains internal. This means alternative manifest
>>> implementations
>>> no longer have to implement it, and can instead implement efficient
>>> versions of
>>> diff() and filesnotin().
>>
>>
>> matches() still seems like a sensible method. Once "hg files" gains a
>> "-v" or "--flags" option so it can really replace "hg manifest" (or if
>> "hg manifest" started accepting a file pattern), it seems natural to
>> implement that using manifest.matches(). Would it be a problem to
>> queue patches 1-5 but not this one?
>
>
> It's not a problem to queue 1-5 without 6, but given the performance
> criticalness of the manifest, I'd lean towards not giving it functions that
> could enable future bad algorithms.  If we need the ability to iterate over
> the entire manifest in the future, maybe we could add a
> manifest.walk(match=None) function that returns an iterator.

That already exists, but it walks the filenames only, not the
(filename,node,flags) tuples :-) But that's besides the point, I know;
we'd just need to find a different name.

> The 'walk()'
> part exposes caller to the fact that it is going to be an expensive
> operation, and the iterator aspect means we don't have to allocate an entire
> dict and we don't have to worry about people abusing it as a way of doing
> naive set math using manifests.  That gets the manifest a little closer to
> resembling a dirstate as well.

That makes sense. However, there's no such warning to users who do
*not* use a matcher. I think I took care of all those cases two years
ago, but if we want to make it less likely that someone does
set(mf.match(m)), then it seems even more important to prevent
set(mf).  So isn't it __iter__ and friends we should make explicit?

I'd also like to be able to verify that this is going in the right
direction. Could you share the remainder of the series too (as a
link)? Thanks.
Durham Goode - March 7, 2017, 5:37 p.m.
On 3/6/17 10:31 PM, Martin von Zweigbergk wrote:
> On Mon, Mar 6, 2017 at 5:40 PM, Durham Goode <durham@fb.com> wrote:
>>
>>
>> On 3/6/17 5:32 PM, Martin von Zweigbergk wrote:
>>>
>>> On Fri, Mar 3, 2017 at 11:34 AM, Durham Goode <durham@fb.com> wrote:
>>>>
>>>> # HG changeset patch
>>>> # User Durham Goode <durham@fb.com>
>>>> # Date 1488517595 28800
>>>> #      Thu Mar 02 21:06:35 2017 -0800
>>>> # Node ID 1bed8ee65389133923b9076909e04f66ef26f1b8
>>>> # Parent  207763e895c7d24885df22f5b9c0df5494d77daf
>>>> manifest: rename matches to _matches
>>>>
>>>> Now that there are no external consumers of manifest.matches, let's
>>>> rename it to
>>>> _matches so it remains internal. This means alternative manifest
>>>> implementations
>>>> no longer have to implement it, and can instead implement efficient
>>>> versions of
>>>> diff() and filesnotin().
>>>
>>>
>>> matches() still seems like a sensible method. Once "hg files" gains a
>>> "-v" or "--flags" option so it can really replace "hg manifest" (or if
>>> "hg manifest" started accepting a file pattern), it seems natural to
>>> implement that using manifest.matches(). Would it be a problem to
>>> queue patches 1-5 but not this one?
>>
>>
>> It's not a problem to queue 1-5 without 6, but given the performance
>> criticalness of the manifest, I'd lean towards not giving it functions that
>> could enable future bad algorithms.  If we need the ability to iterate over
>> the entire manifest in the future, maybe we could add a
>> manifest.walk(match=None) function that returns an iterator.
>
> That already exists, but it walks the filenames only, not the
> (filename,node,flags) tuples :-) But that's besides the point, I know;
> we'd just need to find a different name.
>
>> The 'walk()'
>> part exposes caller to the fact that it is going to be an expensive
>> operation, and the iterator aspect means we don't have to allocate an entire
>> dict and we don't have to worry about people abusing it as a way of doing
>> naive set math using manifests.  That gets the manifest a little closer to
>> resembling a dirstate as well.
>
> That makes sense. However, there's no such warning to users who do
> *not* use a matcher. I think I took care of all those cases two years
> ago, but if we want to make it less likely that someone does
> set(mf.match(m)), then it seems even more important to prevent
> set(mf).  So isn't it __iter__ and friends we should make explicit?

Yep, we should probably head in the direction of making all full 
iteration operations explicit.

> I'd also like to be able to verify that this is going in the right
> direction. Could you share the remainder of the series too (as a
> link)? Thanks.

The only new commits on top of this so far are to change 
treemanifest.diff() to actually limit it's diff based on the visitdir() 
responses of the matcher.  You can see it here: 
https://bpaste.net/show/cba5f57820f1 (two patches, one to update tests, 
one to adjust the diff implementation. I'm working on migrating our 
internal treemanifest imlpementation to use this pattern now, so I can 
get perf numbers. Then I'll come back and do the same for 
treemanifest.filesnotin()
via Mercurial-devel - March 7, 2017, 10:29 p.m.
On Tue, Mar 7, 2017 at 9:37 AM, Durham Goode <durham@fb.com> wrote:
>
>
> On 3/6/17 10:31 PM, Martin von Zweigbergk wrote:
>>
>> On Mon, Mar 6, 2017 at 5:40 PM, Durham Goode <durham@fb.com> wrote:
>>>
>>>
>>>
>>> On 3/6/17 5:32 PM, Martin von Zweigbergk wrote:
>>>>
>>>>
>>>> On Fri, Mar 3, 2017 at 11:34 AM, Durham Goode <durham@fb.com> wrote:
>>>>>
>>>>>
>>>>> # HG changeset patch
>>>>> # User Durham Goode <durham@fb.com>
>>>>> # Date 1488517595 28800
>>>>> #      Thu Mar 02 21:06:35 2017 -0800
>>>>> # Node ID 1bed8ee65389133923b9076909e04f66ef26f1b8
>>>>> # Parent  207763e895c7d24885df22f5b9c0df5494d77daf
>>>>> manifest: rename matches to _matches
>>>>>
>>>>> Now that there are no external consumers of manifest.matches, let's
>>>>> rename it to
>>>>> _matches so it remains internal.

treemanifest already had a _matches(), so now it has two...

>>>>> This means alternative manifest
>>>>> implementations
>>>>> no longer have to implement it, and can instead implement efficient
>>>>> versions of
>>>>> diff() and filesnotin().
>>>>
>>>>
>>>>
>>>> matches() still seems like a sensible method. Once "hg files" gains a
>>>> "-v" or "--flags" option so it can really replace "hg manifest" (or if
>>>> "hg manifest" started accepting a file pattern), it seems natural to
>>>> implement that using manifest.matches(). Would it be a problem to
>>>> queue patches 1-5 but not this one?
>>>
>>>
>>>
>>> It's not a problem to queue 1-5 without 6, but given the performance
>>> criticalness of the manifest, I'd lean towards not giving it functions
>>> that
>>> could enable future bad algorithms.  If we need the ability to iterate
>>> over
>>> the entire manifest in the future, maybe we could add a
>>> manifest.walk(match=None) function that returns an iterator.
>>
>>
>> That already exists, but it walks the filenames only, not the
>> (filename,node,flags) tuples :-) But that's besides the point, I know;
>> we'd just need to find a different name.
>>
>>> The 'walk()'
>>> part exposes caller to the fact that it is going to be an expensive
>>> operation, and the iterator aspect means we don't have to allocate an
>>> entire
>>> dict and we don't have to worry about people abusing it as a way of doing
>>> naive set math using manifests.  That gets the manifest a little closer
>>> to
>>> resembling a dirstate as well.
>>
>>
>> That makes sense. However, there's no such warning to users who do
>> *not* use a matcher. I think I took care of all those cases two years
>> ago, but if we want to make it less likely that someone does
>> set(mf.match(m)), then it seems even more important to prevent
>> set(mf).  So isn't it __iter__ and friends we should make explicit?
>
>
> Yep, we should probably head in the direction of making all full iteration
> operations explicit.
>
>> I'd also like to be able to verify that this is going in the right
>> direction. Could you share the remainder of the series too (as a
>> link)? Thanks.
>
>
> The only new commits on top of this so far are to change treemanifest.diff()
> to actually limit it's diff based on the visitdir() responses of the
> matcher.  You can see it here: https://bpaste.net/show/cba5f57820f1 (two
> patches, one to update tests, one to adjust the diff implementation. I'm
> working on migrating our internal treemanifest imlpementation to use this
> pattern now, so I can get perf numbers. Then I'll come back and do the same
> for treemanifest.filesnotin()

I'd really like to see the performance gain from this series, since
that's the motivating force behind it. I've spent some time trying it
myself. I added these two lines in treemanifest.diff._diff():

            if match and not match.visitdir(t1._dir or '.'):
                return

IIUC, the point of the series is to make diffs and status calls
(across revisions) faster when there are many matching files but a
small diff. I tried "time hg st --rev .~1 --rev . js" on some
arbitrary commit I picked from a treemanifest version of the Mozilla
repo. That speeds up from 1.3s to 0.18s. Nice! It's such a simple fix
to get that speedup, so can I get you to include it in the series? I
would have preferred to see something like:

1. add match argument to diff() and filesnotin() and optimize them
2. make copies use new manifest.diff() api and see how it's faster
3. same for merge
4. ...

That would make it an easier sell for me. But again, what I cared most
about was data to back up your claim, so I'd really like at least to
have that part of the series.

Btw, an alternative approach would have been to make
manifest.matches() lazy, but I think making it explicit like your
series does is better.
Durham Goode - March 7, 2017, 10:45 p.m.
On 3/7/17 2:29 PM, Martin von Zweigbergk wrote:
> On Tue, Mar 7, 2017 at 9:37 AM, Durham Goode <durham@fb.com> wrote:
>>
>>
>> On 3/6/17 10:31 PM, Martin von Zweigbergk wrote:
>>>
>>> On Mon, Mar 6, 2017 at 5:40 PM, Durham Goode <durham@fb.com> wrote:
>>>>
>>>>
>>>>
>>>> On 3/6/17 5:32 PM, Martin von Zweigbergk wrote:
>>>>>
>>>>>
>>>>> On Fri, Mar 3, 2017 at 11:34 AM, Durham Goode <durham@fb.com> wrote:
>>>>>>
>>>>>>
>>>>>> # HG changeset patch
>>>>>> # User Durham Goode <durham@fb.com>
>>>>>> # Date 1488517595 28800
>>>>>> #      Thu Mar 02 21:06:35 2017 -0800
>>>>>> # Node ID 1bed8ee65389133923b9076909e04f66ef26f1b8
>>>>>> # Parent  207763e895c7d24885df22f5b9c0df5494d77daf
>>>>>> manifest: rename matches to _matches
>>>>>>
>>>>>> Now that there are no external consumers of manifest.matches, let's
>>>>>> rename it to
>>>>>> _matches so it remains internal.
>
> treemanifest already had a _matches(), so now it has two...
>
>>>>>> This means alternative manifest
>>>>>> implementations
>>>>>> no longer have to implement it, and can instead implement efficient
>>>>>> versions of
>>>>>> diff() and filesnotin().
>>>>>
>>>>>
>>>>>
>>>>> matches() still seems like a sensible method. Once "hg files" gains a
>>>>> "-v" or "--flags" option so it can really replace "hg manifest" (or if
>>>>> "hg manifest" started accepting a file pattern), it seems natural to
>>>>> implement that using manifest.matches(). Would it be a problem to
>>>>> queue patches 1-5 but not this one?
>>>>
>>>>
>>>>
>>>> It's not a problem to queue 1-5 without 6, but given the performance
>>>> criticalness of the manifest, I'd lean towards not giving it functions
>>>> that
>>>> could enable future bad algorithms.  If we need the ability to iterate
>>>> over
>>>> the entire manifest in the future, maybe we could add a
>>>> manifest.walk(match=None) function that returns an iterator.
>>>
>>>
>>> That already exists, but it walks the filenames only, not the
>>> (filename,node,flags) tuples :-) But that's besides the point, I know;
>>> we'd just need to find a different name.
>>>
>>>> The 'walk()'
>>>> part exposes caller to the fact that it is going to be an expensive
>>>> operation, and the iterator aspect means we don't have to allocate an
>>>> entire
>>>> dict and we don't have to worry about people abusing it as a way of doing
>>>> naive set math using manifests.  That gets the manifest a little closer
>>>> to
>>>> resembling a dirstate as well.
>>>
>>>
>>> That makes sense. However, there's no such warning to users who do
>>> *not* use a matcher. I think I took care of all those cases two years
>>> ago, but if we want to make it less likely that someone does
>>> set(mf.match(m)), then it seems even more important to prevent
>>> set(mf).  So isn't it __iter__ and friends we should make explicit?
>>
>>
>> Yep, we should probably head in the direction of making all full iteration
>> operations explicit.
>>
>>> I'd also like to be able to verify that this is going in the right
>>> direction. Could you share the remainder of the series too (as a
>>> link)? Thanks.
>>
>>
>> The only new commits on top of this so far are to change treemanifest.diff()
>> to actually limit it's diff based on the visitdir() responses of the
>> matcher.  You can see it here: https://urldefense.proofpoint.com/v2/url?u=https-3A__bpaste.net_show_cba5f57820f1&d=DwIBaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=nuarHzhP1wi1T9iURRCj1A&m=uvgL5NLjKD18j2hB6yeJEE-7YJg1wLpx_R1fsWaDkwg&s=oWl_cfGafh-_1LwCNrhAraVcMcVOLDKn3rcReKm5BY4&e=  (two
>> patches, one to update tests, one to adjust the diff implementation. I'm
>> working on migrating our internal treemanifest imlpementation to use this
>> pattern now, so I can get perf numbers. Then I'll come back and do the same
>> for treemanifest.filesnotin()
>
> I'd really like to see the performance gain from this series, since
> that's the motivating force behind it. I've spent some time trying it
> myself. I added these two lines in treemanifest.diff._diff():
>
>             if match and not match.visitdir(t1._dir or '.'):
>                 return
>
> IIUC, the point of the series is to make diffs and status calls
> (across revisions) faster when there are many matching files but a
> small diff. I tried "time hg st --rev .~1 --rev . js" on some
> arbitrary commit I picked from a treemanifest version of the Mozilla
> repo. That speeds up from 1.3s to 0.18s. Nice! It's such a simple fix
> to get that speedup, so can I get you to include it in the series? I
> would have preferred to see something like:

Is the treemanifest version of mozilla available somewhere? Or would I 
need to convert it?

I can attach my two remaining patches (tests + diff optimization) to 
this series if you want. The two lines you added aren't quite enough to 
be a complete patch (t1._dir isn't the full path, and we need to handle 
file matching as well).

> 1. add match argument to diff() and filesnotin() and optimize them
> 2. make copies use new manifest.diff() api and see how it's faster
> 3. same for merge
> 4. ...
>
> That would make it an easier sell for me. But again, what I cared most
> about was data to back up your claim, so I'd really like at least to
> have that part of the series.

I went with this approach because it put all the easy, 
non-logic-affecting changes at the front for easy queueing. That put the 
least blockers between the refactor getting into core and me being able 
to make the needed changes to our internal treemanifest to get perf 
numbers on our repo, since I don't have a large vanilla treemanifest to 
test against. I assumed the promised O() changes were sufficient to 
justify the refactor at least. Then I'd provide perf numbers when 
sending the optimization patches.

> Btw, an alternative approach would have been to make
> manifest.matches() lazy, but I think making it explicit like your
> series does is better.

I considered that, but came to the same conclusion. And the code for a 
lazy one would be a lot more intricate I think and I'd have to replicate 
that in our native treemanifest.
via Mercurial-devel - March 7, 2017, 11:16 p.m.
On Tue, Mar 7, 2017 at 2:45 PM, Durham Goode <durham@fb.com> wrote:
>
>
> On 3/7/17 2:29 PM, Martin von Zweigbergk wrote:
>>
>> On Tue, Mar 7, 2017 at 9:37 AM, Durham Goode <durham@fb.com> wrote:
>>>
>>>
>>>
>>> On 3/6/17 10:31 PM, Martin von Zweigbergk wrote:
>>>>
>>>>
>>>> On Mon, Mar 6, 2017 at 5:40 PM, Durham Goode <durham@fb.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> On 3/6/17 5:32 PM, Martin von Zweigbergk wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On Fri, Mar 3, 2017 at 11:34 AM, Durham Goode <durham@fb.com> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> # HG changeset patch
>>>>>>> # User Durham Goode <durham@fb.com>
>>>>>>> # Date 1488517595 28800
>>>>>>> #      Thu Mar 02 21:06:35 2017 -0800
>>>>>>> # Node ID 1bed8ee65389133923b9076909e04f66ef26f1b8
>>>>>>> # Parent  207763e895c7d24885df22f5b9c0df5494d77daf
>>>>>>> manifest: rename matches to _matches
>>>>>>>
>>>>>>> Now that there are no external consumers of manifest.matches, let's
>>>>>>> rename it to
>>>>>>> _matches so it remains internal.
>>
>>
>> treemanifest already had a _matches(), so now it has two...
>>
>>>>>>> This means alternative manifest
>>>>>>> implementations
>>>>>>> no longer have to implement it, and can instead implement efficient
>>>>>>> versions of
>>>>>>> diff() and filesnotin().
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> matches() still seems like a sensible method. Once "hg files" gains a
>>>>>> "-v" or "--flags" option so it can really replace "hg manifest" (or if
>>>>>> "hg manifest" started accepting a file pattern), it seems natural to
>>>>>> implement that using manifest.matches(). Would it be a problem to
>>>>>> queue patches 1-5 but not this one?
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> It's not a problem to queue 1-5 without 6, but given the performance
>>>>> criticalness of the manifest, I'd lean towards not giving it functions
>>>>> that
>>>>> could enable future bad algorithms.  If we need the ability to iterate
>>>>> over
>>>>> the entire manifest in the future, maybe we could add a
>>>>> manifest.walk(match=None) function that returns an iterator.
>>>>
>>>>
>>>>
>>>> That already exists, but it walks the filenames only, not the
>>>> (filename,node,flags) tuples :-) But that's besides the point, I know;
>>>> we'd just need to find a different name.
>>>>
>>>>> The 'walk()'
>>>>> part exposes caller to the fact that it is going to be an expensive
>>>>> operation, and the iterator aspect means we don't have to allocate an
>>>>> entire
>>>>> dict and we don't have to worry about people abusing it as a way of
>>>>> doing
>>>>> naive set math using manifests.  That gets the manifest a little closer
>>>>> to
>>>>> resembling a dirstate as well.
>>>>
>>>>
>>>>
>>>> That makes sense. However, there's no such warning to users who do
>>>> *not* use a matcher. I think I took care of all those cases two years
>>>> ago, but if we want to make it less likely that someone does
>>>> set(mf.match(m)), then it seems even more important to prevent
>>>> set(mf).  So isn't it __iter__ and friends we should make explicit?
>>>
>>>
>>>
>>> Yep, we should probably head in the direction of making all full
>>> iteration
>>> operations explicit.
>>>
>>>> I'd also like to be able to verify that this is going in the right
>>>> direction. Could you share the remainder of the series too (as a
>>>> link)? Thanks.
>>>
>>>
>>>
>>> The only new commits on top of this so far are to change
>>> treemanifest.diff()
>>> to actually limit it's diff based on the visitdir() responses of the
>>> matcher.  You can see it here:
>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__bpaste.net_show_cba5f57820f1&d=DwIBaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=nuarHzhP1wi1T9iURRCj1A&m=uvgL5NLjKD18j2hB6yeJEE-7YJg1wLpx_R1fsWaDkwg&s=oWl_cfGafh-_1LwCNrhAraVcMcVOLDKn3rcReKm5BY4&e=
>>> (two
>>> patches, one to update tests, one to adjust the diff implementation. I'm
>>> working on migrating our internal treemanifest imlpementation to use this
>>> pattern now, so I can get perf numbers. Then I'll come back and do the
>>> same
>>> for treemanifest.filesnotin()
>>
>>
>> I'd really like to see the performance gain from this series, since
>> that's the motivating force behind it. I've spent some time trying it
>> myself. I added these two lines in treemanifest.diff._diff():
>>
>>             if match and not match.visitdir(t1._dir or '.'):
>>                 return
>>
>> IIUC, the point of the series is to make diffs and status calls
>> (across revisions) faster when there are many matching files but a
>> small diff. I tried "time hg st --rev .~1 --rev . js" on some
>> arbitrary commit I picked from a treemanifest version of the Mozilla
>> repo. That speeds up from 1.3s to 0.18s. Nice! It's such a simple fix
>> to get that speedup, so can I get you to include it in the series? I
>> would have preferred to see something like:
>
>
> Is the treemanifest version of mozilla available somewhere? Or would I need
> to convert it?

Unfortunately not, and I don't know of an easy way of sharing it.

>
> I can attach my two remaining patches (tests + diff optimization) to this
> series if you want.

Yes, please!

> The two lines you added aren't quite enough to be a
> complete patch (t1._dir isn't the full path, and we need to handle file
> matching as well).
>
>> 1. add match argument to diff() and filesnotin() and optimize them
>> 2. make copies use new manifest.diff() api and see how it's faster
>> 3. same for merge
>> 4. ...
>>
>> That would make it an easier sell for me. But again, what I cared most
>> about was data to back up your claim, so I'd really like at least to
>> have that part of the series.
>
>
> I went with this approach because it put all the easy, non-logic-affecting
> changes at the front for easy queueing. That put the least blockers between
> the refactor getting into core and me being able to make the needed changes
> to our internal treemanifest to get perf numbers on our repo, since I don't
> have a large vanilla treemanifest to test against.

Oh, I had thought you would be able to test it on some large, internal
FB repo. Here's the repo I used:
https://drive.google.com/file/d/0B-XY4343--fQOXktQzdaTjlGcG8/view?usp=sharing

> I assumed the promised
> O() changes were sufficient to justify the refactor at least. Then I'd
> provide perf numbers when sending the optimization patches.

I agreed that they they made sense, but I always like to have numbers
to see that my intuition (and yours in this case) is correct in
practice.

>
>> Btw, an alternative approach would have been to make
>> manifest.matches() lazy, but I think making it explicit like your
>> series does is better.
>
>
> I considered that, but came to the same conclusion. And the code for a lazy
> one would be a lot more intricate I think and I'd have to replicate that in
> our native treemanifest.

Patch

diff --git a/mercurial/manifest.py b/mercurial/manifest.py
--- a/mercurial/manifest.py
+++ b/mercurial/manifest.py
@@ -448,8 +448,8 @@  class manifestdict(object):
     def filesnotin(self, m2, match=None):
         '''Set of files in this manifest that are not in the other'''
         if match:
-            m1 = self.matches(match)
-            m2 = m2.matches(match)
+            m1 = self._matches(match)
+            m2 = m2._matches(match)
             return m1.filesnotin(m2)
         diff = self.diff(m2)
         files = set(filepath
@@ -510,7 +510,7 @@  class manifestdict(object):
             if not self.hasdir(fn):
                 match.bad(fn, None)
 
-    def matches(self, match):
+    def _matches(self, match):
         '''generate a new manifest filtered by the match argument'''
         if match.always():
             return self.copy()
@@ -543,8 +543,8 @@  class manifestdict(object):
         string.
         '''
         if match:
-            m1 = self.matches(match)
-            m2 = m2.matches(match)
+            m1 = self._matches(match)
+            m2 = m2._matches(match)
             return m1.diff(m2, clean=clean)
         return self._lm.diff(m2._lm, clean)
 
@@ -917,8 +917,8 @@  class treemanifest(object):
     def filesnotin(self, m2, match=None):
         '''Set of files in this manifest that are not in the other'''
         if match:
-            m1 = self.matches(match)
-            m2 = m2.matches(match)
+            m1 = self._matches(match)
+            m2 = m2._matches(match)
             return m1.filesnotin(m2)
 
         files = set()
@@ -1002,7 +1002,7 @@  class treemanifest(object):
                 for f in self._dirs[p]._walk(match):
                     yield f
 
-    def matches(self, match):
+    def _matches(self, match):
         '''generate a new manifest filtered by the match argument'''
         if match.always():
             return self.copy()
@@ -1054,8 +1054,8 @@  class treemanifest(object):
         string.
         '''
         if match:
-            m1 = self.matches(match)
-            m2 = m2.matches(match)
+            m1 = self._matches(match)
+            m2 = m2._matches(match)
             return m1.diff(m2, clean=clean)
         result = {}
         emptytree = treemanifest()
diff --git a/tests/test-manifest.py b/tests/test-manifest.py
--- a/tests/test-manifest.py
+++ b/tests/test-manifest.py
@@ -233,7 +233,7 @@  class basemanifesttests(object):
         self.assertEqual(want, m['foo'])
         # make sure the suffix survives a copy
         match = matchmod.match('', '', ['re:foo'])
-        m2 = m.matches(match)
+        m2 = m._matches(match)
         self.assertEqual(want, m2['foo'])
         self.assertEqual(1, len(m2))
         m2 = m.copy()
@@ -255,7 +255,7 @@  class basemanifesttests(object):
                 assert False
             return True
         match.matchfn = filt
-        self.assertRaises(AssertionError, m.matches, match)
+        self.assertRaises(AssertionError, m._matches, match)
 
     def testRemoveItem(self):
         m = self.parsemanifest(A_SHORT_MANIFEST)
@@ -358,7 +358,7 @@  class basemanifesttests(object):
 
         match = matchmod.match('/', '',
                 ['file1', 'file200', 'file300'], exact=True)
-        m2 = m.matches(match)
+        m2 = m._matches(match)
 
         w = ('file1\0%sx\n'
              'file200\0%sl\n'
@@ -374,7 +374,7 @@  class basemanifesttests(object):
         match = matchmod.match('/', '',
                 ['a/b/c/bar.txt', 'a/b/d/qux.py', 'readme.txt', 'nonexistent'],
                 exact=True)
-        m2 = m.matches(match)
+        m2 = m._matches(match)
 
         self.assertEqual(
                 ['a/b/c/bar.txt', 'a/b/d/qux.py', 'readme.txt'],
@@ -386,7 +386,7 @@  class basemanifesttests(object):
         m = self.parsemanifest(A_DEEPER_MANIFEST)
 
         match = matchmod.match('/', '', ['a/f'], default='relpath')
-        m2 = m.matches(match)
+        m2 = m._matches(match)
 
         self.assertEqual([], m2.keys())
 
@@ -397,7 +397,7 @@  class basemanifesttests(object):
 
         flist = m.keys()[80:300]
         match = matchmod.match('/', '', flist, exact=True)
-        m2 = m.matches(match)
+        m2 = m._matches(match)
 
         self.assertEqual(flist, m2.keys())
 
@@ -406,7 +406,7 @@  class basemanifesttests(object):
         m = self.parsemanifest(A_DEEPER_MANIFEST)
 
         match = matchmod.match('/', '', [''])
-        m2 = m.matches(match)
+        m2 = m._matches(match)
 
         self.assertEqual(m.keys(), m2.keys())
 
@@ -416,7 +416,7 @@  class basemanifesttests(object):
         m = self.parsemanifest(A_DEEPER_MANIFEST)
 
         match = matchmod.match('/', '', ['a/b'], default='relpath')
-        m2 = m.matches(match)
+        m2 = m._matches(match)
 
         self.assertEqual([
             'a/b/c/bar.py', 'a/b/c/bar.txt', 'a/b/c/foo.py', 'a/b/c/foo.txt',
@@ -430,7 +430,7 @@  class basemanifesttests(object):
         m = self.parsemanifest(A_DEEPER_MANIFEST)
 
         match = matchmod.match('/', '', ['a/b'], exact=True)
-        m2 = m.matches(match)
+        m2 = m._matches(match)
 
         self.assertEqual([], m2.keys())
 
@@ -440,7 +440,7 @@  class basemanifesttests(object):
         m = self.parsemanifest(A_DEEPER_MANIFEST)
 
         match = matchmod.match('/', 'a/b', ['.'], default='relpath')
-        m2 = m.matches(match)
+        m2 = m._matches(match)
 
         self.assertEqual([
             'a/b/c/bar.py', 'a/b/c/bar.txt', 'a/b/c/foo.py', 'a/b/c/foo.txt',
@@ -453,7 +453,7 @@  class basemanifesttests(object):
         m = self.parsemanifest(A_DEEPER_MANIFEST)
 
         match = matchmod.match('/', '', ['a/b/*/*.txt'])
-        m2 = m.matches(match)
+        m2 = m._matches(match)
 
         self.assertEqual(
                 ['a/b/c/bar.txt', 'a/b/c/foo.txt', 'a/b/d/ten.txt'],