Patchwork [10,of,10,V2] treemanifest: optimize diff using the matcher

login
register
mail settings
Submitter Durham Goode
Date March 8, 2017, 3:22 a.m.
Message ID <541bf866729342f534ba.1488943362@dev111.prn1.facebook.com>
Download mbox | patch
Permalink /patch/18985/
State Changes Requested
Delegated to: Martin von Zweigbergk
Headers show

Comments

Durham Goode - March 8, 2017, 3:22 a.m.
# HG changeset patch
# User Durham Goode <durham@fb.com>
# Date 1488943242 28800
#      Tue Mar 07 19:20:42 2017 -0800
# Node ID 541bf866729342f534bac425bd8f01b9fe7564e8
# Parent  611fac63adb09c326912e56df59c828ad12ffd9f
treemanifest: optimize diff using the matcher

This optimizes treemanifest.diff() to limit the tree traversal based on the
provided matcher. According to Martin's testing, `hg status --rev .~1 --rev .
foo/` goes from 1.3s to 0.18s on a tree version of Mozilla central. I'd expect
and even greater saving on larger internal repos at big companies.

A previous patch added test coverage for treemanifest diff with patterns.
via Mercurial-devel - March 8, 2017, 9:01 p.m.
On Tue, Mar 7, 2017 at 7:22 PM, Durham Goode <durham@fb.com> wrote:
> # HG changeset patch
> # User Durham Goode <durham@fb.com>
> # Date 1488943242 28800
> #      Tue Mar 07 19:20:42 2017 -0800
> # Node ID 541bf866729342f534bac425bd8f01b9fe7564e8
> # Parent  611fac63adb09c326912e56df59c828ad12ffd9f
> treemanifest: optimize diff using the matcher
>
> This optimizes treemanifest.diff() to limit the tree traversal based on the
> provided matcher. According to Martin's testing, `hg status --rev .~1 --rev .
> foo/` goes from 1.3s to 0.18s on a tree version of Mozilla central. I'd expect
> and even greater saving on larger internal repos at big companies.
>
> A previous patch added test coverage for treemanifest diff with patterns.
>
> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> --- a/mercurial/manifest.py
> +++ b/mercurial/manifest.py
> @@ -1053,10 +1053,6 @@ class treemanifest(object):
>          the nodeid will be None and the flags will be the empty
>          string.
>          '''
> -        if match:
> -            m1 = self._matches(match)
> -            m2 = m2._matches(match)
> -            return m1.diff(m2, clean=clean)
>          result = {}
>          emptytree = treemanifest()
>          def _diff(t1, t2):
> @@ -1065,26 +1061,31 @@ class treemanifest(object):
>              t1._load()
>              t2._load()
>              for d, m1 in t1._dirs.iteritems():
> -                m2 = t2._dirs.get(d, emptytree)
> -                _diff(m1, m2)
> +                if not match or match.visitdir(os.path.join(t1.dir(), d[:-1])):
> +                    m2 = t2._dirs.get(d, emptytree)
> +                    _diff(m1, m2)
>
>              for d, m2 in t2._dirs.iteritems():
>                  if d not in t1._dirs:
> -                    _diff(emptytree, m2)
> +                    if (not match or match.visitdir(os.path.join(t2.dir(),
> +                                                                 d[:-1]))):
> +                        _diff(emptytree, m2)

Instead of doing it twice here, we can do at once at the top of the
function. That will also handle the (rare) case of an excluded root
directory. It's unclear if the extra function call will be any worse
of doing it that way will be any worse than the extra path joins of
doing it this way, so I'd go with the shorter form. Also, we should
add an "or '.'" to handle the root directory (perhaps we'll change
visitdir some day to use empty string for root directory and to accept
the trailing slash for others). So, in all, something like this at the
top of _diff:

            if match and not match.visitdir(t1.dir()[:-1] or '.'):
                return

That's untested.

Btw (and unrelated to your patches), it looks like "clean" should be
checked in that existing early return at the top of the function,
don't you think?

>
>              for fn, n1 in t1._files.iteritems():
> -                fl1 = t1._flags.get(fn, '')
> -                n2 = t2._files.get(fn, None)
> -                fl2 = t2._flags.get(fn, '')
> -                if n1 != n2 or fl1 != fl2:
> -                    result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2))
> -                elif clean:
> -                    result[t1._subpath(fn)] = None
> +                if not match or match(os.path.join(t1.dir(), fn)):

Copy the _subpath call from below instead of calling join(). Is it
worth caching that call in a variable (that's initially None,
probably)?

> +                    fl1 = t1._flags.get(fn, '')
> +                    n2 = t2._files.get(fn, None)
> +                    fl2 = t2._flags.get(fn, '')
> +                    if n1 != n2 or fl1 != fl2:
> +                        result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2))
> +                    elif clean:
> +                        result[t1._subpath(fn)] = None
>
>              for fn, n2 in t2._files.iteritems():
>                  if fn not in t1._files:
> -                    fl2 = t2._flags.get(fn, '')
> -                    result[t2._subpath(fn)] = ((None, ''), (n2, fl2))
> +                    if not match or match(os.path.join(t2.dir(), fn)):
> +                        fl2 = t2._flags.get(fn, '')
> +                        result[t2._subpath(fn)] = ((None, ''), (n2, fl2))
>
>          _diff(self, m2)
>          return result
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Durham Goode - March 8, 2017, 10:44 p.m.
On 3/8/17 1:01 PM, Martin von Zweigbergk wrote:
> On Tue, Mar 7, 2017 at 7:22 PM, Durham Goode <durham@fb.com> wrote:
>> # HG changeset patch
>> # User Durham Goode <durham@fb.com>
>> # Date 1488943242 28800
>> #      Tue Mar 07 19:20:42 2017 -0800
>> # Node ID 541bf866729342f534bac425bd8f01b9fe7564e8
>> # Parent  611fac63adb09c326912e56df59c828ad12ffd9f
>> treemanifest: optimize diff using the matcher
>>
>> This optimizes treemanifest.diff() to limit the tree traversal based on the
>> provided matcher. According to Martin's testing, `hg status --rev .~1 --rev .
>> foo/` goes from 1.3s to 0.18s on a tree version of Mozilla central. I'd expect
>> and even greater saving on larger internal repos at big companies.
>>
>> A previous patch added test coverage for treemanifest diff with patterns.
>>
>> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
>> --- a/mercurial/manifest.py
>> +++ b/mercurial/manifest.py
>> @@ -1053,10 +1053,6 @@ class treemanifest(object):
>>          the nodeid will be None and the flags will be the empty
>>          string.
>>          '''
>> -        if match:
>> -            m1 = self._matches(match)
>> -            m2 = m2._matches(match)
>> -            return m1.diff(m2, clean=clean)
>>          result = {}
>>          emptytree = treemanifest()
>>          def _diff(t1, t2):
>> @@ -1065,26 +1061,31 @@ class treemanifest(object):
>>              t1._load()
>>              t2._load()
>>              for d, m1 in t1._dirs.iteritems():
>> -                m2 = t2._dirs.get(d, emptytree)
>> -                _diff(m1, m2)
>> +                if not match or match.visitdir(os.path.join(t1.dir(), d[:-1])):
>> +                    m2 = t2._dirs.get(d, emptytree)
>> +                    _diff(m1, m2)
>>
>>              for d, m2 in t2._dirs.iteritems():
>>                  if d not in t1._dirs:
>> -                    _diff(emptytree, m2)
>> +                    if (not match or match.visitdir(os.path.join(t2.dir(),
>> +                                                                 d[:-1]))):
>> +                        _diff(emptytree, m2)
>
> Instead of doing it twice here, we can do at once at the top of the
> function. That will also handle the (rare) case of an excluded root
> directory. It's unclear if the extra function call will be any worse
> of doing it that way will be any worse than the extra path joins of
> doing it this way, so I'd go with the shorter form. Also, we should
> add an "or '.'" to handle the root directory (perhaps we'll change
> visitdir some day to use empty string for root directory and to accept
> the trailing slash for others). So, in all, something like this at the
> top of _diff:
>
>             if match and not match.visitdir(t1.dir()[:-1] or '.'):
>                 return
>
> That's untested.

Will do

> Btw (and unrelated to your patches), it looks like "clean" should be
> checked in that existing early return at the top of the function,
> don't you think?

I think you're right

>>
>>              for fn, n1 in t1._files.iteritems():
>> -                fl1 = t1._flags.get(fn, '')
>> -                n2 = t2._files.get(fn, None)
>> -                fl2 = t2._flags.get(fn, '')
>> -                if n1 != n2 or fl1 != fl2:
>> -                    result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2))
>> -                elif clean:
>> -                    result[t1._subpath(fn)] = None
>> +                if not match or match(os.path.join(t1.dir(), fn)):
>
> Copy the _subpath call from below instead of calling join(). Is it
> worth caching that call in a variable (that's initially None,
> probably)?

Will do

>> +                    fl1 = t1._flags.get(fn, '')
>> +                    n2 = t2._files.get(fn, None)
>> +                    fl2 = t2._flags.get(fn, '')
>> +                    if n1 != n2 or fl1 != fl2:
>> +                        result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2))
>> +                    elif clean:
>> +                        result[t1._subpath(fn)] = None
>>
>>              for fn, n2 in t2._files.iteritems():
>>                  if fn not in t1._files:
>> -                    fl2 = t2._flags.get(fn, '')
>> -                    result[t2._subpath(fn)] = ((None, ''), (n2, fl2))
>> +                    if not match or match(os.path.join(t2.dir(), fn)):
>> +                        fl2 = t2._flags.get(fn, '')
>> +                        result[t2._subpath(fn)] = ((None, ''), (n2, fl2))
>>
>>          _diff(self, m2)
>>          return result
>> _______________________________________________
>> Mercurial-devel mailing list
>> Mercurial-devel@mercurial-scm.org
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__www.mercurial-2Dscm.org_mailman_listinfo_mercurial-2Ddevel&d=DwIBaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=nuarHzhP1wi1T9iURRCj1A&m=AkIW-FvzdPLnH5EVSsn9KCAjciMPVvlgSXpEaEnD6Rg&s=R0EKehe5lsFC93IpeE23tC9J2LUiZkEjAmHyAduMcSk&e=

Patch

diff --git a/mercurial/manifest.py b/mercurial/manifest.py
--- a/mercurial/manifest.py
+++ b/mercurial/manifest.py
@@ -1053,10 +1053,6 @@  class treemanifest(object):
         the nodeid will be None and the flags will be the empty
         string.
         '''
-        if match:
-            m1 = self._matches(match)
-            m2 = m2._matches(match)
-            return m1.diff(m2, clean=clean)
         result = {}
         emptytree = treemanifest()
         def _diff(t1, t2):
@@ -1065,26 +1061,31 @@  class treemanifest(object):
             t1._load()
             t2._load()
             for d, m1 in t1._dirs.iteritems():
-                m2 = t2._dirs.get(d, emptytree)
-                _diff(m1, m2)
+                if not match or match.visitdir(os.path.join(t1.dir(), d[:-1])):
+                    m2 = t2._dirs.get(d, emptytree)
+                    _diff(m1, m2)
 
             for d, m2 in t2._dirs.iteritems():
                 if d not in t1._dirs:
-                    _diff(emptytree, m2)
+                    if (not match or match.visitdir(os.path.join(t2.dir(),
+                                                                 d[:-1]))):
+                        _diff(emptytree, m2)
 
             for fn, n1 in t1._files.iteritems():
-                fl1 = t1._flags.get(fn, '')
-                n2 = t2._files.get(fn, None)
-                fl2 = t2._flags.get(fn, '')
-                if n1 != n2 or fl1 != fl2:
-                    result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2))
-                elif clean:
-                    result[t1._subpath(fn)] = None
+                if not match or match(os.path.join(t1.dir(), fn)):
+                    fl1 = t1._flags.get(fn, '')
+                    n2 = t2._files.get(fn, None)
+                    fl2 = t2._flags.get(fn, '')
+                    if n1 != n2 or fl1 != fl2:
+                        result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2))
+                    elif clean:
+                        result[t1._subpath(fn)] = None
 
             for fn, n2 in t2._files.iteritems():
                 if fn not in t1._files:
-                    fl2 = t2._flags.get(fn, '')
-                    result[t2._subpath(fn)] = ((None, ''), (n2, fl2))
+                    if not match or match(os.path.join(t2.dir(), fn)):
+                        fl2 = t2._flags.get(fn, '')
+                        result[t2._subpath(fn)] = ((None, ''), (n2, fl2))
 
         _diff(self, m2)
         return result