Submitter | Durham Goode |
---|---|
Date | March 8, 2017, 3:22 a.m. |
Message ID | <541bf866729342f534ba.1488943362@dev111.prn1.facebook.com> |
Download | mbox | patch |
Permalink | /patch/18985/ |
State | Changes Requested |
Delegated to: | Martin von Zweigbergk |
Headers | show |
Comments
On Tue, Mar 7, 2017 at 7:22 PM, Durham Goode <durham@fb.com> wrote: > # HG changeset patch > # User Durham Goode <durham@fb.com> > # Date 1488943242 28800 > # Tue Mar 07 19:20:42 2017 -0800 > # Node ID 541bf866729342f534bac425bd8f01b9fe7564e8 > # Parent 611fac63adb09c326912e56df59c828ad12ffd9f > treemanifest: optimize diff using the matcher > > This optimizes treemanifest.diff() to limit the tree traversal based on the > provided matcher. According to Martin's testing, `hg status --rev .~1 --rev . > foo/` goes from 1.3s to 0.18s on a tree version of Mozilla central. I'd expect > and even greater saving on larger internal repos at big companies. > > A previous patch added test coverage for treemanifest diff with patterns. > > diff --git a/mercurial/manifest.py b/mercurial/manifest.py > --- a/mercurial/manifest.py > +++ b/mercurial/manifest.py > @@ -1053,10 +1053,6 @@ class treemanifest(object): > the nodeid will be None and the flags will be the empty > string. > ''' > - if match: > - m1 = self._matches(match) > - m2 = m2._matches(match) > - return m1.diff(m2, clean=clean) > result = {} > emptytree = treemanifest() > def _diff(t1, t2): > @@ -1065,26 +1061,31 @@ class treemanifest(object): > t1._load() > t2._load() > for d, m1 in t1._dirs.iteritems(): > - m2 = t2._dirs.get(d, emptytree) > - _diff(m1, m2) > + if not match or match.visitdir(os.path.join(t1.dir(), d[:-1])): > + m2 = t2._dirs.get(d, emptytree) > + _diff(m1, m2) > > for d, m2 in t2._dirs.iteritems(): > if d not in t1._dirs: > - _diff(emptytree, m2) > + if (not match or match.visitdir(os.path.join(t2.dir(), > + d[:-1]))): > + _diff(emptytree, m2) Instead of doing it twice here, we can do at once at the top of the function. That will also handle the (rare) case of an excluded root directory. It's unclear if the extra function call will be any worse of doing it that way will be any worse than the extra path joins of doing it this way, so I'd go with the shorter form. Also, we should add an "or '.'" to handle the root directory (perhaps we'll change visitdir some day to use empty string for root directory and to accept the trailing slash for others). So, in all, something like this at the top of _diff: if match and not match.visitdir(t1.dir()[:-1] or '.'): return That's untested. Btw (and unrelated to your patches), it looks like "clean" should be checked in that existing early return at the top of the function, don't you think? > > for fn, n1 in t1._files.iteritems(): > - fl1 = t1._flags.get(fn, '') > - n2 = t2._files.get(fn, None) > - fl2 = t2._flags.get(fn, '') > - if n1 != n2 or fl1 != fl2: > - result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2)) > - elif clean: > - result[t1._subpath(fn)] = None > + if not match or match(os.path.join(t1.dir(), fn)): Copy the _subpath call from below instead of calling join(). Is it worth caching that call in a variable (that's initially None, probably)? > + fl1 = t1._flags.get(fn, '') > + n2 = t2._files.get(fn, None) > + fl2 = t2._flags.get(fn, '') > + if n1 != n2 or fl1 != fl2: > + result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2)) > + elif clean: > + result[t1._subpath(fn)] = None > > for fn, n2 in t2._files.iteritems(): > if fn not in t1._files: > - fl2 = t2._flags.get(fn, '') > - result[t2._subpath(fn)] = ((None, ''), (n2, fl2)) > + if not match or match(os.path.join(t2.dir(), fn)): > + fl2 = t2._flags.get(fn, '') > + result[t2._subpath(fn)] = ((None, ''), (n2, fl2)) > > _diff(self, m2) > return result > _______________________________________________ > Mercurial-devel mailing list > Mercurial-devel@mercurial-scm.org > https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
On 3/8/17 1:01 PM, Martin von Zweigbergk wrote: > On Tue, Mar 7, 2017 at 7:22 PM, Durham Goode <durham@fb.com> wrote: >> # HG changeset patch >> # User Durham Goode <durham@fb.com> >> # Date 1488943242 28800 >> # Tue Mar 07 19:20:42 2017 -0800 >> # Node ID 541bf866729342f534bac425bd8f01b9fe7564e8 >> # Parent 611fac63adb09c326912e56df59c828ad12ffd9f >> treemanifest: optimize diff using the matcher >> >> This optimizes treemanifest.diff() to limit the tree traversal based on the >> provided matcher. According to Martin's testing, `hg status --rev .~1 --rev . >> foo/` goes from 1.3s to 0.18s on a tree version of Mozilla central. I'd expect >> and even greater saving on larger internal repos at big companies. >> >> A previous patch added test coverage for treemanifest diff with patterns. >> >> diff --git a/mercurial/manifest.py b/mercurial/manifest.py >> --- a/mercurial/manifest.py >> +++ b/mercurial/manifest.py >> @@ -1053,10 +1053,6 @@ class treemanifest(object): >> the nodeid will be None and the flags will be the empty >> string. >> ''' >> - if match: >> - m1 = self._matches(match) >> - m2 = m2._matches(match) >> - return m1.diff(m2, clean=clean) >> result = {} >> emptytree = treemanifest() >> def _diff(t1, t2): >> @@ -1065,26 +1061,31 @@ class treemanifest(object): >> t1._load() >> t2._load() >> for d, m1 in t1._dirs.iteritems(): >> - m2 = t2._dirs.get(d, emptytree) >> - _diff(m1, m2) >> + if not match or match.visitdir(os.path.join(t1.dir(), d[:-1])): >> + m2 = t2._dirs.get(d, emptytree) >> + _diff(m1, m2) >> >> for d, m2 in t2._dirs.iteritems(): >> if d not in t1._dirs: >> - _diff(emptytree, m2) >> + if (not match or match.visitdir(os.path.join(t2.dir(), >> + d[:-1]))): >> + _diff(emptytree, m2) > > Instead of doing it twice here, we can do at once at the top of the > function. That will also handle the (rare) case of an excluded root > directory. It's unclear if the extra function call will be any worse > of doing it that way will be any worse than the extra path joins of > doing it this way, so I'd go with the shorter form. Also, we should > add an "or '.'" to handle the root directory (perhaps we'll change > visitdir some day to use empty string for root directory and to accept > the trailing slash for others). So, in all, something like this at the > top of _diff: > > if match and not match.visitdir(t1.dir()[:-1] or '.'): > return > > That's untested. Will do > Btw (and unrelated to your patches), it looks like "clean" should be > checked in that existing early return at the top of the function, > don't you think? I think you're right >> >> for fn, n1 in t1._files.iteritems(): >> - fl1 = t1._flags.get(fn, '') >> - n2 = t2._files.get(fn, None) >> - fl2 = t2._flags.get(fn, '') >> - if n1 != n2 or fl1 != fl2: >> - result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2)) >> - elif clean: >> - result[t1._subpath(fn)] = None >> + if not match or match(os.path.join(t1.dir(), fn)): > > Copy the _subpath call from below instead of calling join(). Is it > worth caching that call in a variable (that's initially None, > probably)? Will do >> + fl1 = t1._flags.get(fn, '') >> + n2 = t2._files.get(fn, None) >> + fl2 = t2._flags.get(fn, '') >> + if n1 != n2 or fl1 != fl2: >> + result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2)) >> + elif clean: >> + result[t1._subpath(fn)] = None >> >> for fn, n2 in t2._files.iteritems(): >> if fn not in t1._files: >> - fl2 = t2._flags.get(fn, '') >> - result[t2._subpath(fn)] = ((None, ''), (n2, fl2)) >> + if not match or match(os.path.join(t2.dir(), fn)): >> + fl2 = t2._flags.get(fn, '') >> + result[t2._subpath(fn)] = ((None, ''), (n2, fl2)) >> >> _diff(self, m2) >> return result >> _______________________________________________ >> Mercurial-devel mailing list >> Mercurial-devel@mercurial-scm.org >> https://urldefense.proofpoint.com/v2/url?u=https-3A__www.mercurial-2Dscm.org_mailman_listinfo_mercurial-2Ddevel&d=DwIBaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=nuarHzhP1wi1T9iURRCj1A&m=AkIW-FvzdPLnH5EVSsn9KCAjciMPVvlgSXpEaEnD6Rg&s=R0EKehe5lsFC93IpeE23tC9J2LUiZkEjAmHyAduMcSk&e=
Patch
diff --git a/mercurial/manifest.py b/mercurial/manifest.py --- a/mercurial/manifest.py +++ b/mercurial/manifest.py @@ -1053,10 +1053,6 @@ class treemanifest(object): the nodeid will be None and the flags will be the empty string. ''' - if match: - m1 = self._matches(match) - m2 = m2._matches(match) - return m1.diff(m2, clean=clean) result = {} emptytree = treemanifest() def _diff(t1, t2): @@ -1065,26 +1061,31 @@ class treemanifest(object): t1._load() t2._load() for d, m1 in t1._dirs.iteritems(): - m2 = t2._dirs.get(d, emptytree) - _diff(m1, m2) + if not match or match.visitdir(os.path.join(t1.dir(), d[:-1])): + m2 = t2._dirs.get(d, emptytree) + _diff(m1, m2) for d, m2 in t2._dirs.iteritems(): if d not in t1._dirs: - _diff(emptytree, m2) + if (not match or match.visitdir(os.path.join(t2.dir(), + d[:-1]))): + _diff(emptytree, m2) for fn, n1 in t1._files.iteritems(): - fl1 = t1._flags.get(fn, '') - n2 = t2._files.get(fn, None) - fl2 = t2._flags.get(fn, '') - if n1 != n2 or fl1 != fl2: - result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2)) - elif clean: - result[t1._subpath(fn)] = None + if not match or match(os.path.join(t1.dir(), fn)): + fl1 = t1._flags.get(fn, '') + n2 = t2._files.get(fn, None) + fl2 = t2._flags.get(fn, '') + if n1 != n2 or fl1 != fl2: + result[t1._subpath(fn)] = ((n1, fl1), (n2, fl2)) + elif clean: + result[t1._subpath(fn)] = None for fn, n2 in t2._files.iteritems(): if fn not in t1._files: - fl2 = t2._flags.get(fn, '') - result[t2._subpath(fn)] = ((None, ''), (n2, fl2)) + if not match or match(os.path.join(t2.dir(), fn)): + fl2 = t2._flags.get(fn, '') + result[t2._subpath(fn)] = ((None, ''), (n2, fl2)) _diff(self, m2) return result