Patchwork similar: remove caching from the module level

login
register
mail settings
Submitter Sean Farley
Date Jan. 13, 2017, 7:43 p.m.
Message ID <7c54c9524a0954a3bd2f.1484336639@1.0.0.127.in-addr.arpa>
Download mbox | patch
Permalink /patch/18212/
State Accepted
Headers show

Comments

Sean Farley - Jan. 13, 2017, 7:43 p.m.
# HG changeset patch
# User Sean Farley <sean@farley.io>
# Date 1484336556 28800
#      Fri Jan 13 11:42:36 2017 -0800
# Node ID 7c54c9524a0954a3bd2f2fb4451028869f851f6d
# Parent  8540967cd9e0909a9a73dbff458c4cedd4db26aa
similar: remove caching from the module level

To prevent Bad Things™ from happening, let's rework the logic to not use
util.cachefunc.
Pierre-Yves David - Jan. 13, 2017, 10:43 p.m.
On 01/13/2017 08:43 PM, Sean Farley wrote:
> # HG changeset patch
> # User Sean Farley <sean@farley.io>
> # Date 1484336556 28800
> #      Fri Jan 13 11:42:36 2017 -0800
> # Node ID 7c54c9524a0954a3bd2f2fb4451028869f851f6d
> # Parent  8540967cd9e0909a9a73dbff458c4cedd4db26aa
> similar: remove caching from the module level
>
> To prevent Bad Things™ from happening, let's rework the logic to not use
> util.cachefunc.

Looks good to me, but… since this is my code, that should probably have 
my name of it ;-)

https://www.mercurial-scm.org/pipermail/mercurial-devel/2017-January/092290.html

(I saw with Sean on IRC and he is okay with changing the name) Can 
another reviewer fix this in flight ?

> diff --git a/mercurial/similar.py b/mercurial/similar.py
> --- a/mercurial/similar.py
> +++ b/mercurial/similar.py
> @@ -11,11 +11,10 @@ import hashlib
>
>  from .i18n import _
>  from . import (
>      bdiff,
>      mdiff,
> -    util,
>  )
>
>  def _findexactmatches(repo, added, removed):
>      '''find renamed files that have no changes
>
> @@ -41,20 +40,18 @@ def _findexactmatches(repo, added, remov
>              yield (hashes[h], fctx)
>
>      # Done
>      repo.ui.progress(_('searching for exact renames'), None)
>
> -@util.cachefunc
>  def _ctxdata(fctx):
>      # lazily load text
>      orig = fctx.data()
>      return orig, mdiff.splitnewlines(orig)
>
> -@util.cachefunc
> -def score(fctx1, fctx2):
> -    text = fctx1.data()
> -    orig, lines = _ctxdata(fctx2)
> +def _score(fctx, otherdata):
> +    orig, lines = otherdata
> +    text = fctx.data()
>      # bdiff.blocks() returns blocks of matching lines
>      # count the number of bytes in each
>      equal = 0
>      matches = bdiff.blocks(text, orig)
>      for x1, x2, y1, y2 in matches:
> @@ -62,10 +59,13 @@ def score(fctx1, fctx2):
>              equal += len(line)
>
>      lengths = len(text) + len(orig)
>      return equal * 2.0 / lengths
>
> +def score(fctx1, fctx2):
> +    return _score(fctx1, _ctxdata(fctx2))
> +
>  def _findsimilarmatches(repo, added, removed, threshold):
>      '''find potentially renamed files based on similar file content
>
>      Takes a list of new filectxs and a list of removed filectxs, and yields
>      (before, after, score) tuples of partial matches.
> @@ -73,13 +73,16 @@ def _findsimilarmatches(repo, added, rem
>      copies = {}
>      for i, r in enumerate(removed):
>          repo.ui.progress(_('searching for similar files'), i,
>                           total=len(removed), unit=_('files'))
>
> +        data = None
>          for a in added:
>              bestscore = copies.get(a, (None, threshold))[1]
> -            myscore = score(a, r)
> +            if data is None:
> +                data = _ctxdata(r)
> +            myscore = score(a, data)
>              if myscore >= bestscore:
>                  copies[a] = (r, myscore)
>      repo.ui.progress(_('searching'), None)
>
>      for dest, v in copies.iteritems():
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>
Augie Fackler - Jan. 14, 2017, 4:26 a.m.
> On Jan 13, 2017, at 5:43 PM, Pierre-Yves David <pierre-yves.david@ens-lyon.org> wrote:
> 
> 
> 
> On 01/13/2017 08:43 PM, Sean Farley wrote:
>> # HG changeset patch
>> # User Sean Farley <sean@farley.io>
>> # Date 1484336556 28800
>> #      Fri Jan 13 11:42:36 2017 -0800
>> # Node ID 7c54c9524a0954a3bd2f2fb4451028869f851f6d
>> # Parent  8540967cd9e0909a9a73dbff458c4cedd4db26aa
>> similar: remove caching from the module level
>> 
>> To prevent Bad Things™ from happening, let's rework the logic to not use
>> util.cachefunc.
> 
> Looks good to me, but… since this is my code, that should probably have my name of it ;-)
> 
> https://www.mercurial-scm.org/pipermail/mercurial-devel/2017-January/092290.html
> 
> (I saw with Sean on IRC and he is okay with changing the name) Can another reviewer fix this in flight ?

Looks like martinvonz queued this as 86145 (I don’t see a mail from him, just closing the loop.)
via Mercurial-devel - Jan. 14, 2017, 5:07 a.m.
Oops, I apparently replied only to Pierre-Yves. I'll forward that to
everyone for the record. Thanks for noting.

On Fri, Jan 13, 2017, 20:26 Augie Fackler <raf@durin42.com> wrote:

>
> > On Jan 13, 2017, at 5:43 PM, Pierre-Yves David <
> pierre-yves.david@ens-lyon.org> wrote:
> >
> >
> >
> > On 01/13/2017 08:43 PM, Sean Farley wrote:
> >> # HG changeset patch
> >> # User Sean Farley <sean@farley.io>
> >> # Date 1484336556 28800
> >> #      Fri Jan 13 11:42:36 2017 -0800
> >> # Node ID 7c54c9524a0954a3bd2f2fb4451028869f851f6d
> >> # Parent  8540967cd9e0909a9a73dbff458c4cedd4db26aa
> >> similar: remove caching from the module level
> >>
> >> To prevent Bad Things™ from happening, let's rework the logic to not use
> >> util.cachefunc.
> >
> > Looks good to me, but… since this is my code, that should probably have
> my name of it ;-)
> >
> >
> https://www.mercurial-scm.org/pipermail/mercurial-devel/2017-January/092290.html
> >
> > (I saw with Sean on IRC and he is okay with changing the name) Can
> another reviewer fix this in flight ?
>
> Looks like martinvonz queued this as 86145 (I don’t see a mail from him,
> just closing the loop.)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>
via Mercurial-devel - Jan. 14, 2017, 5:08 a.m.
---------- Forwarded message ---------
From: Martin von Zweigbergk <martinvonz@google.com>
Date: Fri, Jan 13, 2017, 16:19
Subject: Re: [PATCH] similar: remove caching from the module level
To: Pierre-Yves David <pierre-yves.david@ens-lyon.org>


On Fri, Jan 13, 2017 at 2:43 PM, Pierre-Yves David
<pierre-yves.david@ens-lyon.org> wrote:
>
>
> On 01/13/2017 08:43 PM, Sean Farley wrote:
>>
>> # HG changeset patch
>> # User Sean Farley <sean@farley.io>
>> # Date 1484336556 28800
>> #      Fri Jan 13 11:42:36 2017 -0800
>> # Node ID 7c54c9524a0954a3bd2f2fb4451028869f851f6d
>> # Parent  8540967cd9e0909a9a73dbff458c4cedd4db26aa
>> similar: remove caching from the module level
>>
>> To prevent Bad Things™ from happening, let's rework the logic to not use
>> util.cachefunc.

Queued with author changed to Pierre-Yves and with a bug fixed.

>
>
> Looks good to me, but… since this is my code, that should probably have my
> name of it ;-)
>
>
https://www.mercurial-scm.org/pipermail/mercurial-devel/2017-January/092290.html
>
> (I saw with Sean on IRC and he is okay with changing the name) Can another
> reviewer fix this in flight ?
>
>
>> diff --git a/mercurial/similar.py b/mercurial/similar.py
>> --- a/mercurial/similar.py
>> +++ b/mercurial/similar.py
>> @@ -11,11 +11,10 @@ import hashlib
>>
>>  from .i18n import _
>>  from . import (
>>      bdiff,
>>      mdiff,
>> -    util,
>>  )
>>
>>  def _findexactmatches(repo, added, removed):
>>      '''find renamed files that have no changes
>>
>> @@ -41,20 +40,18 @@ def _findexactmatches(repo, added, remov
>>              yield (hashes[h], fctx)
>>
>>      # Done
>>      repo.ui.progress(_('searching for exact renames'), None)
>>
>> -@util.cachefunc
>>  def _ctxdata(fctx):
>>      # lazily load text
>>      orig = fctx.data()
>>      return orig, mdiff.splitnewlines(orig)
>>
>> -@util.cachefunc
>> -def score(fctx1, fctx2):
>> -    text = fctx1.data()
>> -    orig, lines = _ctxdata(fctx2)
>> +def _score(fctx, otherdata):
>> +    orig, lines = otherdata
>> +    text = fctx.data()
>>      # bdiff.blocks() returns blocks of matching lines
>>      # count the number of bytes in each
>>      equal = 0
>>      matches = bdiff.blocks(text, orig)
>>      for x1, x2, y1, y2 in matches:
>> @@ -62,10 +59,13 @@ def score(fctx1, fctx2):
>>              equal += len(line)
>>
>>      lengths = len(text) + len(orig)
>>      return equal * 2.0 / lengths
>>
>> +def score(fctx1, fctx2):
>> +    return _score(fctx1, _ctxdata(fctx2))
>> +
>>  def _findsimilarmatches(repo, added, removed, threshold):
>>      '''find potentially renamed files based on similar file content
>>
>>      Takes a list of new filectxs and a list of removed filectxs, and
>> yields
>>      (before, after, score) tuples of partial matches.
>> @@ -73,13 +73,16 @@ def _findsimilarmatches(repo, added, rem
>>      copies = {}
>>      for i, r in enumerate(removed):
>>          repo.ui.progress(_('searching for similar files'), i,
>>                           total=len(removed), unit=_('files'))
>>
>> +        data = None
>>          for a in added:
>>              bestscore = copies.get(a, (None, threshold))[1]
>> -            myscore = score(a, r)
>> +            if data is None:
>> +                data = _ctxdata(r)
>> +            myscore = score(a, data)

Tests didn't like this. Changed to _score().

>>              if myscore >= bestscore:
>>                  copies[a] = (r, myscore)
>>      repo.ui.progress(_('searching'), None)
>>
>>      for dest, v in copies.iteritems():
>> _______________________________________________
>> Mercurial-devel mailing list
>> Mercurial-devel@mercurial-scm.org
>> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>>
>
> --
> Pierre-Yves David
>
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel

Patch

diff --git a/mercurial/similar.py b/mercurial/similar.py
--- a/mercurial/similar.py
+++ b/mercurial/similar.py
@@ -11,11 +11,10 @@  import hashlib
 
 from .i18n import _
 from . import (
     bdiff,
     mdiff,
-    util,
 )
 
 def _findexactmatches(repo, added, removed):
     '''find renamed files that have no changes
 
@@ -41,20 +40,18 @@  def _findexactmatches(repo, added, remov
             yield (hashes[h], fctx)
 
     # Done
     repo.ui.progress(_('searching for exact renames'), None)
 
-@util.cachefunc
 def _ctxdata(fctx):
     # lazily load text
     orig = fctx.data()
     return orig, mdiff.splitnewlines(orig)
 
-@util.cachefunc
-def score(fctx1, fctx2):
-    text = fctx1.data()
-    orig, lines = _ctxdata(fctx2)
+def _score(fctx, otherdata):
+    orig, lines = otherdata
+    text = fctx.data()
     # bdiff.blocks() returns blocks of matching lines
     # count the number of bytes in each
     equal = 0
     matches = bdiff.blocks(text, orig)
     for x1, x2, y1, y2 in matches:
@@ -62,10 +59,13 @@  def score(fctx1, fctx2):
             equal += len(line)
 
     lengths = len(text) + len(orig)
     return equal * 2.0 / lengths
 
+def score(fctx1, fctx2):
+    return _score(fctx1, _ctxdata(fctx2))
+
 def _findsimilarmatches(repo, added, removed, threshold):
     '''find potentially renamed files based on similar file content
 
     Takes a list of new filectxs and a list of removed filectxs, and yields
     (before, after, score) tuples of partial matches.
@@ -73,13 +73,16 @@  def _findsimilarmatches(repo, added, rem
     copies = {}
     for i, r in enumerate(removed):
         repo.ui.progress(_('searching for similar files'), i,
                          total=len(removed), unit=_('files'))
 
+        data = None
         for a in added:
             bestscore = copies.get(a, (None, threshold))[1]
-            myscore = score(a, r)
+            if data is None:
+                data = _ctxdata(r)
+            myscore = score(a, data)
             if myscore >= bestscore:
                 copies[a] = (r, myscore)
     repo.ui.progress(_('searching'), None)
 
     for dest, v in copies.iteritems():