Patchwork [5,of,8,git-diff] similar: move score function to module level

login
register
mail settings
Submitter Sean Farley
Date Jan. 9, 2017, 7:49 p.m.
Message ID <8561e240f25e851476ab.1483991378@1.0.0.127.in-addr.arpa>
Download mbox | patch
Permalink /patch/18150/
State Accepted
Headers show

Comments

Sean Farley - Jan. 9, 2017, 7:49 p.m.
# HG changeset patch
# User Sean Farley <sean@farley.io>
# Date 1483850877 28800
#      Sat Jan 07 20:47:57 2017 -0800
# Node ID 8561e240f25e851476abde452b850fc09a7fdf06
# Parent  9af4ef2f80a8c5cb69ad4ff5d291ce81a35e4f39
similar: move score function to module level

Future patches will use this to report the similarity of a rename/copy
in the patch output. Note that moving the util.cachefunc-decorated
helpers to module level also changes the cache lifetime: results are now
memoized per-process (keyed on the filectx arguments) rather than being
discarded after each removed-file iteration.

Patch

diff --git a/mercurial/similar.py b/mercurial/similar.py
--- a/mercurial/similar.py
+++ b/mercurial/similar.py
@@ -41,10 +41,31 @@  def _findexactmatches(repo, added, remov
             yield (hashes[h], fctx)
 
     # Done
     repo.ui.progress(_('searching for exact renames'), None)
 
+@util.cachefunc
+def _ctxdata(fctx):
+    # lazily load text
+    orig = fctx.data()
+    return orig, mdiff.splitnewlines(orig)
+
+@util.cachefunc
+def score(fctx1, fctx2):
+    text = fctx1.data()
+    orig, lines = _ctxdata(fctx2)
+    # bdiff.blocks() returns blocks of matching lines
+    # count the number of bytes in each
+    equal = 0
+    matches = bdiff.blocks(text, orig)
+    for x1, x2, y1, y2 in matches:
+        for line in lines[y1:y2]:
+            equal += len(line)
+
+    lengths = len(text) + len(orig)
+    return equal * 2.0 / lengths
+
 def _findsimilarmatches(repo, added, removed, threshold):
     '''find potentially renamed files based on similar file content
 
     Takes a list of new filectxs and a list of removed filectxs, and yields
     (before, after, score) tuples of partial matches.
@@ -52,32 +73,13 @@  def _findsimilarmatches(repo, added, rem
     copies = {}
     for i, r in enumerate(removed):
         repo.ui.progress(_('searching for similar files'), i,
                          total=len(removed), unit=_('files'))
 
-        # lazily load text
-        @util.cachefunc
-        def data():
-            orig = r.data()
-            return orig, mdiff.splitnewlines(orig)
-
-        def score(text):
-            orig, lines = data()
-            # bdiff.blocks() returns blocks of matching lines
-            # count the number of bytes in each
-            equal = 0
-            matches = bdiff.blocks(text, orig)
-            for x1, x2, y1, y2 in matches:
-                for line in lines[y1:y2]:
-                    equal += len(line)
-
-            lengths = len(text) + len(orig)
-            return equal * 2.0 / lengths
-
         for a in added:
             bestscore = copies.get(a, (None, threshold))[1]
-            myscore = score(a.data())
+            myscore = score(a, r)
             if myscore >= bestscore:
                 copies[a] = (r, myscore)
     repo.ui.progress(_('searching'), None)
 
     for dest, v in copies.iteritems():