Patchwork [6,of,6,V2] similar: use cheaper hash() function to test exact matches

login
register
mail settings
Submitter Yuya Nishihara
Date March 23, 2017, 2:13 p.m.
Message ID <5954caab67d569bc6803.1490278428@mimosa>
Download mbox | patch
Permalink /patch/19604/
State Accepted
Headers show

Comments

Yuya Nishihara - March 23, 2017, 2:13 p.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1490270247 -32400
#      Thu Mar 23 20:57:27 2017 +0900
# Node ID 5954caab67d569bc680339175b4a12b400b9ad12
# Parent  d628846ce8182338b112820860b3c7fe4dcbc33d
similar: use cheaper hash() function to test exact matches

We just need a hash table {fctx.data(): fctx} which doesn't keep fctx.data()
in memory. Let's simply use hash(fctx.data()) to put data out from memory,
and manage collided fctx objects by list.

This isn't significantly faster than using sha1, but is more correct as we
know SHA-1 collision attack is getting practical.

Benchmark with 50k added/removed files, on tmpfs:

  $ hg addremove --dry-run --time -q

  previous:   real 12.420 secs (user 11.120+0.000 sys 1.280+0.000)
  this patch: real 12.350 secs (user 11.210+0.000 sys 1.140+0.000)
Augie Fackler - March 23, 2017, 3:20 p.m.
On Thu, Mar 23, 2017 at 11:13:48PM +0900, Yuya Nishihara wrote:
> # HG changeset patch
> # User Yuya Nishihara <yuya@tcha.org>
> # Date 1490270247 -32400
> #      Thu Mar 23 20:57:27 2017 +0900
> # Node ID 5954caab67d569bc680339175b4a12b400b9ad12
> # Parent  d628846ce8182338b112820860b3c7fe4dcbc33d
> similar: use cheaper hash() function to test exact matches

Queued these, thanks.

>
> We just need a hash table {fctx.data(): fctx} which doesn't keep fctx.data()
> in memory. Let's simply use hash(fctx.data()) to put data out from memory,
> and manage collided fctx objects by list.
>
> This isn't significantly faster than using sha1, but is more correct as we
> know SHA-1 collision attack is getting practical.
>
> Benchmark with 50k added/removed files, on tmpfs:
>
>   $ hg addremove --dry-run --time -q
>
>   previous:   real 12.420 secs (user 11.120+0.000 sys 1.280+0.000)
>   this patch: real 12.350 secs (user 11.210+0.000 sys 1.140+0.000)
>
> diff --git a/mercurial/similar.py b/mercurial/similar.py
> --- a/mercurial/similar.py
> +++ b/mercurial/similar.py
> @@ -7,8 +7,6 @@
>
>  from __future__ import absolute_import
>
> -import hashlib
> -
>  from .i18n import _
>  from . import (
>      bdiff,
> @@ -23,25 +21,29 @@ def _findexactmatches(repo, added, remov
>      '''
>      numfiles = len(added) + len(removed)
>
> -    # Get hashes of removed files.
> +    # Build table of removed files: {hash(fctx.data()): [fctx, ...]}.
> +    # We use hash() to discard fctx.data() from memory.
>      hashes = {}
> -    for i, fctx in enumerate(reversed(removed)):
> +    for i, fctx in enumerate(removed):
>          repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
>                           unit=_('files'))
> -        h = hashlib.sha1(fctx.data()).digest()
> -        hashes[h] = fctx
> +        h = hash(fctx.data())
> +        if h not in hashes:
> +            hashes[h] = [fctx]
> +        else:
> +            hashes[h].append(fctx)
>
>      # For each added file, see if it corresponds to a removed file.
>      for i, fctx in enumerate(added):
>          repo.ui.progress(_('searching for exact renames'), i + len(removed),
>                  total=numfiles, unit=_('files'))
>          adata = fctx.data()
> -        h = hashlib.sha1(adata).digest()
> -        if h in hashes:
> -            rfctx = hashes[h]
> +        h = hash(adata)
> +        for rfctx in hashes.get(h, []):
>              # compare between actual file contents for exact identity
>              if adata == rfctx.data():
>                  yield (rfctx, fctx)
> +                break
>
>      # Done
>      repo.ui.progress(_('searching for exact renames'), None)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel

Patch

diff --git a/mercurial/similar.py b/mercurial/similar.py
--- a/mercurial/similar.py
+++ b/mercurial/similar.py
@@ -7,8 +7,6 @@ 
 
 from __future__ import absolute_import
 
-import hashlib
-
 from .i18n import _
 from . import (
     bdiff,
@@ -23,25 +21,29 @@  def _findexactmatches(repo, added, remov
     '''
     numfiles = len(added) + len(removed)
 
-    # Get hashes of removed files.
+    # Build table of removed files: {hash(fctx.data()): [fctx, ...]}.
+    # We use hash() to discard fctx.data() from memory.
     hashes = {}
-    for i, fctx in enumerate(reversed(removed)):
+    for i, fctx in enumerate(removed):
         repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
                          unit=_('files'))
-        h = hashlib.sha1(fctx.data()).digest()
-        hashes[h] = fctx
+        h = hash(fctx.data())
+        if h not in hashes:
+            hashes[h] = [fctx]
+        else:
+            hashes[h].append(fctx)
 
     # For each added file, see if it corresponds to a removed file.
     for i, fctx in enumerate(added):
         repo.ui.progress(_('searching for exact renames'), i + len(removed),
                 total=numfiles, unit=_('files'))
         adata = fctx.data()
-        h = hashlib.sha1(adata).digest()
-        if h in hashes:
-            rfctx = hashes[h]
+        h = hash(adata)
+        for rfctx in hashes.get(h, []):
             # compare between actual file contents for exact identity
             if adata == rfctx.data():
                 yield (rfctx, fctx)
+                break
 
     # Done
     repo.ui.progress(_('searching for exact renames'), None)