Patchwork [5,of,6] convert: add support to detect git renames and copies

login
register
mail settings
Submitter Siddharth Agarwal
Date Sept. 12, 2014, 7:48 p.m.
Message ID <fd79e986ea948d2eae9f.1410551316@devbig136.prn2.facebook.com>
Download mbox | patch
Permalink /patch/5804/
State Accepted
Headers show

Comments

Siddharth Agarwal - Sept. 12, 2014, 7:48 p.m.
# HG changeset patch
# User Siddharth Agarwal <sid0@fb.com>
# Date 1410546206 25200
#      Fri Sep 12 11:23:26 2014 -0700
# Node ID fd79e986ea948d2eae9f532afa309094fe91398e
# Parent  18cda66235b2d689fd98f5b6d8a6dd3fbaa01b70
convert: add support to detect git renames and copies

Git is unusual among VCSes in that it doesn't record copies and renames,
instead choosing to detect them on the fly. Since Mercurial expects copies and
renames to be recorded, it can be valuable to preserve this history while
converting a Git repository to Mercurial. This patch adds a new convert option,
called 'convert.git.similarity', which determines how similar files must be to
be treated as renames or copies.

Patch

diff --git a/hgext/convert/__init__.py b/hgext/convert/__init__.py
--- a/hgext/convert/__init__.py
+++ b/hgext/convert/__init__.py
@@ -291,6 +291,15 @@ 
     leading 'refs/heads' stripped. Git submodules are converted to Git
     subrepos in Mercurial.
 
+    The following options can be set with ``--config``:
+
+    :convert.git.similarity: specify how similar files modified in a
+        commit must be to be imported as renames or copies, as a
+        percentage between ``0`` (disabled) and ``100`` (files must be
+        identical). For example, ``90`` means that a delete/add pair will
+        be imported as a rename if more than 90% of the file hasn't
+        changed. The default is ``0``.
+
     Perforce Source
     ###############
 
diff --git a/hgext/convert/git.py b/hgext/convert/git.py
--- a/hgext/convert/git.py
+++ b/hgext/convert/git.py
@@ -94,6 +94,17 @@ 
         if not os.path.exists(path + "/objects"):
             raise NoRepo(_("%s does not look like a Git repository") % path)
 
+        try:
+            similarity = int(ui.config('convert', 'git.similarity') or 0)
+        except ValueError:
+            raise util.Abort('convert.git.similarity must be a number')
+        if similarity < 0 or similarity > 100:
+            raise util.Abort(_('similarity must be between 0 and 100'))
+        if similarity > 0:
+            self.simopt = '--find-copies=%d%%' % similarity
+        else:
+            self.simopt = ''
+
         checktool('git', 'git')
 
         self.path = path
@@ -184,8 +195,10 @@ 
         if full:
             raise util.Abort(_("convert from git do not support --full"))
         self.modecache = {}
-        fh = self.gitopen("git diff-tree -z --root -m -r %s" % version)
+        fh = self.gitopen("git diff-tree -z --root -m -r %s %s" % (
+            self.simopt, version))
         changes = []
+        copies = {}
         seen = set()
         entry = None
         subexists = [False]
@@ -194,15 +207,16 @@ 
         lcount = len(difftree)
         i = 0
 
-        def add(entry, f):
+        def add(entry, f, isdest):
             seen.add(f)
             h = entry[3]
             p = (entry[1] == "100755")
             s = (entry[1] == "120000")
+            renamesource = (not isdest and entry[4][0] == 'R')
 
             if f == '.gitmodules':
                 subexists[0] = True
-                if entry[4] == 'D':
+                if entry[4] == 'D' or renamesource:
                     subdeleted[0] = True
                     changes.append(('.hgsub', hex(nullid)))
                 else:
@@ -210,6 +224,8 @@ 
             elif entry[1] == '160000' or entry[0] == ':160000':
                 subexists[0] = True
             else:
+                if renamesource:
+                    h = hex(nullid)
                 self.modecache[(f, h)] = (p and "x") or (s and "l") or ""
                 changes.append((f, h))
 
@@ -223,7 +239,19 @@ 
                 continue
             f = l
             if f not in seen:
-                add(entry, f)
+                add(entry, f, False)
+            # A file can be copied multiple times, or modified and copied
+            # simultaneously. So f can be repeated even if fdest isn't.
+            if entry[4][0] in 'RC':
+                # rename or copy: next line is the destination
+                fdest = difftree[i]
+                i += 1
+                if fdest not in seen:
+                    add(entry, fdest, True)
+                    # .gitmodules isn't imported at all, so it being copied to
+                    # and fro doesn't really make sense
+                    if f != '.gitmodules' and fdest != '.gitmodules':
+                        copies[fdest] = f
             entry = None
         if fh.close():
             raise util.Abort(_('cannot read changes in %s') % version)
@@ -234,7 +262,7 @@ 
             else:
                 self.retrievegitmodules(version)
                 changes.append(('.hgsubstate', ''))
-        return (changes, {})
+        return (changes, copies)
 
     def getcommit(self, version):
         c = self.catfile(version, "commit") # read the commit hash
diff --git a/tests/test-convert-git.t b/tests/test-convert-git.t
--- a/tests/test-convert-git.t
+++ b/tests/test-convert-git.t
@@ -241,8 +241,45 @@ 
   9277c9cc8dd4576fc01a17939b4351e5ada93466 644   foo
   88dfeab657e8cf2cef3dec67b914f49791ae76b1 644   quux
 
+test importing git renames and copies
+
+  $ cd git-repo2
+  $ git mv foo foo-renamed
+since bar is not touched in this commit, this copy will not be detected
+  $ cp bar bar-copied
+  $ cp baz baz-copied
+  $ cp baz baz-copied2
+  $ echo baz2 >> baz
+  $ git add bar-copied baz-copied baz-copied2
+  $ commit -a -m 'rename and copy'
+  $ cd ..
+
+input validation
+  $ hg convert --config convert.git.similarity=foo --datesort git-repo2 fullrepo
+  abort: convert.git.similarity must be a number
+  [255]
+  $ hg convert --config convert.git.similarity=-1 --datesort git-repo2 fullrepo
+  abort: similarity must be between 0 and 100
+  [255]
+  $ hg convert --config convert.git.similarity=101 --datesort git-repo2 fullrepo
+  abort: similarity must be between 0 and 100
+  [255]
+
+  $ hg -q convert --config convert.git.similarity=100 --datesort git-repo2 fullrepo
+  $ hg -R fullrepo status -C --change master
+  M baz
+  A bar-copied
+  A baz-copied
+    baz
+  A baz-copied2
+    baz
+  A foo-renamed
+    foo
+  R foo
+
 test binary conversion (issue1359)
 
+  $ count=19
   $ mkdir git-repo3
   $ cd git-repo3
   $ git init-db >/dev/null 2>/dev/null
@@ -398,6 +435,29 @@ 
 
   $ cd ../..
 
+make sure rename detection doesn't break removing and adding gitmodules
+
+  $ cd git-repo6
+  $ git mv .gitmodules .gitmodules-renamed
+  $ commit -a -m 'rename .gitmodules'
+  $ git mv .gitmodules-renamed .gitmodules
+  $ commit -a -m 'rename .gitmodules back'
+  $ cd ..
+
+  $ hg --config convert.git.similarity=100 convert -q git-repo6 git-repo6-hg
+  $ hg -R git-repo6-hg log -r 'tip^' -T "{desc|firstline}\n"
+  rename .gitmodules
+  $ hg -R git-repo6-hg status -C --change 'tip^'
+  A .gitmodules-renamed
+  R .hgsub
+  R .hgsubstate
+  $ hg -R git-repo6-hg log -r tip -T "{desc|firstline}\n"
+  rename .gitmodules back
+  $ hg -R git-repo6-hg status -C --change tip
+  A .hgsub
+  A .hgsubstate
+  R .gitmodules-renamed
+
 convert the revision removing '.gitmodules' itself (and related
 submodules)
 
diff --git a/tests/test-convert.t b/tests/test-convert.t
--- a/tests/test-convert.t
+++ b/tests/test-convert.t
@@ -244,6 +244,16 @@ 
       converted to bookmarks with the same name, with the leading 'refs/heads'
       stripped. Git submodules are converted to Git subrepos in Mercurial.
   
+      The following options can be set with "--config":
+  
+      convert.git.similarity
+                    specify how similar files modified in a commit must be to be
+                    imported as renames or copies, as a percentage between "0"
+                    (disabled) and "100" (files must be identical). For example,
+                    "90" means that a delete/add pair will be imported as a
+                    rename if more than 90% of the file hasn't changed. The
+                    default is "0".
+  
       Perforce Source
       ###############