Patchwork convert: drastically speed up git conversions

login
register
mail settings
Submitter David Schleimer
Date May 29, 2014, 5:43 p.m.
Message ID <6d2cebe560482e230509.1401385397@devbig100.prn2.facebook.com>
Download mbox | patch
Permalink /patch/4895/
State Accepted
Commit 90e3fcd9e6e4e68d7cedb841ea921d6622db29ce
Headers show

Comments

David Schleimer - May 29, 2014, 5:43 p.m.
# HG changeset patch
# User David Schleimer <dschleimer@fb.com>
# Date 1401250344 25200
#      Tue May 27 21:12:24 2014 -0700
# Node ID 6d2cebe560482e230509a0c4ba468d8e5bb50ddd
# Parent  652e07debf10193f4973a48ead96a95e81d0a55b
convert: drastically speed up git conversions

We would formerly exec git cat-file once for every commit, plus once for
every tree and file we wanted to read.  This switches to using git
cat-file's batch mode, which is much, much, much faster.

Using this new code, converting the git git repo to hg ran in 106
minutes on my machine.  Using the stock mercurial, it required 1239
minutes.  I believe this to be typical of the speedups we will see
from this patch.
Augie Fackler - May 29, 2014, 5:52 p.m.
On May 29, 2014, at 1:43 PM, David Schleimer <dschleimer@fb.com> wrote:

> # HG changeset patch
> # User David Schleimer <dschleimer@fb.com>
> # Date 1401250344 25200
> #      Tue May 27 21:12:24 2014 -0700
> # Node ID 6d2cebe560482e230509a0c4ba468d8e5bb50ddd
> # Parent  652e07debf10193f4973a48ead96a95e81d0a55b
> convert: drastically speed up git conversions

Queued enthusiastically. Thanks a ton!

(Used git log --grep 'cat-file --batch' and gave up when I was back in 2009 and the flag was still mentioned, so I'm thinking this is pretty safe.)

> 
> We would formerly exec git cat-file once for every commit, plus once for
> every tree and file we wanted to read.  This switches to using git
> cat-file's batch mode, which is much, much, much faster.
> 
> Using this new code, converting the git git repo to hg ran in 106
> minutes on my machine.  Using the stock mercurial, it required 1239
> minutes.  I believe this to be typical of the speedups we will see
> from this patch.
> 
> diff --git a/hgext/convert/git.py b/hgext/convert/git.py
> --- a/hgext/convert/git.py
> +++ b/hgext/convert/git.py
> @@ -46,6 +46,18 @@
>                     del os.environ['GIT_DIR']
>                 else:
>                     os.environ['GIT_DIR'] = prevgitdir
> +
> +        def gitpipe(self, s):
> +            prevgitdir = os.environ.get('GIT_DIR')
> +            os.environ['GIT_DIR'] = self.path
> +            try:
> +                return util.popen3(s)
> +            finally:
> +                if prevgitdir is None:
> +                    del os.environ['GIT_DIR']
> +                else:
> +                    os.environ['GIT_DIR'] = prevgitdir
> +
>     else:
>         def gitopen(self, s, err=None):
>             if err == subprocess.PIPE:
> @@ -56,6 +68,9 @@
>             else:
>                 return util.popen('GIT_DIR=%s %s' % (self.path, s), 'rb')
> 
> +        def gitpipe(self, s):
> +            return util.popen3('GIT_DIR=%s %s' % (self.path, s))
> +
>     def popen_with_stderr(self, s):
>         p = subprocess.Popen(s, shell=True, bufsize=-1,
>                              close_fds=util.closefds,
> @@ -84,6 +99,12 @@
>         self.path = path
>         self.submodules = []
> 
> +        self.catfilepipe = self.gitpipe('git cat-file --batch')
> +
> +    def after(self):
> +        for f in self.catfilepipe:
> +            f.close()
> +
>     def getheads(self):
>         if not self.rev:
>             heads, ret = self.gitread('git rev-parse --branches --remotes')
> @@ -98,9 +119,17 @@
>     def catfile(self, rev, type):
>         if rev == hex(nullid):
>             raise IOError
> -        data, ret = self.gitread("git cat-file %s %s" % (type, rev))
> -        if ret:
> +        self.catfilepipe[0].write(rev+'\n')
> +        self.catfilepipe[0].flush()
> +        info = self.catfilepipe[1].readline().split()
> +        if info[1] != type:
>             raise util.Abort(_('cannot read %r object at %s') % (type, rev))
> +        size = int(info[2])
> +        data = self.catfilepipe[1].read(size)
> +        if len(data) < size:
> +            raise util.Abort(_('cannot read %r object at %s: %s') % (type, rev))
> +        # read the trailing newline
> +        self.catfilepipe[1].read(1)
>         return data
> 
>     def getfile(self, name, rev):
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel

Patch

diff --git a/hgext/convert/git.py b/hgext/convert/git.py
--- a/hgext/convert/git.py
+++ b/hgext/convert/git.py
@@ -46,6 +46,18 @@ 
                     del os.environ['GIT_DIR']
                 else:
                     os.environ['GIT_DIR'] = prevgitdir
+
+        def gitpipe(self, s):
+            prevgitdir = os.environ.get('GIT_DIR')
+            os.environ['GIT_DIR'] = self.path
+            try:
+                return util.popen3(s)
+            finally:
+                if prevgitdir is None:
+                    del os.environ['GIT_DIR']
+                else:
+                    os.environ['GIT_DIR'] = prevgitdir
+
     else:
         def gitopen(self, s, err=None):
             if err == subprocess.PIPE:
@@ -56,6 +68,9 @@ 
             else:
                 return util.popen('GIT_DIR=%s %s' % (self.path, s), 'rb')
 
+        def gitpipe(self, s):
+            return util.popen3('GIT_DIR=%s %s' % (self.path, s))
+
     def popen_with_stderr(self, s):
         p = subprocess.Popen(s, shell=True, bufsize=-1,
                              close_fds=util.closefds,
@@ -84,6 +99,12 @@ 
         self.path = path
         self.submodules = []
 
+        self.catfilepipe = self.gitpipe('git cat-file --batch')
+
+    def after(self):
+        for f in self.catfilepipe:
+            f.close()
+
     def getheads(self):
         if not self.rev:
             heads, ret = self.gitread('git rev-parse --branches --remotes')
@@ -98,9 +119,17 @@ 
     def catfile(self, rev, type):
         if rev == hex(nullid):
             raise IOError
-        data, ret = self.gitread("git cat-file %s %s" % (type, rev))
-        if ret:
+        self.catfilepipe[0].write(rev+'\n')
+        self.catfilepipe[0].flush()
+        info = self.catfilepipe[1].readline().split()
+        if info[1] != type:
             raise util.Abort(_('cannot read %r object at %s') % (type, rev))
+        size = int(info[2])
+        data = self.catfilepipe[1].read(size)
+        if len(data) < size:
+            raise util.Abort(_('cannot read %r object at %s: %s') % (type, rev))
+        # read the trailing newline
+        self.catfilepipe[1].read(1)
         return data
 
     def getfile(self, name, rev):