Patchwork [09,of,10] phabricator: avoid calling differential.getrawdiff

login
register
mail settings
Submitter Jun Wu
Date July 5, 2017, 1:58 a.m.
Message ID <0e2c9cf54e09bacb4a01.1499219914@x1c>
Download mbox | patch
Permalink /patch/22006/
State Accepted
Headers show

Comments

Jun Wu - July 5, 2017, 1:58 a.m.
# HG changeset patch
# User Jun Wu <quark@fb.com>
# Date 1499219074 25200
#      Tue Jul 04 18:44:34 2017 -0700
# Node ID 0e2c9cf54e09bacb4a019bc51693a9fecf1051a3
# Parent  0f202460887f66efa3ddbd1d9a7d8afca75c0114
# Available At https://bitbucket.org/quark-zju/hg-draft
#              hg pull https://bitbucket.org/quark-zju/hg-draft -r 0e2c9cf54e09
phabricator: avoid calling differential.getrawdiff

Previously, we call differential.getrawdiff per patch despite we have the
Jun Wu - July 5, 2017, 6:17 p.m.
Excerpts from Jun Wu's message of 2017-07-04 18:58:34 -0700:
> +        path = encoding.strtolocal(c[r'currentPath'])

This should be `unitolocal`, not `strtolocal`. I can send a V2 or it could
be sed -i fixed.
Augie Fackler - July 5, 2017, 10:29 p.m.
On Tue, Jul 04, 2017 at 06:58:34PM -0700, Jun Wu wrote:
> # HG changeset patch
> # User Jun Wu <quark@fb.com>
> # Date 1499219074 25200
> #      Tue Jul 04 18:44:34 2017 -0700
> # Node ID 0e2c9cf54e09bacb4a019bc51693a9fecf1051a3
> # Parent  0f202460887f66efa3ddbd1d9a7d8afca75c0114
> # Available At https://bitbucket.org/quark-zju/hg-draft
> #              hg pull https://bitbucket.org/quark-zju/hg-draft -r 0e2c9cf54e09
> phabricator: avoid calling differential.getrawdiff

[...]

>
> diff --git a/contrib/phabricator.py b/contrib/phabricator.py
> --- a/contrib/phabricator.py
> +++ b/contrib/phabricator.py
> @@ -410,4 +410,95 @@ def getdescfromdrev(drev):
>      return '\n\n'.join(filter(None, [title, summary, testplan, uri]))
>
> +# see https://secure.phabricator.com/diffusion/ARC/browse/master/src/parser/
> +# diff/ArcanistDiffChangeType.php for those constants
> +class changetype(object):
> +    ADD = 1
> +    CHANGE = 2
> +    DELETE = 3
> +    MOVE_AWAY = 4
> +    COPY_AWAY = 5
> +    MOVE_HERE = 6
> +    COPY_HERE = 7
> +    MULTICOPY = 8

Oy. I'm not in love with having to clone their enum here. How stable
can we assume these values are, given that you had to slurp them out
of the arcanist source and not some API documentation?

> +
> +class filetype(object):
> +    TEXT = 1
> +
> +def getpatchbodyfromdiff(diff):
> +    """get patch body (without commit message or metadata) from a diff dict
> +
> +    This is similar to differential.getrawdiff API. But we reconstruct the diff
> +    from a diff object fetched earlier via differential.querydiffs API.
> +
> +    Currently it does not support binary files. However it seems the
> +    "differential.createrawdiff" API (used by phabsend) couldn't really handle
> +    base85 binaries so binary files are broken at uploading time, therefore
> +    this limitation is probably fine.
> +    """
> +    out = util.stringio()
> +    write = out.write
> +
> +    for c in diff[r'changes']:
> +        path = encoding.strtolocal(c[r'currentPath'])
> +        oldpath = encoding.strtolocal(c[r'oldPath'])
> +        patha = 'a/%s' % (oldpath or path)
> +        pathb = 'b/%s' % (path or oldpath)
> +
> +        ftype = int(c[r'fileType'])
> +        if ftype != filetype.TEXT:
> +            raise error.Abort(_('unsupported file type %s: %s') % (ftype, path))
> +
> +        ctype = int(c[r'type'])
> +        if ctype in [changetype.MULTICOPY, changetype.MOVE_AWAY,
> +                     changetype.COPY_AWAY]:
> +            # ignore these, COPY_HERE or MOVE_HERE will cover them
> +            continue
> +
> +        def getmode(name):
> +            s = (c[name] or {}).get(r'unix:filemode', r'100644')
> +            return encoding.strtolocal(s)
> +
> +        oldmode = getmode(r'oldProperties')
> +        newmode = getmode(r'newProperties')
> +
> +        write('diff --git %s %s\n' % (patha, pathb))
> +        if ctype == changetype.ADD:
> +            write('new file mode %s\n' % newmode)
> +            patha = '/dev/null'
> +        elif ctype == changetype.DELETE:
> +            write('deleted file mode %s\n' % oldmode)
> +            pathb = '/dev/null'
> +        else:
> +            if oldmode != newmode:
> +                write('old mode %s\nnew mode %s\n' % (oldmode, newmode))
> +            if ctype == changetype.CHANGE:
> +                pass
> +            elif ctype == changetype.MOVE_HERE:
> +                write('rename from %s\nrename to %s\n' % (oldpath, path))
> +            elif ctype == changetype.COPY_HERE:
> +                write('copy from %s\ncopy to %s\n' % (oldpath, path))
> +            else:
> +                raise error.Abort(_('unsupported change type %s: %s')
> +                                  % (ctype, path))
> +
> +        if c[r'hunks']:
> +            write('--- %s\n+++ %s\n' % (patha, pathb))
> +        for h in c[r'hunks']:
> +            oldoff = int(h[r'oldOffset'])
> +            oldlen = int(h[r'oldLength'])
> +            newoff = int(h[r'newOffset'])
> +            newlen = int(h[r'newLength'])
> +            write('@@ -%d,%d +%d,%d @@\n' % (oldoff, oldlen, newoff, newlen))
> +            write(encoding.strtolocal(h[r'corpus']))
> +
> +    # normalize the patch by trimming context to 3 lines
> +    headers = patch.parsepatch(out.getvalue(), maxcontext=3)
> +    out = util.stringio()
> +    for header in headers:
> +        header.write(out)
> +        for hunk in header.hunks:
> +            hunk.write(out)
> +    return out.getvalue()
> +
>  def readpatch(repo, params, write, stack=False):
>      """generate plain-text patch readable by 'hg import'
> @@ -425,8 +516,6 @@ def readpatch(repo, params, write, stack
>      # Generate patch for each drev
>      for drev in drevs:
> -        repo.ui.note(_('reading D%s\n') % drev[r'id'])
> -
>          diffid = max(int(v) for v in drev[r'diffs'])
> -        body = callconduit(repo, 'differential.getrawdiff', {'diffID': diffid})
> +        body = getpatchbodyfromdiff(diffs[str(diffid)])
>          desc = getdescfromdrev(drev)
>          header = '# HG changeset patch\n'
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Jun Wu - July 5, 2017, 11:19 p.m.
Excerpts from Augie Fackler's message of 2017-07-05 18:29:30 -0400:
> Oy. I'm not in love with having to clone their enum here. How stable
> can we assume these values are, given that you had to slurp them out
> of the arcanist source and not some API documentation?

git log --follow -p shows they never got changed since the first commit of
the repo [1]. There were zero new enums got added. Given the code history
and the fact that changing things here will break existing arcanist client
too, I think the enums are pretty stable.

Talking about documentation, the formal Phabricator API documentation is
very limited - most APIs only have first level type information for input
parameters.  There is almost zero text about nested parameters or output
parameters for APIs like differential.query [2]. So the source code is the
only available documentation.

Note: "differential.query" is marked as "Frozen", but arcanist is still
using it and it seems to be the only way to get the dependency information.

[1]: https://secure.phabricator.com/rARC2e73916fa20456af6b737c1208bfa3da0cc679ec
[2]: https://phab.mercurial-scm.org/conduit/method/differential.query/

Patch

diff content already (returned by differential.querydiffs).

This patch implements the serialization logic of `differential.getrawdiff`
client-side so we no longer need calling differential.getrawdiff. This
removes one API call per patch.

Now fetching a stack of 10 patches is just 2 API calls in the best case.
Worst case batching does not work and it's 11 API calls. The "reading"
verbose message was removed since there is no remote IO needed to read a
patch in that loop now.

Using a test Phabricator instance, this patch reduces `phabread` reading a
10-patch stack from about 8 seconds to 1.3 seconds. Even with batching
disabled, reading that 10-patch stack could be done in 4 seconds.

Test Plan:

Create an interesting (contains multiple copies and renames, mixed mode
change with content changes) patch using the below script:

    hg init

    # long context could be more interesting
    seq 1 200 > long
    for i in a b c d e f g; do
      echo $i > $i
    done
    hg add .
    hg commit -m init

    sed -i '/10/d;s/22$/22.1\n22.2/g' long

    # multiple copies and moves
    for i in a b; do
      for j in 1 2 3 4; do
        hg cp $i "${i}${j}"
        [ $((j % 2)) = 0 ] && chmod +x "${i}${j}"
        [ $j -ge 3 ] && echo append >> "${i}${j}"
      done
    done

    hg rm a
    chmod +x c
    hg rm d
    hg mv e E
    chmod +x E
    rm f
    touch f
    hg commit -m change

Send it using `phabsend`, and retrieve it using `phabread`. Compare before
and after this patch. The only differences are:

  - This patch removes a blank line after a patch.
  - This patch does not omit `,1` in diff hunks. See [1].

Other than that, the patch output is exactly the same. So the new code path
looks reasonably good.

[1]: https://secure.phabricator.com/diffusion/ARC/browse/master/src/parser/
ArcanistBundle.php;e00e2939c164df85852ce2d157de4197b399c30b$631

diff --git a/contrib/phabricator.py b/contrib/phabricator.py
--- a/contrib/phabricator.py
+++ b/contrib/phabricator.py
@@ -410,4 +410,95 @@  def getdescfromdrev(drev):
     return '\n\n'.join(filter(None, [title, summary, testplan, uri]))
 
+# see https://secure.phabricator.com/diffusion/ARC/browse/master/src/parser/
+# diff/ArcanistDiffChangeType.php for those constants
+class changetype(object):
+    ADD = 1
+    CHANGE = 2
+    DELETE = 3
+    MOVE_AWAY = 4
+    COPY_AWAY = 5
+    MOVE_HERE = 6
+    COPY_HERE = 7
+    MULTICOPY = 8
+
+class filetype(object):
+    TEXT = 1
+
+def getpatchbodyfromdiff(diff):
+    """get patch body (without commit message or metadata) from a diff dict
+
+    This is similar to differential.getrawdiff API. But we reconstruct the diff
+    from a diff object fetched earlier via differential.querydiffs API.
+
+    Currently it does not support binary files. However it seems the
+    "differential.createrawdiff" API (used by phabsend) couldn't really handle
+    base85 binaries so binary files are broken at uploading time, therefore
+    this limitation is probably fine.
+    """
+    out = util.stringio()
+    write = out.write
+
+    for c in diff[r'changes']:
+        path = encoding.strtolocal(c[r'currentPath'])
+        oldpath = encoding.strtolocal(c[r'oldPath'])
+        patha = 'a/%s' % (oldpath or path)
+        pathb = 'b/%s' % (path or oldpath)
+
+        ftype = int(c[r'fileType'])
+        if ftype != filetype.TEXT:
+            raise error.Abort(_('unsupported file type %s: %s') % (ftype, path))
+
+        ctype = int(c[r'type'])
+        if ctype in [changetype.MULTICOPY, changetype.MOVE_AWAY,
+                     changetype.COPY_AWAY]:
+            # ignore these, COPY_HERE or MOVE_HERE will cover them
+            continue
+
+        def getmode(name):
+            s = (c[name] or {}).get(r'unix:filemode', r'100644')
+            return encoding.strtolocal(s)
+
+        oldmode = getmode(r'oldProperties')
+        newmode = getmode(r'newProperties')
+
+        write('diff --git %s %s\n' % (patha, pathb))
+        if ctype == changetype.ADD:
+            write('new file mode %s\n' % newmode)
+            patha = '/dev/null'
+        elif ctype == changetype.DELETE:
+            write('deleted file mode %s\n' % oldmode)
+            pathb = '/dev/null'
+        else:
+            if oldmode != newmode:
+                write('old mode %s\nnew mode %s\n' % (oldmode, newmode))
+            if ctype == changetype.CHANGE:
+                pass
+            elif ctype == changetype.MOVE_HERE:
+                write('rename from %s\nrename to %s\n' % (oldpath, path))
+            elif ctype == changetype.COPY_HERE:
+                write('copy from %s\ncopy to %s\n' % (oldpath, path))
+            else:
+                raise error.Abort(_('unsupported change type %s: %s')
+                                  % (ctype, path))
+
+        if c[r'hunks']:
+            write('--- %s\n+++ %s\n' % (patha, pathb))
+        for h in c[r'hunks']:
+            oldoff = int(h[r'oldOffset'])
+            oldlen = int(h[r'oldLength'])
+            newoff = int(h[r'newOffset'])
+            newlen = int(h[r'newLength'])
+            write('@@ -%d,%d +%d,%d @@\n' % (oldoff, oldlen, newoff, newlen))
+            write(encoding.strtolocal(h[r'corpus']))
+
+    # normalize the patch by trimming context to 3 lines
+    headers = patch.parsepatch(out.getvalue(), maxcontext=3)
+    out = util.stringio()
+    for header in headers:
+        header.write(out)
+        for hunk in header.hunks:
+            hunk.write(out)
+    return out.getvalue()
+
 def readpatch(repo, params, write, stack=False):
     """generate plain-text patch readable by 'hg import'
@@ -425,8 +516,6 @@  def readpatch(repo, params, write, stack
     # Generate patch for each drev
     for drev in drevs:
-        repo.ui.note(_('reading D%s\n') % drev[r'id'])
-
         diffid = max(int(v) for v in drev[r'diffs'])
-        body = callconduit(repo, 'differential.getrawdiff', {'diffID': diffid})
+        body = getpatchbodyfromdiff(diffs[str(diffid)])
         desc = getdescfromdrev(drev)
         header = '# HG changeset patch\n'