Patchwork diff: do not split function name if character encoding is unknown

login
register
mail settings
Submitter Yuya Nishihara
Date Feb. 23, 2018, 2:53 p.m.
Message ID <98cfd7926442dc0a649e.1519397598@mimosa>
Download mbox | patch
Permalink /patch/28277/
State Accepted
Headers show

Comments

Yuya Nishihara - Feb. 23, 2018, 2:53 p.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1519394998 -32400
#      Fri Feb 23 23:09:58 2018 +0900
# Node ID 98cfd7926442dc0a649e0359455ad6962815bd13
# Parent  b8d0761a85c7421071750de23228415306852d69
diff: do not split function name if character encoding is unknown

Only ASCII characters can be split reliably at any byte positions, so let's
just leave long multi-byte sequence long. It's probably less bad than putting
an invalid byte sequence into a diff.

This doesn't try to split the first ASCII slice from multi-byte sequence
because a combining character may follow.
Josef 'Jeff' Sipek - Feb. 23, 2018, 4:03 p.m.
On Fri, Feb 23, 2018 at 23:53:18 +0900, Yuya Nishihara wrote:
> # HG changeset patch
> # User Yuya Nishihara <yuya@tcha.org>
> # Date 1519394998 -32400
> #      Fri Feb 23 23:09:58 2018 +0900
> # Node ID 98cfd7926442dc0a649e0359455ad6962815bd13
> # Parent  b8d0761a85c7421071750de23228415306852d69
> diff: do not split function name if character encoding is unknown
> 
> Only ASCII characters can be split reliably at any byte positions, so let's
> just leave long multi-byte sequence long. It's probably less bad than putting
> an invalid byte sequence into a diff.
> 
> This doesn't try to split the first ASCII slice from multi-byte sequence
> because a combining character may follow.

I like it!

Thanks,

Jeff.

> 
> diff --git a/mercurial/mdiff.py b/mercurial/mdiff.py
> --- a/mercurial/mdiff.py
> +++ b/mercurial/mdiff.py
> @@ -13,6 +13,7 @@ import zlib
>  
>  from .i18n import _
>  from . import (
> +    encoding,
>      error,
>      policy,
>      pycompat,
> @@ -348,7 +349,11 @@ def _unidiff(t1, t2, opts=defaultopts):
>              # alphanumeric char.
>              for i in xrange(astart - 1, lastpos - 1, -1):
>                  if l1[i][0:1].isalnum():
> -                    func = ' ' + l1[i].rstrip()[:40]
> +                    func = b' ' + l1[i].rstrip()
> +                    # split long function name if ASCII. otherwise we have no
> +                    # idea where the multi-byte boundary is, so just leave it.
> +                    if encoding.isasciistr(func):
> +                        func = func[:41]
>                      lastfunc[1] = func
>                      break
>              # by recording this hunk's starting point as the next place to
> diff --git a/tests/test-diff-unified.t b/tests/test-diff-unified.t
> --- a/tests/test-diff-unified.t
> +++ b/tests/test-diff-unified.t
> @@ -386,3 +386,73 @@ If [diff] git is set to true, but the us
>     }
>  
>    $ cd ..
> +
> +Long function names should be abbreviated, but multi-byte character shouldn't
> +be broken up
> +
> +  $ hg init longfunc
> +  $ cd longfunc
> +
> +  >>> with open('a', 'wb') as f:
> +  ...     f.write(b'a' * 39 + b'bb' + b'\n')
> +  ...     f.write(b' .\n' * 3)
> +  ...     f.write(b' 0 b\n')
> +  ...     f.write(b' .\n' * 3)
> +  ...     f.write(b'a' * 39 + b'\xc3\xa0' + b'\n')
> +  ...     f.write(b' .\n' * 3)
> +  ...     f.write(b' 0 a with grave (single code point)\n')
> +  ...     f.write(b' .\n' * 3)
> +  ...     f.write(b'a' * 39 + b'a\xcc\x80' + b'\n')
> +  ...     f.write(b' .\n' * 3)
> +  ...     f.write(b' 0 a with grave (composition)\n')
> +  ...     f.write(b' .\n' * 3)
> +  $ hg ci -qAm0
> +
> +  >>> with open('a', 'wb') as f:
> +  ...     f.write(b'a' * 39 + b'bb' + b'\n')
> +  ...     f.write(b' .\n' * 3)
> +  ...     f.write(b' 1 b\n')
> +  ...     f.write(b' .\n' * 3)
> +  ...     f.write(b'a' * 39 + b'\xc3\xa0' + b'\n')
> +  ...     f.write(b' .\n' * 3)
> +  ...     f.write(b' 1 a with grave (single code point)\n')
> +  ...     f.write(b' .\n' * 3)
> +  ...     f.write(b'a' * 39 + b'a\xcc\x80' + b'\n')
> +  ...     f.write(b' .\n' * 3)
> +  ...     f.write(b' 1 a with grave (composition)\n')
> +  ...     f.write(b' .\n' * 3)
> +  $ hg ci -m1
> +
> +  $ hg diff -c1 --nodates --show-function
> +  diff -r 3e92dd6fa812 -r a256341606cb a
> +  --- a/a
> +  +++ b/a
> +  @@ -2,7 +2,7 @@ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab
> +    .
> +    .
> +    .
> +  - 0 b
> +  + 1 b
> +    .
> +    .
> +    .
> +  @@ -10,7 +10,7 @@ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\xc3\xa0 (esc)
> +    .
> +    .
> +    .
> +  - 0 a with grave (single code point)
> +  + 1 a with grave (single code point)
> +    .
> +    .
> +    .
> +  @@ -18,7 +18,7 @@ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\xcc\x80 (esc)
> +    .
> +    .
> +    .
> +  - 0 a with grave (composition)
> +  + 1 a with grave (composition)
> +    .
> +    .
> +    .
> +
> +  $ cd ..
Augie Fackler - Feb. 26, 2018, 4:10 a.m.
On Fri, Feb 23, 2018 at 11:53:18PM +0900, Yuya Nishihara wrote:
> # HG changeset patch
> # User Yuya Nishihara <yuya@tcha.org>
> # Date 1519394998 -32400
> #      Fri Feb 23 23:09:58 2018 +0900
> # Node ID 98cfd7926442dc0a649e0359455ad6962815bd13
> # Parent  b8d0761a85c7421071750de23228415306852d69
> diff: do not split function name if character encoding is unknown

queued, thanks

Patch

diff --git a/mercurial/mdiff.py b/mercurial/mdiff.py
--- a/mercurial/mdiff.py
+++ b/mercurial/mdiff.py
@@ -13,6 +13,7 @@  import zlib
 
 from .i18n import _
 from . import (
+    encoding,
     error,
     policy,
     pycompat,
@@ -348,7 +349,11 @@  def _unidiff(t1, t2, opts=defaultopts):
             # alphanumeric char.
             for i in xrange(astart - 1, lastpos - 1, -1):
                 if l1[i][0:1].isalnum():
-                    func = ' ' + l1[i].rstrip()[:40]
+                    func = b' ' + l1[i].rstrip()
+                    # split long function name if ASCII. otherwise we have no
+                    # idea where the multi-byte boundary is, so just leave it.
+                    if encoding.isasciistr(func):
+                        func = func[:41]
                     lastfunc[1] = func
                     break
             # by recording this hunk's starting point as the next place to
diff --git a/tests/test-diff-unified.t b/tests/test-diff-unified.t
--- a/tests/test-diff-unified.t
+++ b/tests/test-diff-unified.t
@@ -386,3 +386,73 @@  If [diff] git is set to true, but the us
    }
 
   $ cd ..
+
+Long function names should be abbreviated, but multi-byte character shouldn't
+be broken up
+
+  $ hg init longfunc
+  $ cd longfunc
+
+  >>> with open('a', 'wb') as f:
+  ...     f.write(b'a' * 39 + b'bb' + b'\n')
+  ...     f.write(b' .\n' * 3)
+  ...     f.write(b' 0 b\n')
+  ...     f.write(b' .\n' * 3)
+  ...     f.write(b'a' * 39 + b'\xc3\xa0' + b'\n')
+  ...     f.write(b' .\n' * 3)
+  ...     f.write(b' 0 a with grave (single code point)\n')
+  ...     f.write(b' .\n' * 3)
+  ...     f.write(b'a' * 39 + b'a\xcc\x80' + b'\n')
+  ...     f.write(b' .\n' * 3)
+  ...     f.write(b' 0 a with grave (composition)\n')
+  ...     f.write(b' .\n' * 3)
+  $ hg ci -qAm0
+
+  >>> with open('a', 'wb') as f:
+  ...     f.write(b'a' * 39 + b'bb' + b'\n')
+  ...     f.write(b' .\n' * 3)
+  ...     f.write(b' 1 b\n')
+  ...     f.write(b' .\n' * 3)
+  ...     f.write(b'a' * 39 + b'\xc3\xa0' + b'\n')
+  ...     f.write(b' .\n' * 3)
+  ...     f.write(b' 1 a with grave (single code point)\n')
+  ...     f.write(b' .\n' * 3)
+  ...     f.write(b'a' * 39 + b'a\xcc\x80' + b'\n')
+  ...     f.write(b' .\n' * 3)
+  ...     f.write(b' 1 a with grave (composition)\n')
+  ...     f.write(b' .\n' * 3)
+  $ hg ci -m1
+
+  $ hg diff -c1 --nodates --show-function
+  diff -r 3e92dd6fa812 -r a256341606cb a
+  --- a/a
+  +++ b/a
+  @@ -2,7 +2,7 @@ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab
+    .
+    .
+    .
+  - 0 b
+  + 1 b
+    .
+    .
+    .
+  @@ -10,7 +10,7 @@ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\xc3\xa0 (esc)
+    .
+    .
+    .
+  - 0 a with grave (single code point)
+  + 1 a with grave (single code point)
+    .
+    .
+    .
+  @@ -18,7 +18,7 @@ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\xcc\x80 (esc)
+    .
+    .
+    .
+  - 0 a with grave (composition)
+  + 1 a with grave (composition)
+    .
+    .
+    .
+
+  $ cd ..