Patchwork [2,of,2] bdiff: early pruning of common suffix before doing expensive computations

login
register
mail settings
Submitter Mads Kiilerich
Date Nov. 17, 2016, 5:16 p.m.
Message ID <851a14d5b80639efe61b.1479403000@madski>
Download mbox | patch
Permalink /patch/17618/
State Accepted
Headers show

Comments

Mads Kiilerich - Nov. 17, 2016, 5:16 p.m.
# HG changeset patch
# User Mads Kiilerich <madski@unity3d.com>
# Date 1479322051 -3600
#      Wed Nov 16 19:47:31 2016 +0100
# Node ID 851a14d5b80639efe61bd01ad215479c99106b40
# Parent  7f39bccc1c96bffc83f3c6e774da57ecd38982a7
bdiff: early pruning of common suffix before doing expensive computations

Similar to the previous change, but trimming trailing lines.

On mozilla-unified:
perfbdiff -m 3041e4d59df2
! wall 0.024618 comb 0.020000 user 0.020000 sys 0.000000 (best of 116) to
! wall 0.008259 comb 0.010000 user 0.010000 sys 0.000000 (best of 356) to
perfbdiff 0e9928989e9c --alldata --count 10
! wall 0.579235 comb 0.580000 user 0.580000 sys 0.000000 (best of 18)
! wall 0.469869 comb 0.460000 user 0.460000 sys 0.000000 (best of 22)
Gregory Szorc - Nov. 17, 2016, 7:26 p.m.
On Thu, Nov 17, 2016 at 9:16 AM, Mads Kiilerich <mads@kiilerich.com> wrote:

> # HG changeset patch
> # User Mads Kiilerich <madski@unity3d.com>
> # Date 1479322051 -3600
> #      Wed Nov 16 19:47:31 2016 +0100
> # Node ID 851a14d5b80639efe61bd01ad215479c99106b40
> # Parent  7f39bccc1c96bffc83f3c6e774da57ecd38982a7
> bdiff: early pruning of common suffix before doing expensive computations
>
> Similar to the previous change, but trimming trailing lines.
>
> On mozilla-unified:
> perfbdiff -m 3041e4d59df2
> ! wall 0.024618 comb 0.020000 user 0.020000 sys 0.000000 (best of 116) to
> ! wall 0.008259 comb 0.010000 user 0.010000 sys 0.000000 (best of 356) to
> perfbdiff 0e9928989e9c --alldata --count 10
> ! wall 0.579235 comb 0.580000 user 0.580000 sys 0.000000 (best of 18)
> ! wall 0.469869 comb 0.460000 user 0.460000 sys 0.000000 (best of 22)
>
>
I can collaborate the perf improvements. On mozilla-unified (using a
different clone and machine from what I wrote in my commit messages
reporting bdiff perf - but the CPU model is the same), these 2 patches
result in:

$ perfbdiff -m 3041e4d59df2
! wall 0.040755 comb 0.040000 user 0.040000 sys 0.000000 (best of 100)
! wall 0.007953 comb 0.010000 user 0.010000 sys 0.000000 (best of 304)

$  perfbdiff 0e9928989e9c --alldata --count 100
! wall 4.577779 comb 4.580000 user 4.580000 sys 0.000000 (best of 3)
! wall 1.748509 comb 1.750000 user 1.750000 sys 0.000000 (best of 6)

Compared to 4.0:
$ perfbdiff -m 3041e4d59df2
! wall 0.076924 comb 0.080000 user 0.080000 sys 0.000000 (best of 100)
! wall 0.007953 comb 0.010000 user 0.010000 sys 0.000000 (best of 304)

$  perfbdiff 0e9928989e9c --alldata --count 100
! wall 8.892216 comb 8.880000 user 8.880000 sys 0.000000 (best of 3)
! wall 1.748509 comb 1.750000 user 1.750000 sys 0.000000 (best of 6)

Looking at the assembly and profiling output, I believe I've confirmed my
suspicions from my comment on part 1: there is still room to optimize this
code. Again, that can be deferred to a follow-up.


> diff --git a/mercurial/bdiff_module.c b/mercurial/bdiff_module.c
> --- a/mercurial/bdiff_module.c
> +++ b/mercurial/bdiff_module.c
> @@ -66,7 +66,7 @@ static PyObject *bdiff(PyObject *self, P
>         struct bdiff_line *al, *bl;
>         struct bdiff_hunk l, *h;
>         int an, bn, count;
> -       Py_ssize_t len = 0, la, lb, li = 0, lcommon = 0, lmax;
> +       Py_ssize_t len = 0, la, lb, li = 0, lcommon = 0, lcommonend = 0,
> lmax;
>         PyThreadState *_save;
>
>         l.next = NULL;
> @@ -88,9 +88,16 @@ static PyObject *bdiff(PyObject *self, P
>                 if (*ia == '\n')
>                         lcommon = li + 1;
>         /* we can almost add: if (li == lmax) lcommon = li; */
> +       lmax -= lcommon;
> +       for (li = 0, ia = sa + la - 1, ib = sb + lb - 1;
> +            li <= lmax && *ia == *ib;
> +            ++li, --ia, --ib)
> +               if (*ia == '\n')
> +                       lcommonend = li;
> +       /* printf("common %i %i\n", lcommon, lcommonend); */
>
> -       an = bdiff_splitlines(sa + lcommon, la - lcommon, &al);
> -       bn = bdiff_splitlines(sb + lcommon, lb - lcommon, &bl);
> +       an = bdiff_splitlines(sa + lcommon, la - lcommon - lcommonend,
> &al);
> +       bn = bdiff_splitlines(sb + lcommon, lb - lcommon - lcommonend,
> &bl);
>         if (!al || !bl)
>                 goto nomem;
>
> diff --git a/tests/test-bdiff.py.out b/tests/test-bdiff.py.out
> --- a/tests/test-bdiff.py.out
> +++ b/tests/test-bdiff.py.out
> @@ -28,9 +28,9 @@ showdiff(
>    'x\n\nx\n\ny\n\nx\n\ny\n\nx\n\nz\n'):
>   'x\n\nx\n\n'
>   6 6 '' -> 'y\n\n'
> - 'x\n\n'
> - 9 9 '' -> 'y\n\n'
> - 'x\n\nz\n'
> + 'x\n'
> + 8 8 '' -> '\ny\n'
> + '\nx\n\nz\n'
>  showdiff(
>    'a\nb\nb\nb\nc\n.\nd\ne\n.\nf\n',
>    'a\nb\nb\na\nb\nb\nb\nc\n.\nb\nc\n.\nd\ne\nf\n'):
> diff --git a/tests/test-check-code.t b/tests/test-check-code.t
> --- a/tests/test-check-code.t
> +++ b/tests/test-check-code.t
> @@ -15,10 +15,6 @@ New errors are not allowed. Warnings are
>    Skipping hgext/fsmonitor/pywatchman/msc_stdint.h it has no-che?k-code
> (glob)
>    Skipping hgext/fsmonitor/pywatchman/pybser.py it has no-che?k-code
> (glob)
>    Skipping i18n/polib.py it has no-che?k-code (glob)
> -  mercurial/bdiff_module.c:89:
> -   >   //printf("common prefix: %i next line: %i\n", li, llf);
> -   don't use //-style comments
>    Skipping mercurial/httpclient/__init__.py it has no-che?k-code (glob)
>    Skipping mercurial/httpclient/_readers.py it has no-che?k-code (glob)
>    Skipping mercurial/statprof.py it has no-che?k-code (glob)
> -  [1]
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>
Gregory Szorc - Nov. 17, 2016, 8:29 p.m.
On Thu, Nov 17, 2016 at 11:26 AM, Gregory Szorc <gregory.szorc@gmail.com>
wrote:

> On Thu, Nov 17, 2016 at 9:16 AM, Mads Kiilerich <mads@kiilerich.com>
> wrote:
>
>> # HG changeset patch
>> # User Mads Kiilerich <madski@unity3d.com>
>> # Date 1479322051 -3600
>> #      Wed Nov 16 19:47:31 2016 +0100
>> # Node ID 851a14d5b80639efe61bd01ad215479c99106b40
>> # Parent  7f39bccc1c96bffc83f3c6e774da57ecd38982a7
>> bdiff: early pruning of common suffix before doing expensive computations
>>
>> Similar to the previous change, but trimming trailing lines.
>>
>> On mozilla-unified:
>> perfbdiff -m 3041e4d59df2
>> ! wall 0.024618 comb 0.020000 user 0.020000 sys 0.000000 (best of 116) to
>> ! wall 0.008259 comb 0.010000 user 0.010000 sys 0.000000 (best of 356) to
>> perfbdiff 0e9928989e9c --alldata --count 10
>> ! wall 0.579235 comb 0.580000 user 0.580000 sys 0.000000 (best of 18)
>> ! wall 0.469869 comb 0.460000 user 0.460000 sys 0.000000 (best of 22)
>>
>>
> I can collaborate the perf improvements. On mozilla-unified (using a
> different clone and machine from what I wrote in my commit messages
> reporting bdiff perf - but the CPU model is the same), these 2 patches
> result in:
>
> $ perfbdiff -m 3041e4d59df2
> ! wall 0.040755 comb 0.040000 user 0.040000 sys 0.000000 (best of 100)
> ! wall 0.007953 comb 0.010000 user 0.010000 sys 0.000000 (best of 304)
>
> $  perfbdiff 0e9928989e9c --alldata --count 100
> ! wall 4.577779 comb 4.580000 user 4.580000 sys 0.000000 (best of 3)
> ! wall 1.748509 comb 1.750000 user 1.750000 sys 0.000000 (best of 6)
>
> Compared to 4.0:
> $ perfbdiff -m 3041e4d59df2
> ! wall 0.076924 comb 0.080000 user 0.080000 sys 0.000000 (best of 100)
> ! wall 0.007953 comb 0.010000 user 0.010000 sys 0.000000 (best of 304)
>
> $  perfbdiff 0e9928989e9c --alldata --count 100
> ! wall 8.892216 comb 8.880000 user 8.880000 sys 0.000000 (best of 3)
> ! wall 1.748509 comb 1.750000 user 1.750000 sys 0.000000 (best of 6)
>
> Looking at the assembly and profiling output, I believe I've confirmed my
> suspicions from my comment on part 1: there is still room to optimize this
> code. Again, that can be deferred to a follow-up.
>

It's worth looking at where we're spending CPU time after this series.
Measuring `perfbdiff 0e9928989e9c --alldata --count 100`:

This series
40% bdiff_module.c:bdiff()
38% bidff.c:bdiff_splitlines()
12% bdiff.c:bdiff_diff()
 5% __memcmp_sse4_1 (not sure where exactly this is called from)
 4% bdiff.c:recurse()

Before prefix and suffix matching:
62% bdiff.c:bdiff_splitlines()
23% bdiff.c.bdiff_diff()
 8% memcmp_sse4_1
 5% bdiff.c:recurse()

4.0
82% bdiff.c:bdiff_splitlines()
10% bdiff.c:bdiff_diff()
 4.5% __memcmp_sse4_1
 2.5% bdiff.c:recurse()

This series shifted time out of bdiff_splitlines() *and* bdiff_diff().
Furthermore, from 4.0 to this series, the ratio of time spent before
bdiff_diff() and inside is roughly the same, despite things getting 5-10x
faster in that time! It is alarming we're still spending so much time in
what is effectively set-up to run an algorithm. But this is also a good
thing: we know the setup code can still be optimized which means there are
still significant perf wins on the table.


>
>> diff --git a/mercurial/bdiff_module.c b/mercurial/bdiff_module.c
>> --- a/mercurial/bdiff_module.c
>> +++ b/mercurial/bdiff_module.c
>> @@ -66,7 +66,7 @@ static PyObject *bdiff(PyObject *self, P
>>         struct bdiff_line *al, *bl;
>>         struct bdiff_hunk l, *h;
>>         int an, bn, count;
>> -       Py_ssize_t len = 0, la, lb, li = 0, lcommon = 0, lmax;
>> +       Py_ssize_t len = 0, la, lb, li = 0, lcommon = 0, lcommonend = 0,
>> lmax;
>>         PyThreadState *_save;
>>
>>         l.next = NULL;
>> @@ -88,9 +88,16 @@ static PyObject *bdiff(PyObject *self, P
>>                 if (*ia == '\n')
>>                         lcommon = li + 1;
>>         /* we can almost add: if (li == lmax) lcommon = li; */
>> +       lmax -= lcommon;
>> +       for (li = 0, ia = sa + la - 1, ib = sb + lb - 1;
>> +            li <= lmax && *ia == *ib;
>> +            ++li, --ia, --ib)
>> +               if (*ia == '\n')
>> +                       lcommonend = li;
>> +       /* printf("common %i %i\n", lcommon, lcommonend); */
>>
>> -       an = bdiff_splitlines(sa + lcommon, la - lcommon, &al);
>> -       bn = bdiff_splitlines(sb + lcommon, lb - lcommon, &bl);
>> +       an = bdiff_splitlines(sa + lcommon, la - lcommon - lcommonend,
>> &al);
>> +       bn = bdiff_splitlines(sb + lcommon, lb - lcommon - lcommonend,
>> &bl);
>>         if (!al || !bl)
>>                 goto nomem;
>>
>> diff --git a/tests/test-bdiff.py.out b/tests/test-bdiff.py.out
>> --- a/tests/test-bdiff.py.out
>> +++ b/tests/test-bdiff.py.out
>> @@ -28,9 +28,9 @@ showdiff(
>>    'x\n\nx\n\ny\n\nx\n\ny\n\nx\n\nz\n'):
>>   'x\n\nx\n\n'
>>   6 6 '' -> 'y\n\n'
>> - 'x\n\n'
>> - 9 9 '' -> 'y\n\n'
>> - 'x\n\nz\n'
>> + 'x\n'
>> + 8 8 '' -> '\ny\n'
>> + '\nx\n\nz\n'
>>  showdiff(
>>    'a\nb\nb\nb\nc\n.\nd\ne\n.\nf\n',
>>    'a\nb\nb\na\nb\nb\nb\nc\n.\nb\nc\n.\nd\ne\nf\n'):
>> diff --git a/tests/test-check-code.t b/tests/test-check-code.t
>> --- a/tests/test-check-code.t
>> +++ b/tests/test-check-code.t
>> @@ -15,10 +15,6 @@ New errors are not allowed. Warnings are
>>    Skipping hgext/fsmonitor/pywatchman/msc_stdint.h it has no-che?k-code
>> (glob)
>>    Skipping hgext/fsmonitor/pywatchman/pybser.py it has no-che?k-code
>> (glob)
>>    Skipping i18n/polib.py it has no-che?k-code (glob)
>> -  mercurial/bdiff_module.c:89:
>> -   >   //printf("common prefix: %i next line: %i\n", li, llf);
>> -   don't use //-style comments
>>    Skipping mercurial/httpclient/__init__.py it has no-che?k-code (glob)
>>    Skipping mercurial/httpclient/_readers.py it has no-che?k-code (glob)
>>    Skipping mercurial/statprof.py it has no-che?k-code (glob)
>> -  [1]
>> _______________________________________________
>> Mercurial-devel mailing list
>> Mercurial-devel@mercurial-scm.org
>> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>>
>
>
Yuya Nishihara - Dec. 6, 2016, 1:54 p.m.
On Thu, 17 Nov 2016 18:16:40 +0100, Mads Kiilerich wrote:
> # HG changeset patch
> # User Mads Kiilerich <madski@unity3d.com>
> # Date 1479322051 -3600
> #      Wed Nov 16 19:47:31 2016 +0100
> # Node ID 851a14d5b80639efe61bd01ad215479c99106b40
> # Parent  7f39bccc1c96bffc83f3c6e774da57ecd38982a7
> bdiff: early pruning of common suffix before doing expensive computations
> 
> Similar to the previous change, but trimming trailing lines.
> 
> On mozilla-unified:
> perfbdiff -m 3041e4d59df2
> ! wall 0.024618 comb 0.020000 user 0.020000 sys 0.000000 (best of 116) to
> ! wall 0.008259 comb 0.010000 user 0.010000 sys 0.000000 (best of 356) to
> perfbdiff 0e9928989e9c --alldata --count 10
> ! wall 0.579235 comb 0.580000 user 0.580000 sys 0.000000 (best of 18)
> ! wall 0.469869 comb 0.460000 user 0.460000 sys 0.000000 (best of 22)
> 
> diff --git a/mercurial/bdiff_module.c b/mercurial/bdiff_module.c
> --- a/mercurial/bdiff_module.c
> +++ b/mercurial/bdiff_module.c
> @@ -66,7 +66,7 @@ static PyObject *bdiff(PyObject *self, P
>  	struct bdiff_line *al, *bl;
>  	struct bdiff_hunk l, *h;
>  	int an, bn, count;
> -	Py_ssize_t len = 0, la, lb, li = 0, lcommon = 0, lmax;
> +	Py_ssize_t len = 0, la, lb, li = 0, lcommon = 0, lcommonend = 0, lmax;
>  	PyThreadState *_save;
>  
>  	l.next = NULL;
> @@ -88,9 +88,16 @@ static PyObject *bdiff(PyObject *self, P
>  		if (*ia == '\n')
>  			lcommon = li + 1;
>  	/* we can almost add: if (li == lmax) lcommon = li; */
> +	lmax -= lcommon;
> +	for (li = 0, ia = sa + la - 1, ib = sb + lb - 1;
> +	     li <= lmax && *ia == *ib;
> +	     ++li, --ia, --ib)
> +		if (*ia == '\n')
> +			lcommonend = li;

Perhaps this could read one byte before the sa and sb. For bdiff('', ''),
li == lmax == 0 but ia == sa[-1].

Patch

diff --git a/mercurial/bdiff_module.c b/mercurial/bdiff_module.c
--- a/mercurial/bdiff_module.c
+++ b/mercurial/bdiff_module.c
@@ -66,7 +66,7 @@  static PyObject *bdiff(PyObject *self, P
 	struct bdiff_line *al, *bl;
 	struct bdiff_hunk l, *h;
 	int an, bn, count;
-	Py_ssize_t len = 0, la, lb, li = 0, lcommon = 0, lmax;
+	Py_ssize_t len = 0, la, lb, li = 0, lcommon = 0, lcommonend = 0, lmax;
 	PyThreadState *_save;
 
 	l.next = NULL;
@@ -88,9 +88,16 @@  static PyObject *bdiff(PyObject *self, P
 		if (*ia == '\n')
 			lcommon = li + 1;
 	/* we can almost add: if (li == lmax) lcommon = li; */
+	lmax -= lcommon;
+	for (li = 0, ia = sa + la - 1, ib = sb + lb - 1;
+	     li <= lmax && *ia == *ib;
+	     ++li, --ia, --ib)
+		if (*ia == '\n')
+			lcommonend = li;
+	/* printf("common %i %i\n", lcommon, lcommonend); */
 
-	an = bdiff_splitlines(sa + lcommon, la - lcommon, &al);
-	bn = bdiff_splitlines(sb + lcommon, lb - lcommon, &bl);
+	an = bdiff_splitlines(sa + lcommon, la - lcommon - lcommonend, &al);
+	bn = bdiff_splitlines(sb + lcommon, lb - lcommon - lcommonend, &bl);
 	if (!al || !bl)
 		goto nomem;
 
diff --git a/tests/test-bdiff.py.out b/tests/test-bdiff.py.out
--- a/tests/test-bdiff.py.out
+++ b/tests/test-bdiff.py.out
@@ -28,9 +28,9 @@  showdiff(
   'x\n\nx\n\ny\n\nx\n\ny\n\nx\n\nz\n'):
  'x\n\nx\n\n'
  6 6 '' -> 'y\n\n'
- 'x\n\n'
- 9 9 '' -> 'y\n\n'
- 'x\n\nz\n'
+ 'x\n'
+ 8 8 '' -> '\ny\n'
+ '\nx\n\nz\n'
 showdiff(
   'a\nb\nb\nb\nc\n.\nd\ne\n.\nf\n',
   'a\nb\nb\na\nb\nb\nb\nc\n.\nb\nc\n.\nd\ne\nf\n'):
diff --git a/tests/test-check-code.t b/tests/test-check-code.t
--- a/tests/test-check-code.t
+++ b/tests/test-check-code.t
@@ -15,10 +15,6 @@  New errors are not allowed. Warnings are
   Skipping hgext/fsmonitor/pywatchman/msc_stdint.h it has no-che?k-code (glob)
   Skipping hgext/fsmonitor/pywatchman/pybser.py it has no-che?k-code (glob)
   Skipping i18n/polib.py it has no-che?k-code (glob)
-  mercurial/bdiff_module.c:89:
-   > 	//printf("common prefix: %i next line: %i\n", li, llf);
-   don't use //-style comments
   Skipping mercurial/httpclient/__init__.py it has no-che?k-code (glob)
   Skipping mercurial/httpclient/_readers.py it has no-che?k-code (glob)
   Skipping mercurial/statprof.py it has no-che?k-code (glob)
-  [1]