Patchwork D2573: xdiff: remove patience and histogram diffs

login
register
mail settings
Submitter phabricator
Date March 3, 2018, 12:25 a.m.
Message ID <differential-rev-PHID-DREV-kg3rhiofxnkprvxjfhcm-req@phab.mercurial-scm.org>
Download mbox | patch
Permalink /patch/28756/
State Superseded
Headers show

Comments

phabricator - March 3, 2018, 12:25 a.m.
ryanmce created this revision.
Herald added a subscriber: mercurial-devel.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  They are greedy algorithms that yields suboptimal results. Patience diff has
  been advertised as "slower, but generating better results sometimes" for a
  long time. But it's in theory "faster if there are common prefix/suffix
  and/or unique lines, could generate suboptimal results". As demonstrated by
  this test case:
  
    open('a', 'w').write('\n'.join(list('a' + 'x' * 300 + 'u' + 'x' * 700 + 'a\n')))
    open('b', 'w').write('\n'.join(list('b' + 'x' * 700 + 'u' + 'x' * 300 + 'b\n')))
  
  For generating "better" results, the diff sliding heuristic [1] is a better
  solution in general.
  
  So let's just remove patience diff and its variant histogram diff to
  simplify the code.
  
  [1]: https://github.com/git/git/commit/433860f3d0beb0c6f205290bd16cda413148f098

TEST PLAN
  `gcc -fPIC *.c --shared -o xdiff.so` still builds.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D2573

AFFECTED FILES
  mercurial/thirdparty/xdiff/xdiff.h
  mercurial/thirdparty/xdiff/xdiffi.c
  mercurial/thirdparty/xdiff/xprepare.c

CHANGE DETAILS




To: ryanmce, #hg-reviewers
Cc: mercurial-devel
phabricator - March 3, 2018, 8:50 p.m.
indygreg accepted this revision as: indygreg.
indygreg added a comment.


  I'm leaning towards keeping these algorithms so we can expose them as alternate implementations in the future. It will also making syncing code from upstream easier. But removing the unused-by-us code is fine.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D2573

To: ryanmce, #hg-reviewers, indygreg
Cc: indygreg, mercurial-devel
phabricator - March 3, 2018, 9:21 p.m.
quark added a comment.


  Googling "patience diff", most results will say it is slower but has better diff quality sometimes.
  
  That is very misleading - the algorithm adds greediness (i.e. incorrectness) and is in theory faster in some cases. The "better" quality is also untrue comparing to indent heuristic.
  
  Technically, I don't think it's worthwhile to keep - definitely worse quality and unpredictable performance changes comparing to default diff + indent heuristic setup.
  
  From a non-technical point of view, I think educating people that they cannot be misled is very important. This one alone is an enough reason to do the cleanup, imo.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D2573

To: ryanmce, #hg-reviewers, indygreg
Cc: quark, indygreg, mercurial-devel

Patch

diff --git a/mercurial/thirdparty/xdiff/xprepare.c b/mercurial/thirdparty/xdiff/xprepare.c
--- a/mercurial/thirdparty/xdiff/xprepare.c
+++ b/mercurial/thirdparty/xdiff/xprepare.c
@@ -27,7 +27,6 @@ 
 #define XDL_MAX_EQLIMIT 1024
 #define XDL_SIMSCAN_WINDOW 100
 #define XDL_GUESS_NLINES1 256
-#define XDL_GUESS_NLINES2 20
 
 
 typedef struct s_xdlclass {
@@ -181,9 +180,7 @@ 
 	if (!(recs = (xrecord_t **) xdl_malloc(narec * sizeof(xrecord_t *))))
 		goto abort;
 
-	if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF)
-		hbits = hsize = 0;
-	else {
+	{
 		hbits = xdl_hashbits((unsigned int) narec);
 		hsize = 1 << hbits;
 		if (!(rhash = (xrecord_t **) xdl_malloc(hsize * sizeof(xrecord_t *))))
@@ -209,8 +206,7 @@ 
 			crec->ha = hav;
 			recs[nrec++] = crec;
 
-			if ((XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
-			    xdl_classify_record(pass, cf, rhash, hbits, crec) < 0)
+			if (xdl_classify_record(pass, cf, rhash, hbits, crec) < 0)
 				goto abort;
 		}
 	}
@@ -266,21 +262,12 @@ 
 
 	memset(&cf, 0, sizeof(cf));
 
-	/*
-	 * For histogram diff, we can afford a smaller sample size and
-	 * thus a poorer estimate of the number of lines, as the hash
-	 * table (rhash) won't be filled up/grown. The number of lines
-	 * (nrecs) will be updated correctly anyway by
-	 * xdl_prepare_ctx().
-	 */
-	sample = (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF
-		  ? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1);
+	sample = XDL_GUESS_NLINES1;
 
 	enl1 = xdl_guess_lines(mf1, sample) + 1;
 	enl2 = xdl_guess_lines(mf2, sample) + 1;
 
-	if (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF &&
-	    xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0)
+	if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0)
 		return -1;
 
 	if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
@@ -295,18 +282,14 @@ 
 		return -1;
 	}
 
-	if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
-	    (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
-	    xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) {
-
+	if (xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) {
 		xdl_free_ctx(&xe->xdf2);
 		xdl_free_ctx(&xe->xdf1);
 		xdl_free_classifier(&cf);
 		return -1;
 	}
 
-	if (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)
-		xdl_free_classifier(&cf);
+	xdl_free_classifier(&cf);
 
 	return 0;
 }
diff --git a/mercurial/thirdparty/xdiff/xdiffi.c b/mercurial/thirdparty/xdiff/xdiffi.c
--- a/mercurial/thirdparty/xdiff/xdiffi.c
+++ b/mercurial/thirdparty/xdiff/xdiffi.c
@@ -328,12 +328,6 @@ 
 	xdalgoenv_t xenv;
 	diffdata_t dd1, dd2;
 
-	if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF)
-		return xdl_do_patience_diff(mf1, mf2, xpp, xe);
-
-	if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF)
-		return xdl_do_histogram_diff(mf1, mf2, xpp, xe);
-
 	if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {
 
 		return -1;
diff --git a/mercurial/thirdparty/xdiff/xdiff.h b/mercurial/thirdparty/xdiff/xdiff.h
--- a/mercurial/thirdparty/xdiff/xdiff.h
+++ b/mercurial/thirdparty/xdiff/xdiff.h
@@ -27,6 +27,8 @@ 
 extern "C" {
 #endif /* #ifdef __cplusplus */
 
+#include <stddef.h> /* size_t */
+
 /* xpparm_t.flags */
 #define XDF_NEED_MINIMAL (1 << 0)
 
@@ -41,11 +43,6 @@ 
 
 #define XDF_IGNORE_BLANK_LINES (1 << 7)
 
-#define XDF_PATIENCE_DIFF (1 << 14)
-#define XDF_HISTOGRAM_DIFF (1 << 15)
-#define XDF_DIFF_ALGORITHM_MASK (XDF_PATIENCE_DIFF | XDF_HISTOGRAM_DIFF)
-#define XDF_DIFF_ALG(x) ((x) & XDF_DIFF_ALGORITHM_MASK)
-
 #define XDF_INDENT_HEURISTIC (1 << 23)
 
 /* xdemitconf_t.flags */