Patchwork [4,of,6,V2] revsbetween: add a C implementation

login
register
mail settings
Submitter Laurent Charignon
Date Aug. 7, 2015, 9:28 a.m.
Message ID <9156d8c324dbaece372d.1438939701@dev919.prn2.facebook.com>
Download mbox | patch
Permalink /patch/10129/
State Superseded
Headers show

Comments

Laurent Charignon - Aug. 7, 2015, 9:28 a.m.
# HG changeset patch
# User Laurent Charignon <lcharignon@fb.com>
# Date 1438921725 25200
#      Thu Aug 06 21:28:45 2015 -0700
# Branch stable
# Node ID 9156d8c324dbaece372df259cb697ad15f10746b
# Parent  4d1b9f4def5070ac3a09dbeb396b745237a49417
revsbetween: add a C implementation

This patch is part of a series of patches to speed up the computation of
revset.revsbetween by introducing a C implementation. The main motivation is to
speed up smartlog on big repositories. At the end of the series, on our big
repositories the computation of revsbetween is 10-50x faster and smartlog on is
2x-5x faster.

This patch introduces a C implementation for revsbetween following closely the
Python implementation but optimized by using C data structures.
Yuya Nishihara - Aug. 7, 2015, 2:42 p.m.
On Fri, 7 Aug 2015 02:28:21 -0700, Laurent Charignon wrote:
> # HG changeset patch
> # User Laurent Charignon <lcharignon@fb.com>
> # Date 1438921725 25200
> #      Thu Aug 06 21:28:45 2015 -0700
> # Branch stable
> # Node ID 9156d8c324dbaece372df259cb697ad15f10746b
> # Parent  4d1b9f4def5070ac3a09dbeb396b745237a49417
> revsbetween: add a C implementation

Just nitpicking...

> +	/* Initialize return set */
> +	reachable = PySet_New(0);

Maybe we should use NULL consistently instead of 0?

> +	if (reachable == NULL) {
> +		PyErr_SetString(PyExc_ValueError, "memory allocation error");
> +		goto bail;
> +	}

IIRC, PyErr_SetString() isn't necessary here. If PyXXX() returns NULL, the
corresponding exception would be set to the current state object.

> +	/* Initialize internal datastructures */
> +	tovisit = (long *)malloc((len + 1) * sizeof(long));

Do we need long width for an array of revisions?

> +	if (tovisit == NULL) {
> +		PyErr_SetString(PyExc_ValueError, "memory allocation error");

PyExc_MemoryError will be more appropriate.

> +release_seen:
> +	if (seen != NULL)
> +		free(seen);

free(NULL) is noop so long as the libc complies with the standard.
Laurent Charignon - Aug. 7, 2015, 5:30 p.m.
> On Aug 7, 2015, at 7:42 AM, Yuya Nishihara <yuya@tcha.org> wrote:
> 
> On Fri, 7 Aug 2015 02:28:21 -0700, Laurent Charignon wrote:
>> # HG changeset patch
>> # User Laurent Charignon <lcharignon@fb.com>
>> # Date 1438921725 25200
>> #      Thu Aug 06 21:28:45 2015 -0700
>> # Branch stable
>> # Node ID 9156d8c324dbaece372df259cb697ad15f10746b
>> # Parent  4d1b9f4def5070ac3a09dbeb396b745237a49417
>> revsbetween: add a C implementation
> 
> Just nitpicking...
> 
>> +	/* Initialize return set */
>> +	reachable = PySet_New(0);
> 
> Maybe we should use NULL consistently instead of 0?

Let's do that in a later patch series if that's okay with you

> 
>> +	if (reachable == NULL) {
>> +		PyErr_SetString(PyExc_ValueError, "memory allocation error");
>> +		goto bail;
>> +	}
> 
> IIRC, PyErr_SetString() isn't necessary here. If PyXXX() returns NULL, the
> corresponding exception would be set to the current state object.

Yes, I checked and that seems right, I will drop that line.
> 
>> +	/* Initialize internal datastructures */
>> +	tovisit = (long *)malloc((len + 1) * sizeof(long));
> 
> Do we need long width for an array of revisions?

No we don't! I will switch to int.

> 
>> +	if (tovisit == NULL) {
>> +		PyErr_SetString(PyExc_ValueError, "memory allocation error");
> 
> PyExc_MemoryError will be more appropriate.

Yes
> 
>> +release_seen:
>> +	if (seen != NULL)
>> +		free(seen);
> 
> free(NULL) is noop so long as the libc complies with the standard.

Ok

Patch

diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -1105,6 +1105,145 @@ 
 		phases[i] = phases[parent_2];
 }
 
+static PyObject *revsbetween(indexObject *self, PyObject *args)
+{
+
+	/* Input */
+	long minroot;
+	PyObject *includepatharg = NULL;
+	int includepath = 0;
+	PyObject *heads = NULL;
+	Py_ssize_t numheads;
+	PyObject *roots = NULL;
+	PyObject *reachable = NULL;
+
+	PyObject *val;
+	Py_ssize_t len = index_length(self) - 1;
+	long revnum;
+	Py_ssize_t k;
+	Py_ssize_t i;
+	int r;
+	int minidx;
+	int parents[2];
+
+	/* Internal data structure:
+	 * tovisit: array of length len+1 (all revs + nullrev), filled upto lentovisit
+	 * seen: array of length len+1 (all revs + nullrev) 0: not seen, 1 seen*/
+	long *tovisit = NULL;
+	long lentovisit = 0;
+	char *seen = NULL;
+
+	/* Get arguments */
+	if (!PyArg_ParseTuple(args, "lOOO", &minroot, &heads, &roots,
+				&includepatharg)) {
+		goto bail;
+	}
+	if (roots == NULL || !PySet_Check(roots))
+		goto bail;
+	if (heads == NULL || !PyList_Check(heads))
+		goto bail;
+	if (includepatharg == NULL || !PyBool_Check(includepatharg))
+		goto bail;
+	if (includepatharg == Py_True)
+		includepath = 1;
+
+	/* Initialize return set */
+	reachable = PySet_New(0);
+	if (reachable == NULL) {
+		PyErr_SetString(PyExc_ValueError, "memory allocation error");
+		goto bail;
+	}
+
+	/* Initialize internal datastructures */
+	tovisit = (long *)malloc((len + 1) * sizeof(long));
+	if (tovisit == NULL) {
+		PyErr_SetString(PyExc_ValueError, "memory allocation error");
+		goto release_reachable;
+	}
+
+	seen = (char *)calloc(len+1, 1);
+	if (seen == NULL) {
+		PyErr_SetString(PyExc_ValueError, "memory allocation error");
+		goto release_tovisit;
+	}
+
+	/* Populate tovisit with all the heads */
+	numheads = PyList_GET_SIZE(heads);
+	for (i = 0; i < numheads; i++) {
+		revnum = PyInt_AS_LONG(PyList_GET_ITEM(heads, i));
+		if (seen[revnum+1] == 0) {
+			tovisit[lentovisit++] = revnum;
+			seen[revnum+1]=1;
+		}
+	}
+
+	/* Visit the tovisit list and find the reachable roots */
+	k = 0;
+	while (k < lentovisit) {
+		/* Add the node to reachable if it is a root*/
+		revnum = tovisit[k++];
+		val = PyInt_FromLong(revnum);
+		if (PySet_Contains(roots, val) == 1) {
+			PySet_Add(reachable, val);
+			if (includepath == 0) {
+				Py_XDECREF(val);
+				continue;
+			}
+		}
+		Py_XDECREF(val);
+
+		/* And its parents to the list of nodes to visit */
+		if (revnum != -1) {
+			r = index_get_parents(self, revnum, parents, (int)len - 1);
+			if (r < 0)
+				goto release_seen;
+
+			for (i = 0; i < 2; i++) {
+				if (seen[parents[i]+1] == 0 && parents[i] >= minroot) {
+					tovisit[lentovisit++] = parents[i];
+					seen[parents[i]+1]=1;
+				}
+			}
+		}
+	}
+
+	/* Find all the nodes in between the roots we found and the heads
+	 * and add them to the reachable set */
+	if (includepath == 1) {
+		minidx = minroot;
+		if (minidx < 0)
+			minidx = 0;
+		for (i = minidx; i < len; i++) {
+			if (seen[i+1] == 1) {
+				r = index_get_parents(self, i, parents, (int)len - 1);
+				/* Corrupted index file, error is set from index_get_parents */
+				if (r < 0)
+					goto release_seen;
+				for (k = 0; k < 2; k++) {
+					PyObject *p = PyInt_FromLong(parents[k]);
+					if (PySet_Contains(reachable, p) == 1)
+						PySet_Add(reachable, PyInt_FromLong(i));
+					Py_XDECREF(p);
+				}
+			}
+		}
+	}
+
+release_seen:
+	if (seen != NULL)
+		free(seen);
+release_tovisit:
+	if (tovisit != NULL)
+		free(tovisit);
+	return reachable;
+release_reachable:
+	Py_XDECREF(reachable);
+bail:
+	val = Py_None;
+	Py_INCREF(Py_None);
+	return val;
+}
+
 static PyObject *compute_phases_map_sets(indexObject *self, PyObject *args)
 {
 	PyObject *roots = Py_None;
@@ -2279,6 +2418,8 @@ 
 	 "get an index entry"},
 	{"computephasesmapsets", (PyCFunction)compute_phases_map_sets,
 			METH_VARARGS, "compute phases"},
+	{"revsbetween", (PyCFunction)revsbetween, METH_VARARGS,
+		"revsbetween"},
 	{"headrevs", (PyCFunction)index_headrevs, METH_VARARGS,
 	 "get head revisions"}, /* Can do filtering since 3.2 */
 	{"headrevsfiltered", (PyCFunction)index_headrevs, METH_VARARGS,