Patchwork [4,of,4,v3,lazy-manifest] lazymanifest: use a binary search to do an insertion

login
register
mail settings
Submitter Augie Fackler
Date Feb. 6, 2015, 11:24 p.m.
Message ID <d6caa05bf1777676f1d3.1423265069@130.17.16.172.in-addr.arpa>
Download mbox | patch
Permalink /patch/7729/
State Superseded
Commit 542c891274b22bda5ce1cbc6b2b8a8caab9bfd48
Headers show

Comments

Augie Fackler - Feb. 6, 2015, 11:24 p.m.
# HG changeset patch
# User Augie Fackler <augie@google.com>
# Date 1422682240 28800
#      Fri Jan 30 21:30:40 2015 -0800
# Node ID d6caa05bf1777676f1d3906a4d4d2931fca0f9d9
# Parent  b436d0343da36fd8b511754cae10d579de6b6318
lazymanifest: use a binary search to do an insertion

This makes insertions log(n) plus some memmove() overhead, rather than
doing an append followed by an n*log(n) sort. There's probably a lot
of performance to be gained by adding a batch-add method, which could
be smarter about the memmove()s performed.

Includes a couple of extra tests that should help prevent bugs.

Thanks to Martin for some significant pre-mail cleanup of this change.

Patch

diff --git a/mercurial/manifest.c b/mercurial/manifest.c
--- a/mercurial/manifest.c
+++ b/mercurial/manifest.c
@@ -363,6 +363,39 @@  static int lazymanifest_delitem(lazymani
 	return 0;
 }
 
+/* Do a binary search for the insertion point for new, creating the
+ * new entry if needed. */
+static int internalsetitem(lazymanifest *self, line *new) {
+	int start = 0, end = self->numlines;
+	while (start < end) {
+		int pos = start + (end - start) / 2;
+		int c = linecmp(new, self->lines + pos);
+		if (c < 0)
+			end = pos;
+		else if (c > 0)
+			start = pos + 1;
+		else {
+			if (self->lines[pos].deleted)
+				self->livelines++;
+			start = pos;
+			goto finish;
+		}
+	}
+	/* being here means we need to do an insert */
+	if (!realloc_if_full(self)) {
+		PyErr_NoMemory();
+		return -1;
+	}
+	memmove(self->lines + start + 1, self->lines + start,
+		(self->numlines - start) * sizeof(line));
+	self->numlines++;
+	self->livelines++;
+finish:
+	self->lines[start] = *new;
+	self->dirty = true;
+	return 0;
+}
+
 static int lazymanifest_setitem(
 	lazymanifest *self, PyObject *key, PyObject *value)
 {
@@ -378,7 +411,6 @@  static int lazymanifest_setitem(
 	char *dest;
 	int i;
 	line new;
-	line *hit;
 	if (!PyString_Check(key)) {
 		PyErr_Format(PyExc_TypeError,
 			     "setitem: manifest keys must be a string.");
@@ -446,32 +478,8 @@  static int lazymanifest_setitem(
 	}
 	new.from_malloc = true;     /* is `start` a pointer we allocated? */
 	new.deleted = false;        /* is this entry deleted? */
-	hit = bsearch(&new, self->lines, self->numlines,
-		      sizeof(line), &linecmp);
-	self->dirty = true;
-	if (hit) {
-		/* updating a line we already had */
-		if (hit->from_malloc) {
-			free(hit->start);
-		}
-		if (hit->deleted) {
-			self->livelines++;
-		}
-		*hit = new;
-	} else {
-		/* new line */
-		if (!realloc_if_full(self)) {
-			PyErr_NoMemory();
-			return -1;
-		}
-		self->lines[self->numlines++] = new;
-		self->livelines++;
-		/* TODO: do a binary search and insert rather than
-		 * append and qsort. Also offer a batch-insert
-		 * interface, which should be a nice little
-		 * performance win.
-		 */
-		qsort(self->lines, self->numlines, sizeof(line), &linecmp);
+	if (internalsetitem(self, &new)) {
+		return -1;
 	}
 	return 0;
 }
diff --git a/tests/test-manifest.py b/tests/test-manifest.py
--- a/tests/test-manifest.py
+++ b/tests/test-manifest.py
@@ -134,6 +134,10 @@  class testmanifest(unittest.TestCase):
         self.assertRaises(KeyError, lambda : m['foo'])
         self.assertEqual(1, len(m))
         self.assertEqual(1, len(list(m)))
+        # now restore and make sure everything works right
+        m['foo'] = 'a' * 20, ''
+        self.assertEqual(2, len(m))
+        self.assertEqual(2, len(list(m)))
 
     def testManifestDiff(self):
         MISSING = (None, '')