Patchwork D9878: revlog: initial version of phash index [POC]

login
register
mail settings
Submitter phabricator
Date Jan. 26, 2021, 9:02 p.m.
Message ID <differential-rev-PHID-DREV-zgxovmkgguhc47kfltsp-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/48189/
State New
Headers show

Comments

phabricator - Jan. 26, 2021, 9:02 p.m.
joerg.sonnenberger created this revision.
Herald added a reviewer: indygreg.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  Integration is still somewhat hackish, but a reasonable start for a PoC.
  Comparing it to Rust is not entirely fair as the additional Python
  function in between dominates the runtime. The generator is pure Python
  at the moment and at least a factor of 25 slower than the comparable C
  extension. No fallback path for hash function yet to a real universal
  hash (as opposed to just assuming the node hash is well distributed
  enough). Most interesting baseline: pure Python lookup via the hash
  function is a factor of 20 slower than the current dictionary lookup.
  The index generation takes 37s for 2 * 570k revisions (manifest +
  changelog). Further testing to measure the incremental cache update cost
  is necessary, since initial testing shows a negative cost overall. It
  can be estimated to be bound by ~65ms on the test platform.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D9878

AFFECTED FILES
  mercurial/cext/revlog.c
  mercurial/changelog.py
  mercurial/localrepo.py
  mercurial/pure/parsers.py
  mercurial/revlog.py
  mercurial/utils/phash_reader.py
  mercurial/utils/phash_writer.py

CHANGE DETAILS




To: joerg.sonnenberger, indygreg, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/mercurial/utils/phash_writer.py b/mercurial/utils/phash_writer.py
new file mode 100644
--- /dev/null
+++ b/mercurial/utils/phash_writer.py
@@ -0,0 +1,104 @@ 
+import struct
+
+be32 = struct.Struct('>I')
+
+
+def default_hash(key, seed):
+    base = seed * 4
+    return [
+        be32.unpack(key[base : base + 4])[0],
+        be32.unpack(key[base + 4 : base + 8])[0],
+        be32.unpack(key[base + 8 : base + 12])[0],
+    ]
+
+
+class hashtable:
+    def __init__(self, hash=default_hash):
+        self.entries = []
+        self.hash = hash
+
+    def insert(self, key):
+        self.entries.append(key)
+
+    def _hash_entry(self, nv, seed, entry):
+        hash = [x % nv for x in self.hash(entry, seed)]
+        if hash[0] == hash[1]:
+            hash[1] ^= 1
+        if hash[0] == hash[2] or hash[1] == hash[2]:
+            hash[2] ^= 1
+        if hash[0] == hash[2] or hash[1] == hash[2]:
+            hash[2] ^= 2
+
+        return hash
+
+    def _compute_graph(self, ne, nv, seed):
+        edges = []
+        vertices_edges = [0] * nv
+        vertices_degrees = [0] * nv
+
+        for i in range(ne):
+            e = self._hash_entry(nv, seed, self.entries[i])
+            edges.append(e)
+            for v in e:
+                vertices_edges[v] ^= i
+                vertices_degrees[v] += 1
+
+        output_order = []
+
+        def remove_vertex(v):
+            if vertices_degrees[v] != 1:
+                return
+            e = vertices_edges[v]
+            output_order.append(e)
+            for v in edges[e]:
+                vertices_edges[v] ^= e
+                vertices_degrees[v] -= 1
+
+        for v in range(nv):
+            remove_vertex(v)
+        oi = 0
+        while oi < len(output_order):
+            for v in edges[output_order[oi]]:
+                remove_vertex(v)
+            oi += 1
+
+        if len(output_order) == ne:
+            return edges, output_order
+        else:
+            return None
+
+    def write(self):
+        seed = 0
+        ne = len(self.entries)
+        nv = max(128, ne * 124 // 100)
+        nv = (nv | 3) + 1
+
+        rv = None
+        seed = -1
+        while rv is None:
+            seed += 1
+            rv = self._compute_graph(ne, nv, seed)
+        edges, output_order = rv
+
+        g = [0] * nv
+        visited = [False] * nv
+        for idx in reversed(output_order):
+            edge = edges[idx]
+            if not visited[edge[0]]:
+                g[edge[0]] = (idx - g[edge[1]] - g[edge[2]]) % ne
+            elif not visited[edge[1]]:
+                g[edge[1]] = (idx - g[edge[0]] - g[edge[2]]) % ne
+            else:
+                g[edge[2]] = (idx - g[edge[1]] - g[edge[0]]) % ne
+            for v in edge:
+                visited[v] = True
+
+        output = bytearray(12 + 4 * len(g))
+        be32.pack_into(output, 0, seed)
+        be32.pack_into(output, 4, nv)
+        be32.pack_into(output, 8, ne)
+        pos = 12
+        for v in g:
+            be32.pack_into(output, pos, v)
+            pos += 4
+        return output
diff --git a/mercurial/utils/phash_reader.py b/mercurial/utils/phash_reader.py
new file mode 100644
--- /dev/null
+++ b/mercurial/utils/phash_reader.py
@@ -0,0 +1,39 @@ 
+import struct
+
+be32 = struct.Struct('>I')
+
+
+def default_hash(key, seed):
+    base = seed * 4
+    return [
+        be32.unpack(key[base : base + 4])[0],
+        be32.unpack(key[base + 4 : base + 8])[0],
+        be32.unpack(key[base + 8 : base + 12])[0],
+    ]
+
+
+class hashtable:
+    def __init__(self, data, hash=default_hash):
+        self.data = data
+        (self.seed,) = be32.unpack(data[0:4])
+        (self.m,) = be32.unpack(data[4:8])
+        (self.n,) = be32.unpack(data[8:12])
+
+    def get(self, node):
+        base = self.seed * 4
+        h0 = be32.unpack(node[base : base + 4])[0] % self.m
+        h1 = be32.unpack(node[base + 4 : base + 8])[0] % self.m
+        h2 = be32.unpack(node[base + 8 : base + 12])[0] % self.m
+
+        if h0 == h1:
+            h1 ^= 1
+        if h0 == h2 or h1 == h2:
+            h2 ^= 1
+        if h0 == h2 or h1 == h2:
+            h2 ^= 2
+
+        def g(idx):
+            idx = 12 + 4 * idx
+            return be32.unpack(self.data[idx : idx + 4])[0]
+
+        return (g(h0 % self.m) + g(h1 % self.m) + g(h2 % self.m)) % self.n
diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -227,6 +227,61 @@ 
 indexformatv0_unpack = indexformatv0.unpack
 
 
+class WrapperIndex(object):
+    def __init__(self, real_index, pphash, sphash):
+        self.__real_index = real_index
+        self.__pphash = pphash
+        self.__sphash = sphash
+
+    def has_node(self, node):
+        if self.__pphash and self.__pphash.get(node) is not None:
+            return True
+        if self.__sphash and self.__sphash.get(node) is not None:
+            return True
+        return self.__real_index.has_node(node)
+
+    def rev(self, node):
+        if self.__pphash:
+            rev = self.__pphash.get(node)
+            if rev is not None:
+                return rev
+        if self.__sphash:
+            rev = self.__sphash.get(node)
+            if rev is not None:
+                return rev
+        return self.__real_index.rev(node)
+
+    def get_rev(self, node):
+        if self.__pphash:
+            rev = self.__pphash.get(node)
+            if rev is not None:
+                return rev
+        if self.__sphash:
+            rev = self.__sphash.get(node)
+            if rev is not None:
+                return rev
+        return self.__real_index.get_rev(node)
+
+    def _stripnodes(self, start):
+        return self.__real_index.stripnodes(start)
+
+    def clearcaches(self):
+        return self.__real_index.clearcaches()
+
+    def __len__(self):
+        return self.__real_index.__len__()
+        return self._lgt + len(self._extra)
+
+    def append(self, tup):
+        return self.__real_index.append(tup)
+
+    def _check_index(self, i):
+        return self.__real_index._check_index(i)
+
+    def __getitem__(self, i):
+        return self.__real_index.__getitem__(i)
+
+
 class revlogoldindex(list):
     @property
     def nodemap(self):
@@ -385,6 +440,15 @@ 
         return rustrevlog.MixedIndex(index), cache
 
 
+def indexfile_sibling(opener, path, ext):
+    if not path.endswith(b".a"):
+        return path[:-2] + ext
+    pending_path = path[:-4] + ext + b".a"
+    if opener.exists(pending_path):
+        return pending_path
+    return path[:-4] + ext
+
+
 class revlog(object):
     """
     the underlying revision storage object
@@ -448,14 +512,12 @@ 
         self.datafile = datafile or (indexfile[:-2] + b".d")
         self.nodemap_file = None
         if persistentnodemap:
-            if indexfile.endswith(b'.a'):
-                pending_path = indexfile[:-4] + b".n.a"
-                if opener.exists(pending_path):
-                    self.nodemap_file = pending_path
-                else:
-                    self.nodemap_file = indexfile[:-4] + b".n"
-            else:
-                self.nodemap_file = indexfile[:-2] + b".n"
+            self.nodemap_file = indexfile_sibling(opener, indexfile, b".n")
+
+        self._pphash_file = indexfile_sibling(opener, indexfile, b".pphf")
+        self._pphash = None
+        self._sphash_file = indexfile_sibling(opener, indexfile, b".sphf")
+        self._sphash = None
 
         self.opener = opener
         #  When True, indexfile is opened with checkambig=True at writing, to
@@ -670,6 +732,18 @@ 
                         # no changelog tampering
                         self._nodemap_docket = docket
                         index.update_nodemap_data(*nodemap_data)
+            if not self._inline:
+                if self.opener.exists(self._pphash_file):
+                    f = self.opener.open(self._pphash_file)
+                    data = util.mmapread(f)
+                    self._pphash = parsers.phash(data, d[0])
+                if self.opener.exists(self._sphash_file):
+                    f = self.opener.open(self._sphash_file)
+                    data = util.mmapread(f)
+                    self._sphash = parsers.phash(data, d[0])
+                if self._pphash or self._sphash:
+                    d = WrapperIndex(d[0], self._pphash, self._sphash), d[1]
+
         except (ValueError, IndexError):
             raise error.RevlogError(
                 _(b"index %s is corrupted") % self.indexfile
@@ -780,12 +854,52 @@ 
             return False
         return True
 
+    phash_update_delta = 1000
+
+    def _update_pphash(self, transaction):
+        if self._pphash and transaction:
+            if (
+                transaction.changes[b'origrepolen'] // self.phash_update_delta
+                == len(self) // self.phash_update_delta
+            ):
+                return
+        elif len(self) < self.phash_update_delta:
+            return
+        from .utils import phash_writer
+
+        writer = phash_writer.hashtable()
+        for rev in pycompat.xrange(
+            len(self) // self.phash_update_delta * self.phash_update_delta
+        ):
+            writer.insert(self.node(rev))
+        data = writer.write()
+        f = self.opener(self._pphash_file, atomictemp=True, mode=b'wb')
+        f.write(data)
+        f.close()
+
+    def _update_sphash(self, transaction):
+        from .utils import phash_writer
+
+        writer = phash_writer.hashtable()
+        for rev in pycompat.xrange(
+            len(self) // self.phash_update_delta * self.phash_update_delta,
+            len(self),
+        ):
+            writer.insert(self.node(rev))
+        data = writer.write()
+        f = self.opener(self._sphash_file, atomictemp=True, mode=b'wb')
+        f.write(data)
+        f.close()
+
     def update_caches(self, transaction):
         if self.nodemap_file is not None:
             if transaction is None:
                 nodemaputil.update_persistent_nodemap(self)
             else:
                 nodemaputil.setup_persistent_nodemap(transaction, self)
+        if self._pphash_file and not self._inline:
+            self._update_pphash(transaction)
+            self._update_sphash(transaction)
 
     def clearcaches(self):
         self._revisioncache = None
diff --git a/mercurial/pure/parsers.py b/mercurial/pure/parsers.py
--- a/mercurial/pure/parsers.py
+++ b/mercurial/pure/parsers.py
@@ -284,3 +284,24 @@ 
         write(e)
         write(f)
     return cs.getvalue()
+
+
+from ..node import nullid, nullrev
+from ..utils import phash_reader
+
+be32 = struct.Struct('>I')
+
+
+class phash:
+    def __init__(self, data, revlogindex):
+        self.hashtable = phash_reader.hashtable(data)
+        self.revlogindex = revlogindex
+
+    def get(self, node):
+        if node == nullid:
+            return nullrev
+
+        rev = self.hashtable.get(node)
+        if self.revlogindex[rev][7] == node:
+            return rev
+        return None
diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py
--- a/mercurial/localrepo.py
+++ b/mercurial/localrepo.py
@@ -2699,12 +2699,12 @@ 
             self.filtered(b'served').branchmap()
             self.filtered(b'served.hidden').branchmap()
 
+        self.changelog.update_caches(transaction=tr)
+        self.manifestlog.update_caches(transaction=tr)
+
         if full:
             unfi = self.unfiltered()
 
-            self.changelog.update_caches(transaction=tr)
-            self.manifestlog.update_caches(transaction=tr)
-
             rbc = unfi.revbranchcache()
             for r in unfi.changelog:
                 rbc.branchinfo(r)
diff --git a/mercurial/changelog.py b/mercurial/changelog.py
--- a/mercurial/changelog.py
+++ b/mercurial/changelog.py
@@ -12,6 +12,7 @@ 
     bin,
     hex,
     nullid,
+    nullrev,
 )
 from .thirdparty import attr
 
diff --git a/mercurial/cext/revlog.c b/mercurial/cext/revlog.c
--- a/mercurial/cext/revlog.c
+++ b/mercurial/cext/revlog.c
@@ -2837,6 +2837,127 @@ 
 	return NULL;
 }
 
+typedef struct {
+	PyObject_HEAD
+	indexObject *idx;
+	int with_buffer;
+	Py_buffer buf;
+	uint32_t m, n, seed;
+} phashObject;
+
+static int phash_init(phashObject *self, PyObject *args)
+{
+	if (!PyArg_ParseTuple(args, "y*O!", &self->buf, &HgRevlogIndex_Type, &self->idx))
+		return -1;
+	self->with_buffer = 1;
+	if (!PyBuffer_IsContiguous(&self->buf, 'C') || self->buf.ndim > 1 || self->buf.len < 12)
+		return -1;
+	self->seed = getbe32(self->buf.buf + 0);
+	self->m = getbe32(self->buf.buf + 4);
+	self->n = getbe32(self->buf.buf + 8);
+	if (self->m == 0 || self->buf.len % 4 != 0 || (self->buf.len - 12) / 4 < self->m)
+		return -1;
+	return 0;
+}
+
+static void phash_dealloc(phashObject *self)
+{
+	if (self->with_buffer)
+		PyBuffer_Release(&self->buf);
+}
+
+static PyObject *phash_get(phashObject *self, PyObject *args)
+{
+	const char *node, *nodedata;
+	Py_ssize_t nodelen;
+	uint32_t h0, h1, h2, rev;
+	if (!PyArg_ParseTuple(args, "y#", &node, &nodelen))
+		Py_RETURN_NONE;
+	if (self->n == 0 || nodelen < 12 || nodelen != self->idx->nodelen)
+		Py_RETURN_NONE;
+
+	if (!memcmp(node, nullid, nodelen))
+		return PyLong_FromLong(-1);
+
+	if (self->seed + 12 > nodelen)
+		Py_RETURN_NONE;
+
+	h0 = getbe32(node + self->seed);
+	h1 = getbe32(node + self->seed + 4);
+	h2 = getbe32(node + self->seed + 8);
+
+	h0 %= self->m;
+	h1 %= self->m;
+	h2 %= self->m;
+
+	h1 ^= (h0 == h1);
+	h2 ^= (h0 == h2 || h1 == h2);
+	h2 ^= 2 * (h0 == h2 || h1 == h2);
+
+	h0 = getbe32(self->buf.buf + 12 + 4 * h0);
+	h1 = getbe32(self->buf.buf + 12 + 4 * h1);
+	h2 = getbe32(self->buf.buf + 12 + 4 * h2);
+
+	rev = (h0 + h1 + h2) % self->n;
+	if (rev >= self->idx->length)
+		Py_RETURN_NONE;
+
+	nodedata = index_deref(self->idx, rev);
+	if (!nodedata)
+		Py_RETURN_NONE;
+
+	if (memcmp(nodedata + 32, node, nodelen))
+		Py_RETURN_NONE;
+
+	return PyLong_FromUnsignedLong(rev);
+}
+
+static PyMethodDef phash_methods[] = {
+    {"get", (PyCFunction)phash_get, METH_VARARGS,
+     "find candidate revision for given node"},
+    {NULL}
+};
+
+static PyTypeObject phashType = {
+    PyVarObject_HEAD_INIT(NULL, 0) /* header */
+    "parsers.phash",               /* tp_name */
+    sizeof(phashObject),           /* tp_basicsize */
+    0,                             /* tp_itemsize */
+    (destructor)phash_dealloc,     /* tp_dealloc */
+    0,                             /* tp_print */
+    0,                             /* tp_getattr */
+    0,                             /* tp_setattr */
+    0,                             /* tp_compare */
+    0,                             /* tp_repr */
+    0,                             /* tp_as_number */
+    0,                             /* tp_as_sequence */
+    0,                             /* tp_as_mapping */
+    0,                             /* tp_hash */
+    0,                             /* tp_call */
+    0,                             /* tp_str */
+    0,                             /* tp_getattro */
+    0,                             /* tp_setattro */
+    0,                             /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,            /* tp_flags */
+    "phash",                       /* tp_doc */
+    0,                             /* tp_traverse */
+    0,                             /* tp_clear */
+    0,                             /* tp_richcompare */
+    0,                             /* tp_weaklistoffset */
+    0,                             /* tp_iter */
+    0,                             /* tp_iternext */
+    phash_methods,                 /* tp_methods */
+    0,                             /* tp_members */
+    0,                             /* tp_getset */
+    0,                             /* tp_base */
+    0,                             /* tp_dict */
+    0,                             /* tp_descr_get */
+    0,                             /* tp_descr_set */
+    0,                             /* tp_dictoffset */
+    (initproc)phash_init,          /* tp_init */
+    0,                             /* tp_alloc */
+};
+
 static Revlog_CAPI CAPI = {
     /* increment the abi_version field upon each change in the Revlog_CAPI
        struct or in the ABI of the listed functions */
@@ -2861,6 +2982,12 @@ 
 	Py_INCREF(&nodetreeType);
 	PyModule_AddObject(mod, "nodetree", (PyObject *)&nodetreeType);
 
+	phashType.tp_new = PyType_GenericNew;
+	if (PyType_Ready(&phashType) < 0)
+		return;
+	Py_INCREF(&phashType);
+	PyModule_AddObject(mod, "phash", (PyObject *)&phashType);
+
 	caps = PyCapsule_New(&CAPI, "mercurial.cext.parsers.revlog_CAPI", NULL);
 	if (caps != NULL)
 		PyModule_AddObject(mod, "revlog_CAPI", caps);