Patchwork [1,of,4,lazy-manifest,v2] manifest.c: new extension code to lazily parse manifests

login
register
mail settings
Submitter Augie Fackler
Date Jan. 12, 2015, 8:04 p.m.
Message ID <b9d67bf3ea8196c8530a.1421093086@augie-macbookair2.roam.corp.google.com>
Download mbox | patch
Permalink /patch/7439/
State Superseded
Commit a5f1bccd2996e76d1d9de82c37129078151e240d
Headers show

Comments

Augie Fackler - Jan. 12, 2015, 8:04 p.m.
# HG changeset patch
# User Augie Fackler <augie@google.com>
# Date 1420871231 28800
#      Fri Jan 09 22:27:11 2015 -0800
# Node ID b9d67bf3ea8196c8530afdcf6b1727682b681259
# Parent  678f53865c6860a950392691814766957ee89316
manifest.c: new extension code to lazily parse manifests

This lets us iterate manifests in order, but do a _lot_ less work in
the common case when we only care about a few manifest entries.

Many thanks to Mike Edgar for reviewing this in advance of it going
out to the list, which caught many things I missed.

This version of the patch includes C89 fixes from Sean Farley and
many correctness/efficiency cleanups from Martin von
Zweigbergk. Thanks to both!
Martin von Zweigbergk - Jan. 13, 2015, 12:42 a.m.
I've pushed another set of commits to https://bitbucket.org/martinvonz/hg.
See if you want anything before freeze or if you want to wait with them all
until after.

Other than those comments/commits, it looks good to me.

I was surprised at first that the tuples return by __getitem__ are not
cached once created. Did you experiment with that? If not, it seems like
this version is faster than the pure Python version anyway, so I suppose
such experimentation can be left for later.

On Mon Jan 12 2015 at 12:05:17 PM Augie Fackler <raf@durin42.com> wrote:

> # HG changeset patch
> # User Augie Fackler <augie@google.com>
> # Date 1420871231 28800
> #      Fri Jan 09 22:27:11 2015 -0800
> # Node ID b9d67bf3ea8196c8530afdcf6b1727682b681259
> # Parent  678f53865c6860a950392691814766957ee89316
> manifest.c: new extension code to lazily parse manifests
>
> This lets us iterate manifests in order, but do a _lot_ less work in
> the common case when we only care about a few manifest entries.
>
> Many thanks to Mike Edgar for reviewing this in advance of it going
> out to the list, which caught many things I missed.
>
> This version of the patch includes C89 fixes from Sean Farley and
> many correctness/efficiency cleanups from Martin von
> Zweigbergk. Thanks to both!
>
> diff --git a/mercurial/manifest.c b/mercurial/manifest.c
> new file mode 100644
> --- /dev/null
> +++ b/mercurial/manifest.c
> @@ -0,0 +1,865 @@
> +/*
> + * manifest.c - manifest type that does on-demand parsing.
> + *
> + * Copyright 2015, Google Inc.
> + *
> + * This software may be used and distributed according to the terms of
> + * the GNU General Public License, incorporated herein by reference.
> + */
> +#include <assert.h>
> +#include <string.h>
> +#include <stdlib.h>
> +
> +#include <Python.h>
> +
> +/* VC9 doesn't include bool and lacks stdbool.h based on my searching */
> +#ifdef _MSC_VER
> +#define true 1
> +#define false 0
> +typedef unsigned char bool;
> +#else
> +#include <stdbool.h>
> +#endif
> +
> +#define DEFAULT_LINES 100000
> +
> +typedef struct {
> +       char *start;
> +       Py_ssize_t len; /* length of line not including terminal newline */
> +       char hash_suffix;
> +       bool from_malloc;
> +       bool deleted;
> +} line;
> +
> +typedef struct {
> +       PyObject_HEAD
> +       PyObject *pydata;
> +       line *lines;
> +       int numlines; /* number of line entries */
> +       int livelines; /* number of non-deleted lines */
> +       int maxlines; /* allocated number of lines */
> +       bool dirty;
> +} lazymanifest;
> +
> +#define MANIFEST_OOM -1
> +#define MANIFEST_NOT_SORTED -2
> +#define MANIFEST_MALFORMED -3
> +
> +/* defined in parsers.c */
> +PyObject *unhexlify(const char *str, int len);
> +
> +/* get the length of the path for a line */
> +static size_t pathlen(line *l) {
> +       return strnlen(l->start, l->len);
> +}
> +
> +/* get the node value of a single line */
> +static PyObject *nodeof(line *l) {
> +       char *s = l->start;
> +       ssize_t llen = pathlen(l);
> +       PyObject *hash = unhexlify(s + llen + 1, 40);
> +       if (!hash) {
> +               return NULL;
> +       }
> +       if (l->hash_suffix != '\0') {
> +               char newhash[21];
> +               memcpy(newhash, PyString_AsString(hash), 20);
> +               Py_DECREF(hash);
> +               newhash[20] = l->hash_suffix;
> +               hash = PyString_FromStringAndSize(newhash, 21);
> +       }
> +       return hash;
> +}
> +
> +/* get the node hash and flags of a line as a tuple */
> +static PyObject *hashflags(line *l)
> +{
> +       char *s = l->start;
> +       size_t plen = pathlen(l);
> +       PyObject *hash = nodeof(l);
> +
> +       /* 40 for hash, 1 for null byte, 1 for newline */
> +       size_t hplen = plen + 42;
> +       Py_ssize_t flen = l->len - hplen;
> +       PyObject *flags;
> +       PyObject *tup;
> +
> +       if (!hash)
> +               return NULL;
> +       if (flen > 0) {
> +               flags = PyString_FromStringAndSize(s + hplen - 1, flen);
> +       } else {
> +               flags = PyString_FromString("");
> +       }
> +       if (!flags) {
> +               Py_DECREF(hash);
> +               return NULL;
> +       }
> +       tup = PyTuple_Pack(2, hash, flags);
> +       Py_DECREF(flags);
> +       Py_DECREF(hash);
> +       return tup;
> +}
> +
> +/* if we're about to run out of space in the line index, add more */
> +static bool realloc_if_full(lazymanifest *self)
> +{
> +       if (self->numlines == self->maxlines) {
> +               self->maxlines *= 2;
> +               self->lines = realloc(self->lines, self->maxlines *
> sizeof(line));
> +       }
> +       return self->lines;
> +}
> +
> +/*
> + * Find the line boundaries in the manifest that 'data' points to and
> store
> + * information about each line in 'self'.
> + */
> +static int find_lines(lazymanifest *self, char *data, Py_ssize_t len)
> +{
> +       char *prev = NULL;
> +       line *l = NULL;
> +       while (len > 0) {
> +               char *next = memchr(data, '\n', len);
> +               if (!next) {
> +                       return MANIFEST_MALFORMED;
> +               }
> +               next++; /* advance past newline */
> +               if (!realloc_if_full(self)) {
> +                       return MANIFEST_OOM; /* no memory */
> +               }
> +               if (prev && strcmp(prev, data) > -1) {
> +                       /* This data isn't sorted, so we have to abort. */
> +                       return MANIFEST_NOT_SORTED;
> +               }
> +               l = self->lines + ((self->numlines)++);
> +               l->start = data;
> +               l->len = next - data;
> +               l->hash_suffix = '\0';
> +               l->from_malloc = false;
> +               l->deleted = false;
> +               len = len - l->len;
> +               prev = data;
> +               data = next;
> +       }
> +       self->livelines = self->numlines;
> +       return 0;
> +}
> +
> +static int lazymanifest_init(lazymanifest *self, PyObject *args)
> +{
> +       char *data;
> +       Py_ssize_t len;
> +       int err, ret;
> +       PyObject *pydata;
> +       if (!PyArg_ParseTuple(args, "S", &pydata)) {
> +               return -1;
> +       }
> +       err = PyString_AsStringAndSize(pydata, &data, &len);
> +
> +       self->dirty = false;
> +       if (err == -1)
> +               return -1;
> +       self->pydata = pydata;
> +       Py_INCREF(self->pydata);
> +       Py_BEGIN_ALLOW_THREADS
> +       self->lines = malloc(DEFAULT_LINES * sizeof(line));
> +       self->maxlines = DEFAULT_LINES;
> +       self->numlines = 0;
> +       if (!self->lines)
> +               ret = -2;
> +       else
> +               ret = find_lines(self, data, len);
> +       Py_END_ALLOW_THREADS
> +       switch (ret) {
> +       case 0:
> +               break;
> +       case MANIFEST_OOM:
> +               PyErr_NoMemory();
> +               break;
> +       case MANIFEST_NOT_SORTED:
> +               PyErr_Format(PyExc_ValueError,
> +                            "Manifest lines not in sorted order.");
> +               break;
> +       case MANIFEST_MALFORMED:
> +               PyErr_Format(PyExc_ValueError,
> +                            "Manifest did not end in a newline.");
> +               break;
> +       default:
> +               PyErr_Format(PyExc_ValueError,
> +                            "Unknown problem parsing manifest.");
> +       }
> +       return ret == 0 ? 0 : -1;
> +}
> +
> +static void lazymanifest_dealloc(lazymanifest *self)
> +{
> +       /* free any extra lines we had to allocate */
> +       int i;
> +       for (i = 0; i < self->numlines; i++) {
> +               if (self->lines[i].from_malloc) {
> +                       free(self->lines[i].start);
> +               }
> +       }
> +       if (self->lines) {
> +               free(self->lines);
> +               self->lines = NULL;
> +       }
> +       if (self->pydata) {
> +               Py_DECREF(self->pydata);
> +               self->pydata = NULL;
> +       }
> +       PyObject_Del(self);
> +}
> +
> +/* iteration support */
> +
> +typedef struct {
> +       PyObject_HEAD lazymanifest *m;
> +       Py_ssize_t pos;
> +} lmIter;
> +
> +static void lmiter_dealloc(PyObject *o)
> +{
> +       lmIter *self = (lmIter *)o;
> +       Py_DECREF(self->m);
> +       PyObject_Del(self);
> +}
> +
> +static PyObject *lmiter_iternext(PyObject *o)
> +{
> +       size_t pl;
> +       line *l;
> +       Py_ssize_t consumed;
> +       PyObject *ret = NULL, *path = NULL, *hash = NULL, *flags = NULL;
> +       lmIter *self = (lmIter *)o;
> +       Py_ssize_t index = self->pos++;
> +       /* skip over deleted manifest entries */
> +       while (index < self->m->numlines &&
> +              self->m->lines[index].deleted) {
> +               index = ++(self->pos);
> +       }
> +       if (self->m->numlines <= index) {
> +               goto bail;
> +       }
> +       l = &(self->m->lines[index]);
> +       pl = pathlen(l);
> +       path = PyString_FromStringAndSize(l->start, pl);
> +       hash = nodeof(l);
> +       consumed = pl + 41;
> +       if (l->len > (consumed + 1)) {
> +               flags = PyString_FromStringAndSize(l->start + consumed,
> +                                                  l->len - consumed - 1);
> +       } else {
> +               flags = PyString_FromString("");
> +
> +       }
> +       if (!flags) {
> +               goto bail;
> +       }
> +       ret = PyTuple_Pack(3, path, hash, flags);
> + bail:
> +       Py_XDECREF(path);
> +       Py_XDECREF(hash);
> +       Py_XDECREF(flags);
> +       return ret;
> +}
> +
> +static PyTypeObject lazymanifestIterator = {
> +       PyObject_HEAD_INIT(NULL)
> +       0,                               /*ob_size */
> +       "parsers.lazymanifest.iterator", /*tp_name */
> +       sizeof(lmIter),                  /*tp_basicsize */
> +       0,                               /*tp_itemsize */
> +       lmiter_dealloc,                  /*tp_dealloc */
> +       0,                               /*tp_print */
> +       0,                               /*tp_getattr */
> +       0,                               /*tp_setattr */
> +       0,                               /*tp_compare */
> +       0,                               /*tp_repr */
> +       0,                               /*tp_as_number */
> +       0,                               /*tp_as_sequence */
> +       0,                               /*tp_as_mapping */
> +       0,                               /*tp_hash */
> +       0,                               /*tp_call */
> +       0,                               /*tp_str */
> +       0,                               /*tp_getattro */
> +       0,                               /*tp_setattro */
> +       0,                               /*tp_as_buffer */
> +       /* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to
> +          use tp_iter and tp_iternext fields. */
> +       Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER,
> +       "Iterator for a lazymanifest.",  /* tp_doc */
> +       0,                               /* tp_traverse */
> +       0,                               /* tp_clear */
> +       0,                               /* tp_richcompare */
> +       0,                               /* tp_weaklistoffset */
> +       PyObject_SelfIter,               /* tp_iter: __iter__() method */
> +       lmiter_iternext,                 /* tp_iternext: next() method */
> +};
> +
> +static lazymanifest *lazymanifest_copy(
> +       lazymanifest *self, PyObject *unused_args);
> +
> +static PyObject *lazymanifest_getiter(lazymanifest *self)
> +{
> +       lmIter *i = NULL;
> +       lazymanifest *t = lazymanifest_copy(self, NULL);
> +       if (!t) {
> +               PyErr_NoMemory();
> +               return NULL;
> +       }
> +       i = PyObject_New(lmIter, &lazymanifestIterator);
> +       if (i) {
> +               i->m = t;
> +               i->pos = 0;
> +       } else {
> +               Py_DECREF(t);
> +               PyErr_NoMemory();
> +       }
> +       return (PyObject *)i;
> +}
> +
> +/* __getitem__ and __setitem__ support */
> +
> +static Py_ssize_t lazymanifest_size(lazymanifest *self)
> +{
> +       return self->livelines;
> +}
> +
> +static int linecmp(const void *left, const void *right)
> +{
> +       return strcmp(((const line *)left)->start,
> +                     ((const line *)right)->start);
> +}
> +
> +static PyObject *lazymanifest_getitem(lazymanifest *self, PyObject *key)
> +{
> +       line needle;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "getitem: manifest keys must be a string.");
> +               return NULL;
> +       }
> +       needle.start = PyString_AsString(key);
> +       needle.len = 0;
> +       hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
> +                     &linecmp);
> +       if (!hit || hit->deleted) {
> +               PyErr_Format(PyExc_KeyError, "No such manifest entry.");
> +               return NULL;
> +       }
> +       return hashflags(hit);
> +}
> +
> +static int lazymanifest_delitem(lazymanifest *self, PyObject *key)
> +{
> +       line needle;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "delitem: manifest keys must be a string.");
> +               return -1;
> +       }
> +       needle.start = PyString_AsString(key);
> +       needle.len = 0;
> +       hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
> +                     &linecmp);
> +       if (!hit || hit->deleted) {
> +               PyErr_Format(PyExc_KeyError,
> +                            "Tried to delete nonexistent manifest
> entry.");
> +               return -1;
> +       }
> +       self->dirty = true;
> +       hit->deleted = true;
> +       self->livelines--;
> +       return 0;
> +}
> +
> +static int lazymanifest_setitem(
> +       lazymanifest *self, PyObject *key, PyObject *value)
> +{
> +       char *path;
> +       Py_ssize_t plen;
> +       PyObject *pyhash;
> +       Py_ssize_t hlen;
> +       char *hash;
> +       char hashextra;
> +       PyObject *pyflags;
> +       char *flags;
> +       Py_ssize_t flen;
> +       size_t dlen;
> +       char *dest;
> +       int i;
> +       line new;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "setitem: manifest keys must be a string.");
> +               return -1;
> +       }
> +       if (!value) {
> +               return lazymanifest_delitem(self, key);
> +       }
> +       if (!PyTuple_Check(value) || PyTuple_Size(value) != 2) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "Manifest values must be a tuple of (node,
> flags).");
> +               return -1;
> +       }
> +       if (PyString_AsStringAndSize(key, &path, &plen) == -1) {
> +               return -1;
> +       }
> +
> +       pyhash = PyTuple_GetItem(value, 0);
> +       if (!PyString_Check(pyhash)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "node must be a 20-byte string");
> +               return -1;
> +       }
> +       hlen = PyString_Size(pyhash);
> +       /* Some parts of the codebase try and set 21 or 22
> +        * byte "hash" values in order to perturb things for
> +        * status. We have to preserve at least the 21st
> +        * byte. Sigh. If there's a 22nd byte, we drop it on
> +        * the floor, which works fine.
> +        */
> +       if (hlen != 20 && hlen != 21 && hlen != 22) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "node must be a 20-byte string");
> +               return -1;
> +       }
> +       hash = PyString_AsString(pyhash);
> +       hashextra = '\0';
> +       if (hlen > 20) {
> +               hashextra = hash[20];
> +       }
> +
> +       pyflags = PyTuple_GetItem(value, 1);
> +       if (!PyString_Check(pyflags) || PyString_Size(pyflags) > 1) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "flags must a 0 or 1 byte string");
> +               return -1;
> +       }
> +       if (PyString_AsStringAndSize(pyflags, &flags, &flen) == -1) {
> +               return -1;
> +       }
> +       /* 3 -> two null bytes and one newline */
> +       dlen = plen + 40 + flen + 3;
> +       dest = malloc(dlen);
> +       if (!dest) {
> +               PyErr_NoMemory();
> +               return -1;
> +       }
> +       memcpy(dest, path, plen + 1);
> +       for (i = 0; i < 20; i++) {
> +               sprintf(dest + plen + 1 + (i * 2), "%02hhx", hash[i]);
> +       }
> +       sprintf(dest + plen + 41, "%s\n", flags);
> +       new.start = dest;
> +       new.len = dlen - 1;
> +       new.hash_suffix = hashextra;
> +       new.from_malloc = true;     /* is `start` a pointer we allocated?
> */
> +       new.deleted = false;        /* is this entry deleted? */
> +       hit = bsearch(&new, self->lines, self->numlines,
> +                     sizeof(line), &linecmp);
> +       self->dirty = true;
> +       if (hit) {
> +               /* updating a line we already had */
> +               if (hit->from_malloc) {
> +                       free(hit->start);
> +               }
> +               if (hit->deleted) {
> +                       self->livelines++;
> +               }
> +               *hit = new;
> +       } else {
> +               /* new line */
> +               if (!realloc_if_full(self)) {
> +                       PyErr_NoMemory();
> +                       return -1;
> +               }
> +               self->lines[self->numlines++] = new;
> +               self->livelines++;
> +               /* TODO: do a binary search and insert rather than
> +                * append and qsort. Also offer a batch-insert
> +                * interface, which should be a nice little
> +                * performance win.
> +                */
> +               qsort(self->lines, self->numlines, sizeof(line), &linecmp);
> +       }
> +       return 0;
> +}
> +
> +static PyMappingMethods lazymanifest_mapping_methods = {
> +       (lenfunc)lazymanifest_size,             /* mp_length */
> +       (binaryfunc)lazymanifest_getitem,       /* mp_subscript */
> +       (objobjargproc)lazymanifest_setitem,    /* mp_ass_subscript */
> +};
> +
> +/* sequence methods (important or __contains__ builds an iterator */
> +
> +static int lazymanifest_contains(lazymanifest *self, PyObject *key)
> +{
> +       line needle;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "contains: manifest keys must be a string.");
> +               return 0;
> +       }
> +       needle.start = PyString_AsString(key);
> +       needle.len = 0;
> +       hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
> +                     &linecmp);
> +       if (!hit || hit->deleted) {
> +               return 0;
> +       }
> +       return 1;
> +}
> +
> +static PySequenceMethods lazymanifest_seq_meths = {
> +       (lenfunc)lazymanifest_size, /* sq_length */
> +       0, /* sq_concat */
> +       0, /* sq_repeat */
> +       0, /* sq_item */
> +       0, /* sq_slice */
> +       0, /* sq_ass_item */
> +       0, /* sq_ass_slice */
> +       (objobjproc)lazymanifest_contains, /* sq_contains */
> +       0, /* sq_inplace_concat */
> +       0, /* sq_inplace_repeat */
> +};
> +
> +
> +/* Other methods (copy, diff, etc) */
> +static PyTypeObject lazymanifestType;
> +
> +/* If the manifest has changes, build the new manifest text and reindex
> it. */
> +static int compact(lazymanifest *self) {
> +       int i;
> +       ssize_t need = 0;
> +       char *data;
> +       ssize_t copied = 0;
> +       int real = 0;
> +       line *src, *dst;
> +       char *tofree;
> +       ssize_t c;
> +       PyObject *pydata;
> +       char *realdest;
> +       int err;
> +       Py_ssize_t offset;
> +       line new;
> +       if (!self->dirty)
> +               return 0;
> +       for (i = 0; i < self->numlines; i++) {
> +               if (!self->lines[i].deleted) {
> +                       need += self->lines[i].len;
> +               }
> +       }
> +       pydata = PyString_FromStringAndSize(NULL, need);
> +       if (!pydata)
> +               return -1;
> +       data = PyString_AsString(pydata);
> +       if (!data) {
> +               return -1;
> +       }
> +       src = self->lines;
> +       dst = self->lines;
> +       for (i = 0; i < self->numlines; i++, src++) {
> +               tofree = NULL;
> +               if (src->from_malloc) {
> +                       tofree = src->start;
> +               }
> +               if (!src->deleted) {
> +                       memcpy(data, src->start, src->len);
> +                       *dst = *src;
> +                       dst->start = data;
> +                       dst->from_malloc = false;
> +                       data += dst->len;
> +                       dst++;
> +               }
> +               free(tofree);
> +       }
> +       Py_DECREF(self->pydata);
> +       self->pydata = pydata;
> +       self->numlines = self->livelines;
> +       self->dirty = false;
> +       return 0;
> +}
> +
> +static PyObject *lazymanifest_text(lazymanifest *self, PyObject
> *unused_args)
> +{
> +       if (compact(self) != 0) {
> +               PyErr_NoMemory();
> +               return NULL;
> +       }
> +       Py_INCREF(self->pydata);
> +       return self->pydata;
> +}
> +
> +static lazymanifest *lazymanifest_copy(
> +       lazymanifest *self, PyObject *unused_args)
> +{
> +       lazymanifest *copy = NULL;
> +       if (compact(self) != 0) {
> +               goto nomem;
> +       }
> +       copy = PyObject_New(lazymanifest, &lazymanifestType);
> +       if (!copy) {
> +               goto nomem;
> +       }
> +       copy->numlines = self->numlines;
> +       copy->livelines = self->livelines;
> +       copy->dirty = false;
> +       copy->lines = malloc(self->maxlines *sizeof(line));
> +       if (!copy->lines) {
> +               goto nomem;
> +       }
> +       memcpy(copy->lines, self->lines, self->numlines * sizeof(line));
> +       copy->maxlines = self->maxlines;
> +       copy->pydata = self->pydata;
> +       Py_INCREF(copy->pydata);
> +       return copy;
> + nomem:
> +       PyErr_NoMemory();
> +       Py_XDECREF(copy);
> +       return NULL;
> +}
> +
> +static lazymanifest *lazymanifest_filtercopy(
> +       lazymanifest *self, PyObject *matchfn)
> +{
> +       lazymanifest *tmp;
> +       PyObject *arg;
> +       PyObject *arglist;
> +       PyObject *result;
> +       int i;
> +       if (!PyCallable_Check(matchfn)) {
> +               PyErr_SetString(PyExc_TypeError, "matchfn must be
> callable");
> +               return NULL;
> +       }
> +       /* compact ourselves first to avoid double-frees later when we
> +        * compact tmp so that it doesn't have random pointers to our
> +        * underlying data */
> +       compact(self);
> +       tmp = PyObject_New(lazymanifest, &lazymanifestType);
> +       tmp->lines = malloc(self->maxlines * sizeof(line));
> +       tmp->maxlines = self->maxlines;
> +       tmp->livelines = 0;
> +       tmp->numlines = 0;
> +       tmp->pydata = self->pydata;
> +       Py_INCREF(self->pydata);
> +       for (i = 0; i < self->numlines; i++) {
> +               arg = PyString_FromString(self->lines[i].start);
> +               arglist = PyTuple_Pack(1, arg);
> +               result = PyObject_CallObject(matchfn, arglist);
> +               /* if the callback raised an exception, just let it
> +                * through and give up */
> +               if (!result) {
> +                       free(tmp->lines);
> +                       tmp->lines = NULL;
> +                       Py_DECREF(self->pydata);
> +                       return NULL;
> +               }
> +               if (PyObject_IsTrue(result)) {
> +                       assert(!(self->lines[i].from_malloc));
> +                       tmp->lines[tmp->numlines++] = self->lines[i];
> +                       tmp->livelines++;
> +               }
> +               Py_DECREF(arglist);
> +               Py_DECREF(arg);
> +               Py_DECREF(result);
> +       }
> +       compact(tmp);
> +       return tmp;
> +}
> +
> +static PyObject *lazymanifest_diff(lazymanifest *self, PyObject *args)
> +{
> +       lazymanifest *other;
> +       PyObject *pyclean = NULL;
> +       bool clean;
> +       PyObject *emptyTup = NULL, *ret = NULL;
> +       PyObject *es;
> +       int sneedle = 0, oneedle = 0;
> +       line *left;
> +       line *right;
> +       int result;
> +       PyObject *key;
> +       PyObject *tup;
> +       PyObject *outer;
> +       PyObject *r;
> +       if (!PyArg_ParseTuple(args, "O!|O", &lazymanifestType, &other,
> &pyclean)) {
> +               return NULL;
> +       }
> +       clean = (!pyclean) ? false : PyObject_IsTrue(pyclean);
> +       es = PyString_FromString("");
> +       if (!es) {
> +               goto nomem;
> +       }
> +       emptyTup = PyTuple_Pack(2, Py_None, es);
> +       Py_DECREF(es);
> +       if (!emptyTup) {
> +               goto nomem;
> +       }
> +       ret = PyDict_New();
> +       if (!ret) {
> +               goto nomem;
> +       }
> +       while (sneedle != self->numlines || oneedle != other->numlines) {
> +               left = self->lines + sneedle;
> +               right = other->lines + oneedle;
> +               /* If we're looking at a deleted entry and it's not
> +                * the end of the manifest, just skip it. */
> +               if (left->deleted && sneedle < self->numlines) {
> +                       sneedle++;
> +                       continue;
> +               }
> +               if (right->deleted && oneedle < other->numlines) {
> +                       oneedle++;
> +                       continue;
> +               }
> +               /* if we're at the end of either manifest, then we
> +                * know the remaining items are adds so we can skip
> +                * the strcmp. */
> +               if (sneedle == self->numlines) {
> +                       result = 1;
> +               } else if (oneedle == other->numlines) {
> +                       result = -1;
> +               } else {
> +                       result = linecmp(left, right);
> +               }
> +               key = result <= 0 ?
> +                       PyString_FromString(left->start) :
> +                       PyString_FromString(right->start);
> +               if (!key)
> +                       goto nomem;
> +               if (result < 0) {
> +                       tup = hashflags(left);
> +                       if (!tup) {
> +                               goto nomem;
> +                       }
> +                       outer = PyTuple_Pack(2, tup, emptyTup);
> +                       Py_DECREF(tup);
> +                       if (!outer) {
> +                               goto nomem;
> +                       }
> +                       PyDict_SetItem(ret, key, outer);
> +                       Py_DECREF(outer);
> +                       sneedle++;
> +               } else if (result > 0) {
> +                       tup = hashflags(right);
> +                       if (!tup) {
> +                               goto nomem;
> +                       }
> +                       outer = PyTuple_Pack(2, emptyTup, tup);
> +                       Py_DECREF(tup);
> +                       if (!outer) {
> +                               goto nomem;
> +                       }
> +                       PyDict_SetItem(ret, key, outer);
> +                       Py_DECREF(outer);
> +                       oneedle++;
> +               } else {
> +                       /* file exists in both manifests */
> +                       if (left->len != right->len
> +                           || memcmp(left->start, right->start, left->len)
> +                           || left->hash_suffix != right->hash_suffix) {
> +                               PyObject *l = hashflags(left);
> +                               if (!l) {
> +                                       goto nomem;
> +                               }
> +                               r = hashflags(right);
> +                               if (!r) {
> +                                       Py_DECREF(l);
> +                                       goto nomem;
> +                               }
> +                               outer = PyTuple_Pack(2, l, r);
> +                               Py_DECREF(l);
> +                               Py_DECREF(r);
> +                               if (!outer) {
> +                                       goto nomem;
> +                               }
> +                               PyDict_SetItem(ret, key, outer);
> +                               Py_DECREF(outer);
> +                       } else if (clean) {
> +                               PyDict_SetItem(ret, key, Py_None);
> +                       }
> +                       sneedle++;
> +                       oneedle++;
> +               }
> +               Py_DECREF(key);
> +       }
> +       Py_DECREF(emptyTup);
> +       return ret;
> + nomem:
> +       PyErr_NoMemory();
> +       Py_XDECREF(ret);
> +       Py_XDECREF(emptyTup);
> +       return NULL;
> +}
> +
> +static PyMethodDef lazymanifest_methods[] = {
> +       {"copy", (PyCFunction)lazymanifest_copy, METH_NOARGS,
> +        "Make a copy of this lazymanifest."},
> +       {"filtercopy", (PyCFunction)lazymanifest_filtercopy, METH_O,
> +        "Make a copy of this manifest filtered by matchfn."},
> +       {"diff", (PyCFunction)lazymanifest_diff, METH_VARARGS,
> +        "Compare this lazymanifest to another one."},
> +       {"text", (PyCFunction)lazymanifest_text, METH_NOARGS,
> +        "Encode this manifest to text."},
> +       {NULL},
> +};
> +
> +static PyTypeObject lazymanifestType = {
> +       PyObject_HEAD_INIT(NULL)
> +       0,                                                /* ob_size */
> +       "parsers.lazymanifest",                           /* tp_name */
> +       sizeof(lazymanifest),                             /* tp_basicsize
> */
> +       0,                                                /* tp_itemsize */
> +       (destructor)lazymanifest_dealloc,                 /* tp_dealloc */
> +       0,                                                /* tp_print */
> +       0,                                                /* tp_getattr */
> +       0,                                                /* tp_setattr */
> +       0,                                                /* tp_compare */
> +       0,                                                /* tp_repr */
> +       0,                                                /* tp_as_number
> */
> +       &lazymanifest_seq_meths,                          /*
> tp_as_sequence */
> +       &lazymanifest_mapping_methods,                    /*
> tp_as_mapping */
> +       0,                                                /* tp_hash */
> +       0,                                                /* tp_call */
> +       0,                                                /* tp_str */
> +       0,                                                /* tp_getattro */
> +       0,                                                /* tp_setattro */
> +       0,                                                /* tp_as_buffer
> */
> +       Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_SEQUENCE_IN, /* tp_flags */
> +       "TODO(augie)",                                    /* tp_doc */
> +       0,                                                /* tp_traverse */
> +       0,                                                /* tp_clear */
> +       0,                                                /*
> tp_richcompare */
> +       0,                                             /*
> tp_weaklistoffset */
> +       (getiterfunc)lazymanifest_getiter,                /* tp_iter */
> +       0,                                                /* tp_iternext */
> +       lazymanifest_methods,                             /* tp_methods */
> +       0,                                                /* tp_members */
> +       0,                                                /* tp_getset */
> +       0,                                                /* tp_base */
> +       0,                                                /* tp_dict */
> +       0,                                                /* tp_descr_get
> */
> +       0,                                                /* tp_descr_set
> */
> +       0,                                                /* tp_dictoffset
> */
> +       (initproc)lazymanifest_init,                      /* tp_init */
> +       0,                                                /* tp_alloc */
> +};
> +
> +void manifest_module_init(PyObject * mod)
> +{
> +       lazymanifestType.tp_new = PyType_GenericNew;
> +       if (PyType_Ready(&lazymanifestType) < 0)
> +               return;
> +       Py_INCREF(&lazymanifestType);
> +
> +       PyModule_AddObject(mod, "lazymanifest",
> +                          (PyObject *)&lazymanifestType);
> +}
> diff --git a/mercurial/parsers.c b/mercurial/parsers.c
> --- a/mercurial/parsers.c
> +++ b/mercurial/parsers.c
> @@ -71,7 +71,7 @@ static inline int hexdigit(const char *p
>  /*
>   * Turn a hex-encoded string into binary.
>   */
> -static PyObject *unhexlify(const char *str, int len)
> +PyObject *unhexlify(const char *str, int len)
>  {
>         PyObject *ret;
>         char *d;
> @@ -2233,6 +2233,7 @@ static PyMethodDef methods[] = {
>  };
>
>  void dirs_module_init(PyObject *mod);
> +void manifest_module_init(PyObject *mod);
>
>  static void module_init(PyObject *mod)
>  {
> @@ -2247,6 +2248,7 @@ static void module_init(PyObject *mod)
>         PyModule_AddStringConstant(mod, "versionerrortext",
> versionerrortext);
>
>         dirs_module_init(mod);
> +       manifest_module_init(mod);
>
>         indexType.tp_new = PyType_GenericNew;
>         if (PyType_Ready(&indexType) < 0 ||
> diff --git a/setup.py b/setup.py
> --- a/setup.py
> +++ b/setup.py
> @@ -491,6 +491,7 @@ extmodules = [
>      Extension('mercurial.mpatch', ['mercurial/mpatch.c'],
>                depends=common_depends),
>      Extension('mercurial.parsers', ['mercurial/dirs.c',
> +                                    'mercurial/manifest.c',
>                                      'mercurial/parsers.c',
>                                      'mercurial/pathencode.c'],
>                depends=common_depends),
> diff --git a/tests/test-manifest.py b/tests/test-manifest.py
> new file mode 100644
> --- /dev/null
> +++ b/tests/test-manifest.py
> @@ -0,0 +1,209 @@
> +import binascii
> +import unittest
> +import itertools
> +
> +import silenttestrunner
> +
> +from mercurial import parsers
> +
> +HASH_1 = '1' * 40
> +HASH_2 = 'f' * 40
> +HASH_3 = '1234567890abcdef0987654321deadbeef0fcafe'
> +A_SHORT_MANIFEST = (
> +    'bar/baz/qux.py\0%(hash2)s%(flag2)s\n'
> +    'foo\0%(hash1)s%(flag1)s\n'
> +    ) % {'hash1': HASH_1,
> +         'flag1': '',
> +         'hash2': HASH_2,
> +         'flag2': 'l',
> +         }
> +
> +HUGE_MANIFEST_ENTRIES = 200001
> +
> +A_HUGE_MANIFEST = ''.join(sorted(
> +    'file%d\0%s%s\n' % (i, h, f) for i, h, f in
> +    itertools.izip(xrange(200001),
> +                   itertools.cycle((HASH_1, HASH_2)),
> +                   itertools.cycle(('', 'x', 'l')))))
> +
> +class testmanifest(unittest.TestCase):
> +    def testEmptyManifest(self):
> +        m = parsers.lazymanifest('')
> +        self.assertEqual(0, len(m))
> +        self.assertEqual([], list(m))
> +
> +    def testManifest(self):
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        want = [
> +            ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
> +            ('foo', binascii.unhexlify(HASH_1), ''),
> +            ]
> +        self.assertEqual(len(want), len(m))
> +        self.assertEqual(want, list(m))
> +        self.assertEqual((binascii.unhexlify(HASH_1), ''), m['foo'])
> +        self.assertRaises(KeyError, lambda : m['wat'])
> +        self.assertEqual((binascii.unhexlify(HASH_2), 'l'),
> +                         m['bar/baz/qux.py'])
> +
> +    def testSetItem(self):
> +        want = binascii.unhexlify(HASH_1), ''
> +
> +        m = parsers.lazymanifest('')
> +        m['a'] = want
> +        self.assertIn('a', m)
> +        self.assertEqual(want, m['a'])
> +        self.assertEqual('a\0' + HASH_1 + '\n', m.text())
> +
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m['a'] = want
> +        self.assertEqual(want, m['a'])
> +        self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST,
> +                         m.text())
> +        m2 = m.copy()
> +        del m
> +        del m2 # make sure we don't double free() anything
> +
> +    def testCompaction(self):
> +        unhex = binascii.unhexlify
> +        h1, h2 = unhex(HASH_1), unhex(HASH_2)
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m['alpha'] = h1, ''
> +        m['beta'] = h2, ''
> +        del m['foo']
> +        want = 'alpha\0%s\nbar/baz/qux.py\0%sl\nbeta\0%s\n' % (
> +            HASH_1, HASH_2, HASH_2)
> +        self.assertEqual(want, m.text())
> +        self.assertEqual(3, len(m))
> +        self.assertEqual((h1, ''), m['alpha'])
> +        self.assertEqual((h2, ''), m['beta'])
> +        self.assertRaises(KeyError, lambda : m['foo'])
> +        w = [('alpha', h1, ''), ('bar/baz/qux.py', h2, 'l'), ('beta', h2,
> '')]
> +        self.assertEqual(w, list(m))
> +
> +    def testSetGetNodeSuffix(self):
> +        clean = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        h, f = m['foo']
> +        want = h + 'a', f
> +        # Merge code wants to set 21-byte fake hashes at times
> +        m['foo'] = want
> +        self.assertEqual(want, m['foo'])
> +        self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2),
> 'l'),
> +                          ('foo', binascii.unhexlify(HASH_1) + 'a', '')],
> +                         list(m))
> +        # Sometimes it even tries a 22-byte fake hash, but we can
> +        # return 21 and it'll work out
> +        m['foo'] = want[0] + '+', f
> +        self.assertEqual(want, m['foo'])
> +        # make sure the suffix survives a copy
> +        m2 = m.filtercopy(lambda x: x == 'foo')
> +        self.assertEqual(want, m2['foo'])
> +        m2 = m.copy()
> +        self.assertEqual(want, m2['foo'])
> +        # suffix with iteration
> +        self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2),
> 'l'),
> +                          ('foo', want[0], '')], list(m))
> +        # shows up in diff
> +        self.assertEqual({'foo': (want, (h, ''))}, m.diff(clean))
> +        self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m))
> +
> +    def testFilterCopyException(self):
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        def filt(path):
> +            if path == 'foo':
> +                assert False
> +            return True
> +        self.assertRaises(AssertionError, m.filtercopy, filt)
> +
> +    def testRemoveItem(self):
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        del m['foo']
> +        self.assertRaises(KeyError, lambda : m['foo'])
> +        self.assertEqual(1, len(m))
> +        self.assertEqual(1, len(list(m)))
> +
> +    def testManifestDiff(self):
> +        MISSING = (None, '')
> +        addl = 'z-only-in-left\0' + HASH_1 + '\n'
> +        addr = 'z-only-in-right\0' + HASH_2 + 'x\n'
> +        left = parsers.lazymanifest(
> +            A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl)
> +        right = parsers.lazymanifest(A_SHORT_MANIFEST + addr)
> +        want = {
> +            'foo': ((binascii.unhexlify(HASH_3), 'x'),
> +                    (binascii.unhexlify(HASH_1), '')),
> +            'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
> +            'z-only-in-right': (MISSING, (binascii.unhexlify(HASH_2),
> 'x')),
> +            }
> +        self.assertEqual(want, left.diff(right))
> +
> +        want = {
> +            'bar/baz/qux.py': (MISSING, (binascii.unhexlify(HASH_2),
> 'l')),
> +            'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')),
> +            'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            }
> +        self.assertEqual(want, parsers.lazymanifest('').diff(left))
> +
> +        want = {
> +            'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'),
> MISSING),
> +            'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING),
> +            'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
> +            }
> +        self.assertEqual(want, left.diff(parsers.lazymanifest('')))
> +        copy = right.copy()
> +        del copy['z-only-in-right']
> +        del right['foo']
> +        want = {
> +            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            'z-only-in-right': ((binascii.unhexlify(HASH_2), 'x'),
> MISSING),
> +            }
> +        self.assertEqual(want, right.diff(copy))
> +
> +        short = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        pruned = short.copy()
> +        del pruned['foo']
> +        want = {
> +            'foo': ((binascii.unhexlify(HASH_1), ''), MISSING),
> +            }
> +        self.assertEqual(want, short.diff(pruned))
> +        want = {
> +            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            }
> +        self.assertEqual(want, pruned.diff(short))
> +        want = {
> +            'bar/baz/qux.py': None,
> +            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            }
> +        self.assertEqual(want, pruned.diff(short, True))
> +
> +    def testReversedLines(self):
> +        backwards = ''.join(
> +            l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if
> l)
> +        try:
> +            parsers.lazymanifest(backwards)
> +            self.fail('Should have raised ValueError')
> +        except ValueError, v:
> +            self.assertEqual('Manifest lines not in sorted order.',
> v.message)
> +
> +    def testNoTerminalNewline(self):
> +        try:
> +            parsers.lazymanifest(A_SHORT_MANIFEST + 'wat')
> +            self.fail('Should have raised ValueError')
> +        except ValueError, v:
> +            self.assertEqual('Manifest did not end in a newline.',
> v.message)
> +
> +    def testNoNewLineAtAll(self):
> +        try:
> +            parsers.lazymanifest('wat')
> +            self.fail('Should have raised ValueError')
> +        except ValueError, v:
> +            self.assertEqual('Manifest did not end in a newline.',
> v.message)
> +
> +    def testHugeManifest(self):
> +        m = parsers.lazymanifest(A_HUGE_MANIFEST)
> +        self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m))
> +        self.assertEqual(len(m), len(list(m)))
> +
> +
> +if __name__ == '__main__':
> +    silenttestrunner.main(__name__)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
>
Martin von Zweigbergk - Jan. 13, 2015, 6:27 a.m.
On Mon Jan 12 2015 at 12:05:17 PM Augie Fackler <raf@durin42.com> wrote:

> +static PyObject *lmiter_iternext(PyObject *o)
> +{
>
 ...

> +       ret = PyTuple_Pack(3, path, hash, flags);
>

Since we only iterate over the files (called from manifest.keys()), should
we define the files to be the items iterated over instead? Just like how
dict does it, I think. If we ever want to get the path, hash and flags
together, we could add a .items() then. What do you think? For after freeze?
Martin von Zweigbergk - Jan. 14, 2015, 12:42 a.m.
As I reported on IRC, the 'setitem' method seems slow. On the Mozilla repo,
running

  hg co -C 1813b && hg mv -q intl i18n && time hg ci -qm 'move intl'

takes 6.3s with current hg and 1m21s with these patches applied. It may
very be related to your TODO in setitem.

On Mon Jan 12 2015 at 12:05:17 PM Augie Fackler <raf@durin42.com> wrote:

> # HG changeset patch
> # User Augie Fackler <augie@google.com>
> # Date 1420871231 28800
> #      Fri Jan 09 22:27:11 2015 -0800
> # Node ID b9d67bf3ea8196c8530afdcf6b1727682b681259
> # Parent  678f53865c6860a950392691814766957ee89316
> manifest.c: new extension code to lazily parse manifests
>
> This lets us iterate manifests in order, but do a _lot_ less work in
> the common case when we only care about a few manifest entries.
>
> Many thanks to Mike Edgar for reviewing this in advance of it going
> out to the list, which caught many things I missed.
>
> This version of the patch includes C89 fixes from Sean Farley and
> many correctness/efficiency cleanups from Martin von
> Zweigbergk. Thanks to both!
>
> diff --git a/mercurial/manifest.c b/mercurial/manifest.c
> new file mode 100644
> --- /dev/null
> +++ b/mercurial/manifest.c
> @@ -0,0 +1,865 @@
> +/*
> + * manifest.c - manifest type that does on-demand parsing.
> + *
> + * Copyright 2015, Google Inc.
> + *
> + * This software may be used and distributed according to the terms of
> + * the GNU General Public License, incorporated herein by reference.
> + */
> +#include <assert.h>
> +#include <string.h>
> +#include <stdlib.h>
> +
> +#include <Python.h>
> +
> +/* VC9 doesn't include bool and lacks stdbool.h based on my searching */
> +#ifdef _MSC_VER
> +#define true 1
> +#define false 0
> +typedef unsigned char bool;
> +#else
> +#include <stdbool.h>
> +#endif
> +
> +#define DEFAULT_LINES 100000
> +
> +typedef struct {
> +       char *start;
> +       Py_ssize_t len; /* length of line not including terminal newline */
> +       char hash_suffix;
> +       bool from_malloc;
> +       bool deleted;
> +} line;
> +
> +typedef struct {
> +       PyObject_HEAD
> +       PyObject *pydata;
> +       line *lines;
> +       int numlines; /* number of line entries */
> +       int livelines; /* number of non-deleted lines */
> +       int maxlines; /* allocated number of lines */
> +       bool dirty;
> +} lazymanifest;
> +
> +#define MANIFEST_OOM -1
> +#define MANIFEST_NOT_SORTED -2
> +#define MANIFEST_MALFORMED -3
> +
> +/* defined in parsers.c */
> +PyObject *unhexlify(const char *str, int len);
> +
> +/* get the length of the path for a line */
> +static size_t pathlen(line *l) {
> +       return strnlen(l->start, l->len);
> +}
> +
> +/* get the node value of a single line */
> +static PyObject *nodeof(line *l) {
> +       char *s = l->start;
> +       ssize_t llen = pathlen(l);
> +       PyObject *hash = unhexlify(s + llen + 1, 40);
> +       if (!hash) {
> +               return NULL;
> +       }
> +       if (l->hash_suffix != '\0') {
> +               char newhash[21];
> +               memcpy(newhash, PyString_AsString(hash), 20);
> +               Py_DECREF(hash);
> +               newhash[20] = l->hash_suffix;
> +               hash = PyString_FromStringAndSize(newhash, 21);
> +       }
> +       return hash;
> +}
> +
> +/* get the node hash and flags of a line as a tuple */
> +static PyObject *hashflags(line *l)
> +{
> +       char *s = l->start;
> +       size_t plen = pathlen(l);
> +       PyObject *hash = nodeof(l);
> +
> +       /* 40 for hash, 1 for null byte, 1 for newline */
> +       size_t hplen = plen + 42;
> +       Py_ssize_t flen = l->len - hplen;
> +       PyObject *flags;
> +       PyObject *tup;
> +
> +       if (!hash)
> +               return NULL;
> +       if (flen > 0) {
> +               flags = PyString_FromStringAndSize(s + hplen - 1, flen);
> +       } else {
> +               flags = PyString_FromString("");
> +       }
> +       if (!flags) {
> +               Py_DECREF(hash);
> +               return NULL;
> +       }
> +       tup = PyTuple_Pack(2, hash, flags);
> +       Py_DECREF(flags);
> +       Py_DECREF(hash);
> +       return tup;
> +}
> +
> +/* if we're about to run out of space in the line index, add more */
> +static bool realloc_if_full(lazymanifest *self)
> +{
> +       if (self->numlines == self->maxlines) {
> +               self->maxlines *= 2;
> +               self->lines = realloc(self->lines, self->maxlines *
> sizeof(line));
> +       }
> +       return self->lines;
> +}
> +
> +/*
> + * Find the line boundaries in the manifest that 'data' points to and
> store
> + * information about each line in 'self'.
> + */
> +static int find_lines(lazymanifest *self, char *data, Py_ssize_t len)
> +{
> +       char *prev = NULL;
> +       line *l = NULL;
> +       while (len > 0) {
> +               char *next = memchr(data, '\n', len);
> +               if (!next) {
> +                       return MANIFEST_MALFORMED;
> +               }
> +               next++; /* advance past newline */
> +               if (!realloc_if_full(self)) {
> +                       return MANIFEST_OOM; /* no memory */
> +               }
> +               if (prev && strcmp(prev, data) > -1) {
> +                       /* This data isn't sorted, so we have to abort. */
> +                       return MANIFEST_NOT_SORTED;
> +               }
> +               l = self->lines + ((self->numlines)++);
> +               l->start = data;
> +               l->len = next - data;
> +               l->hash_suffix = '\0';
> +               l->from_malloc = false;
> +               l->deleted = false;
> +               len = len - l->len;
> +               prev = data;
> +               data = next;
> +       }
> +       self->livelines = self->numlines;
> +       return 0;
> +}
> +
> +static int lazymanifest_init(lazymanifest *self, PyObject *args)
> +{
> +       char *data;
> +       Py_ssize_t len;
> +       int err, ret;
> +       PyObject *pydata;
> +       if (!PyArg_ParseTuple(args, "S", &pydata)) {
> +               return -1;
> +       }
> +       err = PyString_AsStringAndSize(pydata, &data, &len);
> +
> +       self->dirty = false;
> +       if (err == -1)
> +               return -1;
> +       self->pydata = pydata;
> +       Py_INCREF(self->pydata);
> +       Py_BEGIN_ALLOW_THREADS
> +       self->lines = malloc(DEFAULT_LINES * sizeof(line));
> +       self->maxlines = DEFAULT_LINES;
> +       self->numlines = 0;
> +       if (!self->lines)
> +               ret = -2;
> +       else
> +               ret = find_lines(self, data, len);
> +       Py_END_ALLOW_THREADS
> +       switch (ret) {
> +       case 0:
> +               break;
> +       case MANIFEST_OOM:
> +               PyErr_NoMemory();
> +               break;
> +       case MANIFEST_NOT_SORTED:
> +               PyErr_Format(PyExc_ValueError,
> +                            "Manifest lines not in sorted order.");
> +               break;
> +       case MANIFEST_MALFORMED:
> +               PyErr_Format(PyExc_ValueError,
> +                            "Manifest did not end in a newline.");
> +               break;
> +       default:
> +               PyErr_Format(PyExc_ValueError,
> +                            "Unknown problem parsing manifest.");
> +       }
> +       return ret == 0 ? 0 : -1;
> +}
> +
> +static void lazymanifest_dealloc(lazymanifest *self)
> +{
> +       /* free any extra lines we had to allocate */
> +       int i;
> +       for (i = 0; i < self->numlines; i++) {
> +               if (self->lines[i].from_malloc) {
> +                       free(self->lines[i].start);
> +               }
> +       }
> +       if (self->lines) {
> +               free(self->lines);
> +               self->lines = NULL;
> +       }
> +       if (self->pydata) {
> +               Py_DECREF(self->pydata);
> +               self->pydata = NULL;
> +       }
> +       PyObject_Del(self);
> +}
> +
> +/* iteration support */
> +
> +typedef struct {
> +       PyObject_HEAD lazymanifest *m;
> +       Py_ssize_t pos;
> +} lmIter;
> +
> +static void lmiter_dealloc(PyObject *o)
> +{
> +       lmIter *self = (lmIter *)o;
> +       Py_DECREF(self->m);
> +       PyObject_Del(self);
> +}
> +
> +static PyObject *lmiter_iternext(PyObject *o)
> +{
> +       size_t pl;
> +       line *l;
> +       Py_ssize_t consumed;
> +       PyObject *ret = NULL, *path = NULL, *hash = NULL, *flags = NULL;
> +       lmIter *self = (lmIter *)o;
> +       Py_ssize_t index = self->pos++;
> +       /* skip over deleted manifest entries */
> +       while (index < self->m->numlines &&
> +              self->m->lines[index].deleted) {
> +               index = ++(self->pos);
> +       }
> +       if (self->m->numlines <= index) {
> +               goto bail;
> +       }
> +       l = &(self->m->lines[index]);
> +       pl = pathlen(l);
> +       path = PyString_FromStringAndSize(l->start, pl);
> +       hash = nodeof(l);
> +       consumed = pl + 41;
> +       if (l->len > (consumed + 1)) {
> +               flags = PyString_FromStringAndSize(l->start + consumed,
> +                                                  l->len - consumed - 1);
> +       } else {
> +               flags = PyString_FromString("");
> +
> +       }
> +       if (!flags) {
> +               goto bail;
> +       }
> +       ret = PyTuple_Pack(3, path, hash, flags);
> + bail:
> +       Py_XDECREF(path);
> +       Py_XDECREF(hash);
> +       Py_XDECREF(flags);
> +       return ret;
> +}
> +
> +static PyTypeObject lazymanifestIterator = {
> +       PyObject_HEAD_INIT(NULL)
> +       0,                               /*ob_size */
> +       "parsers.lazymanifest.iterator", /*tp_name */
> +       sizeof(lmIter),                  /*tp_basicsize */
> +       0,                               /*tp_itemsize */
> +       lmiter_dealloc,                  /*tp_dealloc */
> +       0,                               /*tp_print */
> +       0,                               /*tp_getattr */
> +       0,                               /*tp_setattr */
> +       0,                               /*tp_compare */
> +       0,                               /*tp_repr */
> +       0,                               /*tp_as_number */
> +       0,                               /*tp_as_sequence */
> +       0,                               /*tp_as_mapping */
> +       0,                               /*tp_hash */
> +       0,                               /*tp_call */
> +       0,                               /*tp_str */
> +       0,                               /*tp_getattro */
> +       0,                               /*tp_setattro */
> +       0,                               /*tp_as_buffer */
> +       /* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to
> +          use tp_iter and tp_iternext fields. */
> +       Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER,
> +       "Iterator for a lazymanifest.",  /* tp_doc */
> +       0,                               /* tp_traverse */
> +       0,                               /* tp_clear */
> +       0,                               /* tp_richcompare */
> +       0,                               /* tp_weaklistoffset */
> +       PyObject_SelfIter,               /* tp_iter: __iter__() method */
> +       lmiter_iternext,                 /* tp_iternext: next() method */
> +};
> +
> +static lazymanifest *lazymanifest_copy(
> +       lazymanifest *self, PyObject *unused_args);
> +
> +static PyObject *lazymanifest_getiter(lazymanifest *self)
> +{
> +       lmIter *i = NULL;
> +       lazymanifest *t = lazymanifest_copy(self, NULL);
> +       if (!t) {
> +               PyErr_NoMemory();
> +               return NULL;
> +       }
> +       i = PyObject_New(lmIter, &lazymanifestIterator);
> +       if (i) {
> +               i->m = t;
> +               i->pos = 0;
> +       } else {
> +               Py_DECREF(t);
> +               PyErr_NoMemory();
> +       }
> +       return (PyObject *)i;
> +}
> +
> +/* __getitem__ and __setitem__ support */
> +
> +static Py_ssize_t lazymanifest_size(lazymanifest *self)
> +{
> +       return self->livelines;
> +}
> +
> +static int linecmp(const void *left, const void *right)
> +{
> +       return strcmp(((const line *)left)->start,
> +                     ((const line *)right)->start);
> +}
> +
> +static PyObject *lazymanifest_getitem(lazymanifest *self, PyObject *key)
> +{
> +       line needle;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "getitem: manifest keys must be a string.");
> +               return NULL;
> +       }
> +       needle.start = PyString_AsString(key);
> +       needle.len = 0;
> +       hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
> +                     &linecmp);
> +       if (!hit || hit->deleted) {
> +               PyErr_Format(PyExc_KeyError, "No such manifest entry.");
> +               return NULL;
> +       }
> +       return hashflags(hit);
> +}
> +
> +static int lazymanifest_delitem(lazymanifest *self, PyObject *key)
> +{
> +       line needle;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "delitem: manifest keys must be a string.");
> +               return -1;
> +       }
> +       needle.start = PyString_AsString(key);
> +       needle.len = 0;
> +       hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
> +                     &linecmp);
> +       if (!hit || hit->deleted) {
> +               PyErr_Format(PyExc_KeyError,
> +                            "Tried to delete nonexistent manifest
> entry.");
> +               return -1;
> +       }
> +       self->dirty = true;
> +       hit->deleted = true;
> +       self->livelines--;
> +       return 0;
> +}
> +
> +static int lazymanifest_setitem(
> +       lazymanifest *self, PyObject *key, PyObject *value)
> +{
> +       char *path;
> +       Py_ssize_t plen;
> +       PyObject *pyhash;
> +       Py_ssize_t hlen;
> +       char *hash;
> +       char hashextra;
> +       PyObject *pyflags;
> +       char *flags;
> +       Py_ssize_t flen;
> +       size_t dlen;
> +       char *dest;
> +       int i;
> +       line new;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "setitem: manifest keys must be a string.");
> +               return -1;
> +       }
> +       if (!value) {
> +               return lazymanifest_delitem(self, key);
> +       }
> +       if (!PyTuple_Check(value) || PyTuple_Size(value) != 2) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "Manifest values must be a tuple of (node,
> flags).");
> +               return -1;
> +       }
> +       if (PyString_AsStringAndSize(key, &path, &plen) == -1) {
> +               return -1;
> +       }
> +
> +       pyhash = PyTuple_GetItem(value, 0);
> +       if (!PyString_Check(pyhash)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "node must be a 20-byte string");
> +               return -1;
> +       }
> +       hlen = PyString_Size(pyhash);
> +       /* Some parts of the codebase try and set 21 or 22
> +        * byte "hash" values in order to perturb things for
> +        * status. We have to preserve at least the 21st
> +        * byte. Sigh. If there's a 22nd byte, we drop it on
> +        * the floor, which works fine.
> +        */
> +       if (hlen != 20 && hlen != 21 && hlen != 22) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "node must be a 20-byte string");
> +               return -1;
> +       }
> +       hash = PyString_AsString(pyhash);
> +       hashextra = '\0';
> +       if (hlen > 20) {
> +               hashextra = hash[20];
> +       }
> +
> +       pyflags = PyTuple_GetItem(value, 1);
> +       if (!PyString_Check(pyflags) || PyString_Size(pyflags) > 1) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "flags must a 0 or 1 byte string");
> +               return -1;
> +       }
> +       if (PyString_AsStringAndSize(pyflags, &flags, &flen) == -1) {
> +               return -1;
> +       }
> +       /* 3 -> two null bytes and one newline */
> +       dlen = plen + 40 + flen + 3;
> +       dest = malloc(dlen);
> +       if (!dest) {
> +               PyErr_NoMemory();
> +               return -1;
> +       }
> +       memcpy(dest, path, plen + 1);
> +       for (i = 0; i < 20; i++) {
> +               sprintf(dest + plen + 1 + (i * 2), "%02hhx", hash[i]);
> +       }
> +       sprintf(dest + plen + 41, "%s\n", flags);
> +       new.start = dest;
> +       new.len = dlen - 1;
> +       new.hash_suffix = hashextra;
> +       new.from_malloc = true;     /* is `start` a pointer we allocated?
> */
> +       new.deleted = false;        /* is this entry deleted? */
> +       hit = bsearch(&new, self->lines, self->numlines,
> +                     sizeof(line), &linecmp);
> +       self->dirty = true;
> +       if (hit) {
> +               /* updating a line we already had */
> +               if (hit->from_malloc) {
> +                       free(hit->start);
> +               }
> +               if (hit->deleted) {
> +                       self->livelines++;
> +               }
> +               *hit = new;
> +       } else {
> +               /* new line */
> +               if (!realloc_if_full(self)) {
> +                       PyErr_NoMemory();
> +                       return -1;
> +               }
> +               self->lines[self->numlines++] = new;
> +               self->livelines++;
> +               /* TODO: do a binary search and insert rather than
> +                * append and qsort. Also offer a batch-insert
> +                * interface, which should be a nice little
> +                * performance win.
> +                */
> +               qsort(self->lines, self->numlines, sizeof(line), &linecmp);
> +       }
> +       return 0;
> +}
> +
> +static PyMappingMethods lazymanifest_mapping_methods = {
> +       (lenfunc)lazymanifest_size,             /* mp_length */
> +       (binaryfunc)lazymanifest_getitem,       /* mp_subscript */
> +       (objobjargproc)lazymanifest_setitem,    /* mp_ass_subscript */
> +};
> +
> +/* sequence methods (important or __contains__ builds an iterator */
> +
> +static int lazymanifest_contains(lazymanifest *self, PyObject *key)
> +{
> +       line needle;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "contains: manifest keys must be a string.");
> +               return 0;
> +       }
> +       needle.start = PyString_AsString(key);
> +       needle.len = 0;
> +       hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
> +                     &linecmp);
> +       if (!hit || hit->deleted) {
> +               return 0;
> +       }
> +       return 1;
> +}
> +
> +static PySequenceMethods lazymanifest_seq_meths = {
> +       (lenfunc)lazymanifest_size, /* sq_length */
> +       0, /* sq_concat */
> +       0, /* sq_repeat */
> +       0, /* sq_item */
> +       0, /* sq_slice */
> +       0, /* sq_ass_item */
> +       0, /* sq_ass_slice */
> +       (objobjproc)lazymanifest_contains, /* sq_contains */
> +       0, /* sq_inplace_concat */
> +       0, /* sq_inplace_repeat */
> +};
> +
> +
> +/* Other methods (copy, diff, etc) */
> +static PyTypeObject lazymanifestType;
> +
> +/* If the manifest has changes, build the new manifest text and reindex
> it. */
> +static int compact(lazymanifest *self) {
> +       int i;
> +       ssize_t need = 0;
> +       char *data;
> +       ssize_t copied = 0;
> +       int real = 0;
> +       line *src, *dst;
> +       char *tofree;
> +       ssize_t c;
> +       PyObject *pydata;
> +       char *realdest;
> +       int err;
> +       Py_ssize_t offset;
> +       line new;
> +       if (!self->dirty)
> +               return 0;
> +       for (i = 0; i < self->numlines; i++) {
> +               if (!self->lines[i].deleted) {
> +                       need += self->lines[i].len;
> +               }
> +       }
> +       pydata = PyString_FromStringAndSize(NULL, need);
> +       if (!pydata)
> +               return -1;
> +       data = PyString_AsString(pydata);
> +       if (!data) {
> +               return -1;
> +       }
> +       src = self->lines;
> +       dst = self->lines;
> +       for (i = 0; i < self->numlines; i++, src++) {
> +               tofree = NULL;
> +               if (src->from_malloc) {
> +                       tofree = src->start;
> +               }
> +               if (!src->deleted) {
> +                       memcpy(data, src->start, src->len);
> +                       *dst = *src;
> +                       dst->start = data;
> +                       dst->from_malloc = false;
> +                       data += dst->len;
> +                       dst++;
> +               }
> +               free(tofree);
> +       }
> +       Py_DECREF(self->pydata);
> +       self->pydata = pydata;
> +       self->numlines = self->livelines;
> +       self->dirty = false;
> +       return 0;
> +}
> +
> +static PyObject *lazymanifest_text(lazymanifest *self, PyObject
> *unused_args)
> +{
> +       if (compact(self) != 0) {
> +               PyErr_NoMemory();
> +               return NULL;
> +       }
> +       Py_INCREF(self->pydata);
> +       return self->pydata;
> +}
> +
> +static lazymanifest *lazymanifest_copy(
> +       lazymanifest *self, PyObject *unused_args)
> +{
> +       lazymanifest *copy = NULL;
> +       if (compact(self) != 0) {
> +               goto nomem;
> +       }
> +       copy = PyObject_New(lazymanifest, &lazymanifestType);
> +       if (!copy) {
> +               goto nomem;
> +       }
> +       copy->numlines = self->numlines;
> +       copy->livelines = self->livelines;
> +       copy->dirty = false;
> +       copy->lines = malloc(self->maxlines *sizeof(line));
> +       if (!copy->lines) {
> +               goto nomem;
> +       }
> +       memcpy(copy->lines, self->lines, self->numlines * sizeof(line));
> +       copy->maxlines = self->maxlines;
> +       copy->pydata = self->pydata;
> +       Py_INCREF(copy->pydata);
> +       return copy;
> + nomem:
> +       PyErr_NoMemory();
> +       Py_XDECREF(copy);
> +       return NULL;
> +}
> +
> +static lazymanifest *lazymanifest_filtercopy(
> +       lazymanifest *self, PyObject *matchfn)
> +{
> +       lazymanifest *tmp;
> +       PyObject *arg;
> +       PyObject *arglist;
> +       PyObject *result;
> +       int i;
> +       if (!PyCallable_Check(matchfn)) {
> +               PyErr_SetString(PyExc_TypeError, "matchfn must be
> callable");
> +               return NULL;
> +       }
> +       /* compact ourselves first to avoid double-frees later when we
> +        * compact tmp so that it doesn't have random pointers to our
> +        * underlying data */
> +       compact(self);
> +       tmp = PyObject_New(lazymanifest, &lazymanifestType);
> +       tmp->lines = malloc(self->maxlines * sizeof(line));
> +       tmp->maxlines = self->maxlines;
> +       tmp->livelines = 0;
> +       tmp->numlines = 0;
> +       tmp->pydata = self->pydata;
> +       Py_INCREF(self->pydata);
> +       for (i = 0; i < self->numlines; i++) {
> +               arg = PyString_FromString(self->lines[i].start);
> +               arglist = PyTuple_Pack(1, arg);
> +               result = PyObject_CallObject(matchfn, arglist);
> +               /* if the callback raised an exception, just let it
> +                * through and give up */
> +               if (!result) {
> +                       free(tmp->lines);
> +                       tmp->lines = NULL;
> +                       Py_DECREF(self->pydata);
> +                       return NULL;
> +               }
> +               if (PyObject_IsTrue(result)) {
> +                       assert(!(self->lines[i].from_malloc));
> +                       tmp->lines[tmp->numlines++] = self->lines[i];
> +                       tmp->livelines++;
> +               }
> +               Py_DECREF(arglist);
> +               Py_DECREF(arg);
> +               Py_DECREF(result);
> +       }
> +       compact(tmp);
> +       return tmp;
> +}
> +
> +static PyObject *lazymanifest_diff(lazymanifest *self, PyObject *args)
> +{
> +       lazymanifest *other;
> +       PyObject *pyclean = NULL;
> +       bool clean;
> +       PyObject *emptyTup = NULL, *ret = NULL;
> +       PyObject *es;
> +       int sneedle = 0, oneedle = 0;
> +       line *left;
> +       line *right;
> +       int result;
> +       PyObject *key;
> +       PyObject *tup;
> +       PyObject *outer;
> +       PyObject *r;
> +       if (!PyArg_ParseTuple(args, "O!|O", &lazymanifestType, &other,
> &pyclean)) {
> +               return NULL;
> +       }
> +       clean = (!pyclean) ? false : PyObject_IsTrue(pyclean);
> +       es = PyString_FromString("");
> +       if (!es) {
> +               goto nomem;
> +       }
> +       emptyTup = PyTuple_Pack(2, Py_None, es);
> +       Py_DECREF(es);
> +       if (!emptyTup) {
> +               goto nomem;
> +       }
> +       ret = PyDict_New();
> +       if (!ret) {
> +               goto nomem;
> +       }
> +       while (sneedle != self->numlines || oneedle != other->numlines) {
> +               left = self->lines + sneedle;
> +               right = other->lines + oneedle;
> +               /* If we're looking at a deleted entry and it's not
> +                * the end of the manifest, just skip it. */
> +               if (left->deleted && sneedle < self->numlines) {
> +                       sneedle++;
> +                       continue;
> +               }
> +               if (right->deleted && oneedle < other->numlines) {
> +                       oneedle++;
> +                       continue;
> +               }
> +               /* if we're at the end of either manifest, then we
> +                * know the remaining items are adds so we can skip
> +                * the strcmp. */
> +               if (sneedle == self->numlines) {
> +                       result = 1;
> +               } else if (oneedle == other->numlines) {
> +                       result = -1;
> +               } else {
> +                       result = linecmp(left, right);
> +               }
> +               key = result <= 0 ?
> +                       PyString_FromString(left->start) :
> +                       PyString_FromString(right->start);
> +               if (!key)
> +                       goto nomem;
> +               if (result < 0) {
> +                       tup = hashflags(left);
> +                       if (!tup) {
> +                               goto nomem;
> +                       }
> +                       outer = PyTuple_Pack(2, tup, emptyTup);
> +                       Py_DECREF(tup);
> +                       if (!outer) {
> +                               goto nomem;
> +                       }
> +                       PyDict_SetItem(ret, key, outer);
> +                       Py_DECREF(outer);
> +                       sneedle++;
> +               } else if (result > 0) {
> +                       tup = hashflags(right);
> +                       if (!tup) {
> +                               goto nomem;
> +                       }
> +                       outer = PyTuple_Pack(2, emptyTup, tup);
> +                       Py_DECREF(tup);
> +                       if (!outer) {
> +                               goto nomem;
> +                       }
> +                       PyDict_SetItem(ret, key, outer);
> +                       Py_DECREF(outer);
> +                       oneedle++;
> +               } else {
> +                       /* file exists in both manifests */
> +                       if (left->len != right->len
> +                           || memcmp(left->start, right->start, left->len)
> +                           || left->hash_suffix != right->hash_suffix) {
> +                               PyObject *l = hashflags(left);
> +                               if (!l) {
> +                                       goto nomem;
> +                               }
> +                               r = hashflags(right);
> +                               if (!r) {
> +                                       Py_DECREF(l);
> +                                       goto nomem;
> +                               }
> +                               outer = PyTuple_Pack(2, l, r);
> +                               Py_DECREF(l);
> +                               Py_DECREF(r);
> +                               if (!outer) {
> +                                       goto nomem;
> +                               }
> +                               PyDict_SetItem(ret, key, outer);
> +                               Py_DECREF(outer);
> +                       } else if (clean) {
> +                               PyDict_SetItem(ret, key, Py_None);
> +                       }
> +                       sneedle++;
> +                       oneedle++;
> +               }
> +               Py_DECREF(key);
> +       }
> +       Py_DECREF(emptyTup);
> +       return ret;
> + nomem:
> +       PyErr_NoMemory();
> +       Py_XDECREF(ret);
> +       Py_XDECREF(emptyTup);
> +       return NULL;
> +}
> +
> +static PyMethodDef lazymanifest_methods[] = {
> +       {"copy", (PyCFunction)lazymanifest_copy, METH_NOARGS,
> +        "Make a copy of this lazymanifest."},
> +       {"filtercopy", (PyCFunction)lazymanifest_filtercopy, METH_O,
> +        "Make a copy of this manifest filtered by matchfn."},
> +       {"diff", (PyCFunction)lazymanifest_diff, METH_VARARGS,
> +        "Compare this lazymanifest to another one."},
> +       {"text", (PyCFunction)lazymanifest_text, METH_NOARGS,
> +        "Encode this manifest to text."},
> +       {NULL},
> +};
> +
> +static PyTypeObject lazymanifestType = {
> +       PyObject_HEAD_INIT(NULL)
> +       0,                                                /* ob_size */
> +       "parsers.lazymanifest",                           /* tp_name */
> +       sizeof(lazymanifest),                             /* tp_basicsize
> */
> +       0,                                                /* tp_itemsize */
> +       (destructor)lazymanifest_dealloc,                 /* tp_dealloc */
> +       0,                                                /* tp_print */
> +       0,                                                /* tp_getattr */
> +       0,                                                /* tp_setattr */
> +       0,                                                /* tp_compare */
> +       0,                                                /* tp_repr */
> +       0,                                                /* tp_as_number
> */
> +       &lazymanifest_seq_meths,                          /*
> tp_as_sequence */
> +       &lazymanifest_mapping_methods,                    /* tp_as_mapping
> */
> +       0,                                                /* tp_hash */
> +       0,                                                /* tp_call */
> +       0,                                                /* tp_str */
> +       0,                                                /* tp_getattro */
> +       0,                                                /* tp_setattro */
> +       0,                                                /* tp_as_buffer
> */
> +       Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_SEQUENCE_IN, /* tp_flags */
> +       "TODO(augie)",                                    /* tp_doc */
> +       0,                                                /* tp_traverse */
> +       0,                                                /* tp_clear */
> +       0,                                                /*
> tp_richcompare */
> +       0,                                             /*
> tp_weaklistoffset */
> +       (getiterfunc)lazymanifest_getiter,                /* tp_iter */
> +       0,                                                /* tp_iternext */
> +       lazymanifest_methods,                             /* tp_methods */
> +       0,                                                /* tp_members */
> +       0,                                                /* tp_getset */
> +       0,                                                /* tp_base */
> +       0,                                                /* tp_dict */
> +       0,                                                /* tp_descr_get
> */
> +       0,                                                /* tp_descr_set
> */
> +       0,                                                /* tp_dictoffset
> */
> +       (initproc)lazymanifest_init,                      /* tp_init */
> +       0,                                                /* tp_alloc */
> +};
> +
> +void manifest_module_init(PyObject * mod)
> +{
> +       lazymanifestType.tp_new = PyType_GenericNew;
> +       if (PyType_Ready(&lazymanifestType) < 0)
> +               return;
> +       Py_INCREF(&lazymanifestType);
> +
> +       PyModule_AddObject(mod, "lazymanifest",
> +                          (PyObject *)&lazymanifestType);
> +}
> diff --git a/mercurial/parsers.c b/mercurial/parsers.c
> --- a/mercurial/parsers.c
> +++ b/mercurial/parsers.c
> @@ -71,7 +71,7 @@ static inline int hexdigit(const char *p
>  /*
>   * Turn a hex-encoded string into binary.
>   */
> -static PyObject *unhexlify(const char *str, int len)
> +PyObject *unhexlify(const char *str, int len)
>  {
>         PyObject *ret;
>         char *d;
> @@ -2233,6 +2233,7 @@ static PyMethodDef methods[] = {
>  };
>
>  void dirs_module_init(PyObject *mod);
> +void manifest_module_init(PyObject *mod);
>
>  static void module_init(PyObject *mod)
>  {
> @@ -2247,6 +2248,7 @@ static void module_init(PyObject *mod)
>         PyModule_AddStringConstant(mod, "versionerrortext",
> versionerrortext);
>
>         dirs_module_init(mod);
> +       manifest_module_init(mod);
>
>         indexType.tp_new = PyType_GenericNew;
>         if (PyType_Ready(&indexType) < 0 ||
> diff --git a/setup.py b/setup.py
> --- a/setup.py
> +++ b/setup.py
> @@ -491,6 +491,7 @@ extmodules = [
>      Extension('mercurial.mpatch', ['mercurial/mpatch.c'],
>                depends=common_depends),
>      Extension('mercurial.parsers', ['mercurial/dirs.c',
> +                                    'mercurial/manifest.c',
>                                      'mercurial/parsers.c',
>                                      'mercurial/pathencode.c'],
>                depends=common_depends),
> diff --git a/tests/test-manifest.py b/tests/test-manifest.py
> new file mode 100644
> --- /dev/null
> +++ b/tests/test-manifest.py
> @@ -0,0 +1,209 @@
> +import binascii
> +import unittest
> +import itertools
> +
> +import silenttestrunner
> +
> +from mercurial import parsers
> +
> +HASH_1 = '1' * 40
> +HASH_2 = 'f' * 40
> +HASH_3 = '1234567890abcdef0987654321deadbeef0fcafe'
> +A_SHORT_MANIFEST = (
> +    'bar/baz/qux.py\0%(hash2)s%(flag2)s\n'
> +    'foo\0%(hash1)s%(flag1)s\n'
> +    ) % {'hash1': HASH_1,
> +         'flag1': '',
> +         'hash2': HASH_2,
> +         'flag2': 'l',
> +         }
> +
> +HUGE_MANIFEST_ENTRIES = 200001
> +
> +A_HUGE_MANIFEST = ''.join(sorted(
> +    'file%d\0%s%s\n' % (i, h, f) for i, h, f in
> +    itertools.izip(xrange(200001),
> +                   itertools.cycle((HASH_1, HASH_2)),
> +                   itertools.cycle(('', 'x', 'l')))))
> +
> +class testmanifest(unittest.TestCase):
> +    def testEmptyManifest(self):
> +        m = parsers.lazymanifest('')
> +        self.assertEqual(0, len(m))
> +        self.assertEqual([], list(m))
> +
> +    def testManifest(self):
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        want = [
> +            ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
> +            ('foo', binascii.unhexlify(HASH_1), ''),
> +            ]
> +        self.assertEqual(len(want), len(m))
> +        self.assertEqual(want, list(m))
> +        self.assertEqual((binascii.unhexlify(HASH_1), ''), m['foo'])
> +        self.assertRaises(KeyError, lambda : m['wat'])
> +        self.assertEqual((binascii.unhexlify(HASH_2), 'l'),
> +                         m['bar/baz/qux.py'])
> +
> +    def testSetItem(self):
> +        want = binascii.unhexlify(HASH_1), ''
> +
> +        m = parsers.lazymanifest('')
> +        m['a'] = want
> +        self.assertIn('a', m)
> +        self.assertEqual(want, m['a'])
> +        self.assertEqual('a\0' + HASH_1 + '\n', m.text())
> +
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m['a'] = want
> +        self.assertEqual(want, m['a'])
> +        self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST,
> +                         m.text())
> +        m2 = m.copy()
> +        del m
> +        del m2 # make sure we don't double free() anything
> +
> +    def testCompaction(self):
> +        unhex = binascii.unhexlify
> +        h1, h2 = unhex(HASH_1), unhex(HASH_2)
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m['alpha'] = h1, ''
> +        m['beta'] = h2, ''
> +        del m['foo']
> +        want = 'alpha\0%s\nbar/baz/qux.py\0%sl\nbeta\0%s\n' % (
> +            HASH_1, HASH_2, HASH_2)
> +        self.assertEqual(want, m.text())
> +        self.assertEqual(3, len(m))
> +        self.assertEqual((h1, ''), m['alpha'])
> +        self.assertEqual((h2, ''), m['beta'])
> +        self.assertRaises(KeyError, lambda : m['foo'])
> +        w = [('alpha', h1, ''), ('bar/baz/qux.py', h2, 'l'), ('beta', h2,
> '')]
> +        self.assertEqual(w, list(m))
> +
> +    def testSetGetNodeSuffix(self):
> +        clean = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        h, f = m['foo']
> +        want = h + 'a', f
> +        # Merge code wants to set 21-byte fake hashes at times
> +        m['foo'] = want
> +        self.assertEqual(want, m['foo'])
> +        self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2),
> 'l'),
> +                          ('foo', binascii.unhexlify(HASH_1) + 'a', '')],
> +                         list(m))
> +        # Sometimes it even tries a 22-byte fake hash, but we can
> +        # return 21 and it'll work out
> +        m['foo'] = want[0] + '+', f
> +        self.assertEqual(want, m['foo'])
> +        # make sure the suffix survives a copy
> +        m2 = m.filtercopy(lambda x: x == 'foo')
> +        self.assertEqual(want, m2['foo'])
> +        m2 = m.copy()
> +        self.assertEqual(want, m2['foo'])
> +        # suffix with iteration
> +        self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2),
> 'l'),
> +                          ('foo', want[0], '')], list(m))
> +        # shows up in diff
> +        self.assertEqual({'foo': (want, (h, ''))}, m.diff(clean))
> +        self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m))
> +
> +    def testFilterCopyException(self):
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        def filt(path):
> +            if path == 'foo':
> +                assert False
> +            return True
> +        self.assertRaises(AssertionError, m.filtercopy, filt)
> +
> +    def testRemoveItem(self):
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        del m['foo']
> +        self.assertRaises(KeyError, lambda : m['foo'])
> +        self.assertEqual(1, len(m))
> +        self.assertEqual(1, len(list(m)))
> +
> +    def testManifestDiff(self):
> +        MISSING = (None, '')
> +        addl = 'z-only-in-left\0' + HASH_1 + '\n'
> +        addr = 'z-only-in-right\0' + HASH_2 + 'x\n'
> +        left = parsers.lazymanifest(
> +            A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl)
> +        right = parsers.lazymanifest(A_SHORT_MANIFEST + addr)
> +        want = {
> +            'foo': ((binascii.unhexlify(HASH_3), 'x'),
> +                    (binascii.unhexlify(HASH_1), '')),
> +            'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
> +            'z-only-in-right': (MISSING, (binascii.unhexlify(HASH_2),
> 'x')),
> +            }
> +        self.assertEqual(want, left.diff(right))
> +
> +        want = {
> +            'bar/baz/qux.py': (MISSING, (binascii.unhexlify(HASH_2),
> 'l')),
> +            'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')),
> +            'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            }
> +        self.assertEqual(want, parsers.lazymanifest('').diff(left))
> +
> +        want = {
> +            'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'),
> MISSING),
> +            'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING),
> +            'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
> +            }
> +        self.assertEqual(want, left.diff(parsers.lazymanifest('')))
> +        copy = right.copy()
> +        del copy['z-only-in-right']
> +        del right['foo']
> +        want = {
> +            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            'z-only-in-right': ((binascii.unhexlify(HASH_2), 'x'),
> MISSING),
> +            }
> +        self.assertEqual(want, right.diff(copy))
> +
> +        short = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        pruned = short.copy()
> +        del pruned['foo']
> +        want = {
> +            'foo': ((binascii.unhexlify(HASH_1), ''), MISSING),
> +            }
> +        self.assertEqual(want, short.diff(pruned))
> +        want = {
> +            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            }
> +        self.assertEqual(want, pruned.diff(short))
> +        want = {
> +            'bar/baz/qux.py': None,
> +            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            }
> +        self.assertEqual(want, pruned.diff(short, True))
> +
> +    def testReversedLines(self):
> +        backwards = ''.join(
> +            l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if
> l)
> +        try:
> +            parsers.lazymanifest(backwards)
> +            self.fail('Should have raised ValueError')
> +        except ValueError, v:
> +            self.assertEqual('Manifest lines not in sorted order.',
> v.message)
> +
> +    def testNoTerminalNewline(self):
> +        try:
> +            parsers.lazymanifest(A_SHORT_MANIFEST + 'wat')
> +            self.fail('Should have raised ValueError')
> +        except ValueError, v:
> +            self.assertEqual('Manifest did not end in a newline.',
> v.message)
> +
> +    def testNoNewLineAtAll(self):
> +        try:
> +            parsers.lazymanifest('wat')
> +            self.fail('Should have raised ValueError')
> +        except ValueError, v:
> +            self.assertEqual('Manifest did not end in a newline.',
> v.message)
> +
> +    def testHugeManifest(self):
> +        m = parsers.lazymanifest(A_HUGE_MANIFEST)
> +        self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m))
> +        self.assertEqual(len(m), len(list(m)))
> +
> +
> +if __name__ == '__main__':
> +    silenttestrunner.main(__name__)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
>
Augie Fackler - Jan. 14, 2015, 2:11 a.m.
On Jan 13, 2015, at 7:42 PM, Martin von Zweigbergk <martinvonz@google.com> wrote:

> As I reported on IRC, the 'setitem' method seems slow. On the Mozilla repo, running
> 
>   hg co -C 1813b && hg mv -q intl i18n && time hg ci -qm 'move intl'
> 
> takes 6.3s with current hg and 1m21s with these patches applied. It may very be related to your TODO in setitem.

Yeah, I think this is the point where we should drop v2, and I’ll work on a bulk-add function (shouldn’t take long) to mitigate that problem.

Always an edge case.
Martin von Zweigbergk - Jan. 14, 2015, 2:22 a.m.
On Tue, Jan 13, 2015, 18:11 Augie Fackler <raf@durin42.com> wrote:


On Jan 13, 2015, at 7:42 PM, Martin von Zweigbergk <martinvonz@google.com>
wrote:

> As I reported on IRC, the 'setitem' method seems slow. On the Mozilla
repo, running
>
>   hg co -C 1813b && hg mv -q intl i18n && time hg ci -qm 'move intl'
>
> takes 6.3s with current hg and 1m21s with these patches applied. It may
very be related to your TODO in setitem.

Yeah, I think this is the point where we should drop v2, and I’ll work on a
bulk-add function (shouldn’t take long) to mitigate that problem.



 I think I once suggested making it transparent to the user by instead
storing modifications separately and then compacting them on any query
operation. That should effectively be the same (performance-wise) as an
explicit bulk update operation, but without the need to update any callers.
Augie Fackler - Jan. 14, 2015, 2:24 a.m.
On Jan 13, 2015, at 9:22 PM, Martin von Zweigbergk <martinvonz@google.com> wrote:

> 
> 
> On Tue, Jan 13, 2015, 18:11 Augie Fackler <raf@durin42.com> wrote:
> 
> 
> On Jan 13, 2015, at 7:42 PM, Martin von Zweigbergk <martinvonz@google.com> wrote:
> 
> > As I reported on IRC, the 'setitem' method seems slow. On the Mozilla repo, running
> >
> >   hg co -C 1813b && hg mv -q intl i18n && time hg ci -qm 'move intl'
> >
> > takes 6.3s with current hg and 1m21s with these patches applied. It may very be related to your TODO in setitem.
> 
> Yeah, I think this is the point where we should drop v2, and I’ll work on a bulk-add function (shouldn’t take long) to mitigate that problem.
> 
> 
> 
> I think I once suggested making it transparent to the user by instead storing modifications separately and then compacting them on any query operation. That should effectively be the same (performance-wise) as an explicit bulk update operation, but without the need to update any callers.

But if someone inserts a new entry, then does it again, the binary search won’t work, which makes things frustrating. If you’ve got an idea for that, I’d love to hear it though (I haven’t come up with anything better.)
Martin von Zweigbergk - Jan. 14, 2015, 2:35 a.m.
If updates are in bulk without queries in between, it should be possible to
just append the updates to the list. Then, on query, sort with a stable
sort, pick the last entry for each file and drop the rest. Makes sense?

If queries are interleaved, we would have a problem anyway and would have
to do something smarter.

On Tue, Jan 13, 2015, 18:24 Augie Fackler <raf@durin42.com> wrote:

>
> On Jan 13, 2015, at 9:22 PM, Martin von Zweigbergk <martinvonz@google.com>
> wrote:
>
> >
> >
> > On Tue, Jan 13, 2015, 18:11 Augie Fackler <raf@durin42.com> wrote:
> >
> >
> > On Jan 13, 2015, at 7:42 PM, Martin von Zweigbergk <
> martinvonz@google.com> wrote:
> >
> > > As I reported on IRC, the 'setitem' method seems slow. On the Mozilla
> repo, running
> > >
> > >   hg co -C 1813b && hg mv -q intl i18n && time hg ci -qm 'move intl'
> > >
> > > takes 6.3s with current hg and 1m21s with these patches applied. It
> may very be related to your TODO in setitem.
> >
> > Yeah, I think this is the point where we should drop v2, and I’ll work
> on a bulk-add function (shouldn’t take long) to mitigate that problem.
> >
> >
> >
> > I think I once suggested making it transparent to the user by instead
> storing modifications separately and then compacting them on any query
> operation. That should effectively be the same (performance-wise) as an
> explicit bulk update operation, but without the need to update any callers.
>
> But if someone inserts a new entry, then does it again, the binary search
> won’t work, which makes things frustrating. If you’ve got an idea for that,
> I’d love to hear it though (I haven’t come up with anything better.)
>
Augie Fackler - Jan. 14, 2015, 2:40 a.m.
On Jan 13, 2015, at 9:35 PM, Martin von Zweigbergk <martinvonz@google.com> wrote:

> If updates are in bulk without queries in between, it should be possible to just append the updates to the list. Then, on query, sort with a stable sort, pick the last entry for each file and drop the rest. Makes sense?

Yes, but how do we tell if the code is doing something like

m[‘foo’] = node1
m[‘foo’] = node2

which would fail if we inserted ‘foo’ on the first call and then didn’t manage to realize we already had an (unsorted) line for foo, but still appears to be just insertions.

If we have a bulk-insert code path, we can avoid that part of the problem.

> 
> If queries are interleaved, we would have a problem anyway and would have to do something smarter.
> 
> 
> On Tue, Jan 13, 2015, 18:24 Augie Fackler <raf@durin42.com> wrote:
> 
> On Jan 13, 2015, at 9:22 PM, Martin von Zweigbergk <martinvonz@google.com> wrote:
> 
> >
> >
> > On Tue, Jan 13, 2015, 18:11 Augie Fackler <raf@durin42.com> wrote:
> >
> >
> > On Jan 13, 2015, at 7:42 PM, Martin von Zweigbergk <martinvonz@google.com> wrote:
> >
> > > As I reported on IRC, the 'setitem' method seems slow. On the Mozilla repo, running
> > >
> > >   hg co -C 1813b && hg mv -q intl i18n && time hg ci -qm 'move intl'
> > >
> > > takes 6.3s with current hg and 1m21s with these patches applied. It may very be related to your TODO in setitem.
> >
> > Yeah, I think this is the point where we should drop v2, and I’ll work on a bulk-add function (shouldn’t take long) to mitigate that problem.
> >
> >
> >
> > I think I once suggested making it transparent to the user by instead storing modifications separately and then compacting them on any query operation. That should effectively be the same (performance-wise) as an explicit bulk update operation, but without the need to update any callers.
> 
> But if someone inserts a new entry, then does it again, the binary search won’t work, which makes things frustrating. If you’ve got an idea for that, I’d love to hear it though (I haven’t come up with anything better.)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
Martin von Zweigbergk - Jan. 14, 2015, 5:42 a.m.
On Tue, Jan 13, 2015 at 6:40 PM, Augie Fackler <raf@durin42.com> wrote:
>
> On Jan 13, 2015, at 9:35 PM, Martin von Zweigbergk <martinvonz@google.com> wrote:
>
>> If updates are in bulk without queries in between, it should be possible to just append the updates to the list. Then, on query, sort with a stable sort, pick the last entry for each file and drop the rest. Makes sense?
>
> Yes, but how do we tell if the code is doing something like
>
> m[‘foo’] = node1
> m[‘foo’] = node2
>
> which would fail if we inserted ‘foo’ on the first call and then didn’t manage to realize we already had an (unsorted) line for foo, but still appears to be just insertions.

I'm not sure I follow, but let me explain what I meant. Let's say we
initially have:

bar=node3
foo=node0
qux=node4

Then your two __setitem__ calls happen and we have

bar=node3
foo=node0
qux=node4
foo=node1
foo=node2

On query (__iter__, __getitem__, __contains__), we see that the dirty
bit is set and we first sort:

bar=node3
foo=node0
foo=node1
foo=node2
qux=node4

Then we keep only the last entry for each file:

bar=node3
foo=node2
qux=node4

This seems to be working to me. A problem I can see with this way of
imitating the dict interface is that way would not fail on "del
m['baz']" even though that would fail on a dict. Doesn't seems like
much of a problem in practice, but I may be wrong.

Either way, do we know that updates generally are not interleaved with
queries? I haven't bothered checking. If they are interleaved, perhaps
we could rewrite them to be less so.


>
> If we have a bulk-insert code path, we can avoid that part of the problem.
>
>>
>> If queries are interleaved, we would have a problem anyway and would have to do something smarter.
>>
>>
>> On Tue, Jan 13, 2015, 18:24 Augie Fackler <raf@durin42.com> wrote:
>>
>> On Jan 13, 2015, at 9:22 PM, Martin von Zweigbergk <martinvonz@google.com> wrote:
>>
>> >
>> >
>> > On Tue, Jan 13, 2015, 18:11 Augie Fackler <raf@durin42.com> wrote:
>> >
>> >
>> > On Jan 13, 2015, at 7:42 PM, Martin von Zweigbergk <martinvonz@google.com> wrote:
>> >
>> > > As I reported on IRC, the 'setitem' method seems slow. On the Mozilla repo, running
>> > >
>> > >   hg co -C 1813b && hg mv -q intl i18n && time hg ci -qm 'move intl'
>> > >
>> > > takes 6.3s with current hg and 1m21s with these patches applied. It may very be related to your TODO in setitem.
>> >
>> > Yeah, I think this is the point where we should drop v2, and I’ll work on a bulk-add function (shouldn’t take long) to mitigate that problem.
>> >
>> >
>> >
>> > I think I once suggested making it transparent to the user by instead storing modifications separately and then compacting them on any query operation. That should effectively be the same (performance-wise) as an explicit bulk update operation, but without the need to update any callers.
>>
>> But if someone inserts a new entry, then does it again, the binary search won’t work, which makes things frustrating. If you’ve got an idea for that, I’d love to hear it though (I haven’t come up with anything better.)
>> _______________________________________________
>> Mercurial-devel mailing list
>> Mercurial-devel@selenic.com
>> http://selenic.com/mailman/listinfo/mercurial-devel
>
Ryan McElroy - Jan. 14, 2015, 6:32 p.m.
On 1/12/2015 12:04 PM, Augie Fackler wrote:
> # HG changeset patch
> # User Augie Fackler <augie@google.com>
> # Date 1420871231 28800
> #      Fri Jan 09 22:27:11 2015 -0800
> # Node ID b9d67bf3ea8196c8530afdcf6b1727682b681259
> # Parent  678f53865c6860a950392691814766957ee89316
> manifest.c: new extension code to lazily parse manifests
>
> This lets us iterate manifests in order, but do a _lot_ less work in
> the common case when we only care about a few manifest entries.
>
> Many thanks to Mike Edgar for reviewing this in advance of it going
> out to the list, which caught many things I missed.
>
> This version of the patch includes C89 fixes from Sean Farley and
> many correctness/efficiency cleanups from Martin von
> Zweigbergk. Thanks to both!
>
I went through the deploy process at Facebook but tripped over a number 
of issues:

* hgsubversion tests fail with errors related to lazymanifest
* evolve tests fail with errors related to lazymanifest
* hg-git tests fail for unrelated reasons

So we won't be deploying this immediately, and won't be able to get real 
world testing in. I'd suggest holding off on getting this in before the cut.

That being said, the performance numbers on our big www repo look great, 
especially with some additional caching we're doing here (buildstatus 
and pathcopies are memoized).

~Ryan
Augie Fackler - Jan. 14, 2015, 6:35 p.m.
On Wed, Jan 14, 2015 at 1:32 PM, Ryan McElroy <rm@fb.com> wrote:
> That being said, the performance numbers on our big www repo look great,
> especially with some additional caching we're doing here (buildstatus and
> pathcopies are memoized).


OOC, how great?
Ryan McElroy - Jan. 14, 2015, 10:29 p.m.
On 1/14/2015 10:35 AM, Augie Fackler wrote:
> On Wed, Jan 14, 2015 at 1:32 PM, Ryan McElroy <rm@fb.com> wrote:
>> That being said, the performance numbers on our big www repo look great,
>> especially with some additional caching we're doing here (buildstatus and
>> pathcopies are memoized).
>
> OOC, how great?
1) typical vanilla hg export on my dev box:
real    0m2.143s
user    0m1.819s
sys     0m0.319s

2) hg export with lazymanifest:
real    0m1.523s
user    0m1.236s
sys     0m0.280s

3) hg export with simplecache extension:
real    0m0.976s
user    0m0.523s
sys     0m0.214s

4) hg export with lazymanifest and simplecache:
real    0m0.577s
user    0m0.369s
sys     0m0.203s

This is on a FusionIO mounted copy of our big php repository.

Patch

diff --git a/mercurial/manifest.c b/mercurial/manifest.c
new file mode 100644
--- /dev/null
+++ b/mercurial/manifest.c
@@ -0,0 +1,865 @@ 
+/*
+ * manifest.c - manifest type that does on-demand parsing.
+ *
+ * Copyright 2015, Google Inc.
+ *
+ * This software may be used and distributed according to the terms of
+ * the GNU General Public License, incorporated herein by reference.
+ */
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <Python.h>
+
+/* VC9 doesn't include bool and lacks stdbool.h based on my searching */
+#ifdef _MSC_VER
+#define true 1
+#define false 0
+typedef unsigned char bool;
+#else
+#include <stdbool.h>
+#endif
+
+#define DEFAULT_LINES 100000
+
+typedef struct {
+	char *start;
+	Py_ssize_t len; /* length of line not including terminal newline */
+	char hash_suffix;
+	bool from_malloc;
+	bool deleted;
+} line;
+
+typedef struct {
+	PyObject_HEAD
+	PyObject *pydata;
+	line *lines;
+	int numlines; /* number of line entries */
+	int livelines; /* number of non-deleted lines */
+	int maxlines; /* allocated number of lines */
+	bool dirty;
+} lazymanifest;
+
+#define MANIFEST_OOM -1
+#define MANIFEST_NOT_SORTED -2
+#define MANIFEST_MALFORMED -3
+
+/* defined in parsers.c */
+PyObject *unhexlify(const char *str, int len);
+
+/* get the length of the path for a line */
+static size_t pathlen(line *l) {
+	return strnlen(l->start, l->len);
+}
+
+/* get the node value of a single line */
+static PyObject *nodeof(line *l) {
+	char *s = l->start;
+	ssize_t llen = pathlen(l);
+	PyObject *hash = unhexlify(s + llen + 1, 40);
+	if (!hash) {
+		return NULL;
+	}
+	if (l->hash_suffix != '\0') {
+		char newhash[21];
+		memcpy(newhash, PyString_AsString(hash), 20);
+		Py_DECREF(hash);
+		newhash[20] = l->hash_suffix;
+		hash = PyString_FromStringAndSize(newhash, 21);
+	}
+	return hash;
+}
+
+/* get the node hash and flags of a line as a tuple */
+static PyObject *hashflags(line *l)
+{
+	char *s = l->start;
+	size_t plen = pathlen(l);
+	PyObject *hash = nodeof(l);
+
+	/* 40 for hash, 1 for null byte, 1 for newline */
+	size_t hplen = plen + 42;
+	Py_ssize_t flen = l->len - hplen;
+	PyObject *flags;
+	PyObject *tup;
+
+	if (!hash)
+		return NULL;
+	if (flen > 0) {
+		flags = PyString_FromStringAndSize(s + hplen - 1, flen);
+	} else {
+		flags = PyString_FromString("");
+	}
+	if (!flags) {
+		Py_DECREF(hash);
+		return NULL;
+	}
+	tup = PyTuple_Pack(2, hash, flags);
+	Py_DECREF(flags);
+	Py_DECREF(hash);
+	return tup;
+}
+
+/* if we're about to run out of space in the line index, add more */
+static bool realloc_if_full(lazymanifest *self)
+{
+	if (self->numlines == self->maxlines) {
+		self->maxlines *= 2;
+		self->lines = realloc(self->lines, self->maxlines * sizeof(line));
+	}
+	return self->lines;
+}
+
+/*
+ * Find the line boundaries in the manifest that 'data' points to and store
+ * information about each line in 'self'.
+ */
+static int find_lines(lazymanifest *self, char *data, Py_ssize_t len)
+{
+	char *prev = NULL;
+	line *l = NULL;
+	while (len > 0) {
+		char *next = memchr(data, '\n', len);
+		if (!next) {
+			return MANIFEST_MALFORMED;
+		}
+		next++; /* advance past newline */
+		if (!realloc_if_full(self)) {
+			return MANIFEST_OOM; /* no memory */
+		}
+		if (prev && strcmp(prev, data) > -1) {
+			/* This data isn't sorted, so we have to abort. */
+			return MANIFEST_NOT_SORTED;
+		}
+		l = self->lines + ((self->numlines)++);
+		l->start = data;
+		l->len = next - data;
+		l->hash_suffix = '\0';
+		l->from_malloc = false;
+		l->deleted = false;
+		len = len - l->len;
+		prev = data;
+		data = next;
+	}
+	self->livelines = self->numlines;
+	return 0;
+}
+
+static int lazymanifest_init(lazymanifest *self, PyObject *args)
+{
+	char *data;
+	Py_ssize_t len;
+	int err, ret;
+	PyObject *pydata;
+	if (!PyArg_ParseTuple(args, "S", &pydata)) {
+		return -1;
+	}
+	err = PyString_AsStringAndSize(pydata, &data, &len);
+
+	self->dirty = false;
+	if (err == -1)
+		return -1;
+	self->pydata = pydata;
+	Py_INCREF(self->pydata);
+	Py_BEGIN_ALLOW_THREADS
+	self->lines = malloc(DEFAULT_LINES * sizeof(line));
+	self->maxlines = DEFAULT_LINES;
+	self->numlines = 0;
+	if (!self->lines)
+		ret = -2;
+	else
+		ret = find_lines(self, data, len);
+	Py_END_ALLOW_THREADS
+	switch (ret) {
+	case 0:
+		break;
+	case MANIFEST_OOM:
+		PyErr_NoMemory();
+		break;
+	case MANIFEST_NOT_SORTED:
+		PyErr_Format(PyExc_ValueError,
+		             "Manifest lines not in sorted order.");
+		break;
+	case MANIFEST_MALFORMED:
+		PyErr_Format(PyExc_ValueError,
+		             "Manifest did not end in a newline.");
+		break;
+	default:
+		PyErr_Format(PyExc_ValueError,
+		             "Unknown problem parsing manifest.");
+	}
+	return ret == 0 ? 0 : -1;
+}
+
+static void lazymanifest_dealloc(lazymanifest *self)
+{
+	/* free any extra lines we had to allocate */
+	int i;
+	for (i = 0; i < self->numlines; i++) {
+		if (self->lines[i].from_malloc) {
+			free(self->lines[i].start);
+		}
+	}
+	if (self->lines) {
+		free(self->lines);
+		self->lines = NULL;
+	}
+	if (self->pydata) {
+		Py_DECREF(self->pydata);
+		self->pydata = NULL;
+	}
+	PyObject_Del(self);
+}
+
+/* iteration support */
+
+typedef struct {
+	PyObject_HEAD lazymanifest *m;
+	Py_ssize_t pos;
+} lmIter;
+
+static void lmiter_dealloc(PyObject *o)
+{
+	lmIter *self = (lmIter *)o;
+	Py_DECREF(self->m);
+	PyObject_Del(self);
+}
+
+static PyObject *lmiter_iternext(PyObject *o)
+{
+	size_t pl;
+	line *l;
+	Py_ssize_t consumed;
+	PyObject *ret = NULL, *path = NULL, *hash = NULL, *flags = NULL;
+	lmIter *self = (lmIter *)o;
+	Py_ssize_t index = self->pos++;
+	/* skip over deleted manifest entries */
+	while (index < self->m->numlines &&
+	       self->m->lines[index].deleted) {
+		index = ++(self->pos);
+	}
+	if (self->m->numlines <= index) {
+		goto bail;
+	}
+	l = &(self->m->lines[index]);
+	pl = pathlen(l);
+	path = PyString_FromStringAndSize(l->start, pl);
+	hash = nodeof(l);
+	consumed = pl + 41;
+	if (l->len > (consumed + 1)) {
+		flags = PyString_FromStringAndSize(l->start + consumed,
+		                                   l->len - consumed - 1);
+	} else {
+		flags = PyString_FromString("");
+
+	}
+	if (!flags) {
+		goto bail;
+	}
+	ret = PyTuple_Pack(3, path, hash, flags);
+ bail:
+	Py_XDECREF(path);
+	Py_XDECREF(hash);
+	Py_XDECREF(flags);
+	return ret;
+}
+
+static PyTypeObject lazymanifestIterator = {
+	PyObject_HEAD_INIT(NULL)
+	0,                               /*ob_size */
+	"parsers.lazymanifest.iterator", /*tp_name */
+	sizeof(lmIter),                  /*tp_basicsize */
+	0,                               /*tp_itemsize */
+	lmiter_dealloc,                  /*tp_dealloc */
+	0,                               /*tp_print */
+	0,                               /*tp_getattr */
+	0,                               /*tp_setattr */
+	0,                               /*tp_compare */
+	0,                               /*tp_repr */
+	0,                               /*tp_as_number */
+	0,                               /*tp_as_sequence */
+	0,                               /*tp_as_mapping */
+	0,                               /*tp_hash */
+	0,                               /*tp_call */
+	0,                               /*tp_str */
+	0,                               /*tp_getattro */
+	0,                               /*tp_setattro */
+	0,                               /*tp_as_buffer */
+	/* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to
+	   use tp_iter and tp_iternext fields. */
+	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER,
+	"Iterator for a lazymanifest.",  /* tp_doc */
+	0,                               /* tp_traverse */
+	0,                               /* tp_clear */
+	0,                               /* tp_richcompare */
+	0,                               /* tp_weaklistoffset */
+	PyObject_SelfIter,               /* tp_iter: __iter__() method */
+	lmiter_iternext,                 /* tp_iternext: next() method */
+};
+
+static lazymanifest *lazymanifest_copy(
+	lazymanifest *self, PyObject *unused_args);
+
+static PyObject *lazymanifest_getiter(lazymanifest *self)
+{
+	lmIter *i = NULL;
+	lazymanifest *t = lazymanifest_copy(self, NULL);
+	if (!t) {
+		PyErr_NoMemory();
+		return NULL;
+	}
+	i = PyObject_New(lmIter, &lazymanifestIterator);
+	if (i) {
+		i->m = t;
+		i->pos = 0;
+	} else {
+		Py_DECREF(t);
+		PyErr_NoMemory();
+	}
+	return (PyObject *)i;
+}
+
+/* __getitem__ and __setitem__ support */
+
+static Py_ssize_t lazymanifest_size(lazymanifest *self)
+{
+	return self->livelines;
+}
+
+static int linecmp(const void *left, const void *right)
+{
+	return strcmp(((const line *)left)->start,
+	              ((const line *)right)->start);
+}
+
+static PyObject *lazymanifest_getitem(lazymanifest *self, PyObject *key)
+{
+	line needle;
+	line *hit;
+	if (!PyString_Check(key)) {
+		PyErr_Format(PyExc_TypeError,
+		             "getitem: manifest keys must be a string.");
+		return NULL;
+	}
+	needle.start = PyString_AsString(key);
+	needle.len = 0;
+	hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
+	              &linecmp);
+	if (!hit || hit->deleted) {
+		PyErr_Format(PyExc_KeyError, "No such manifest entry.");
+		return NULL;
+	}
+	return hashflags(hit);
+}
+
+static int lazymanifest_delitem(lazymanifest *self, PyObject *key)
+{
+	line needle;
+	line *hit;
+	if (!PyString_Check(key)) {
+		PyErr_Format(PyExc_TypeError,
+		             "delitem: manifest keys must be a string.");
+		return -1;
+	}
+	needle.start = PyString_AsString(key);
+	needle.len = 0;
+	hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
+	              &linecmp);
+	if (!hit || hit->deleted) {
+		PyErr_Format(PyExc_KeyError,
+		             "Tried to delete nonexistent manifest entry.");
+		return -1;
+	}
+	self->dirty = true;
+	hit->deleted = true;
+	self->livelines--;
+	return 0;
+}
+
+static int lazymanifest_setitem(
+	lazymanifest *self, PyObject *key, PyObject *value)
+{
+	char *path;
+	Py_ssize_t plen;
+	PyObject *pyhash;
+	Py_ssize_t hlen;
+	char *hash;
+	char hashextra;
+	PyObject *pyflags;
+	char *flags;
+	Py_ssize_t flen;
+	size_t dlen;
+	char *dest;
+	int i;
+	line new;
+	line *hit;
+	if (!PyString_Check(key)) {
+		PyErr_Format(PyExc_TypeError,
+		             "setitem: manifest keys must be a string.");
+		return -1;
+	}
+	if (!value) {
+		return lazymanifest_delitem(self, key);
+	}
+	if (!PyTuple_Check(value) || PyTuple_Size(value) != 2) {
+		PyErr_Format(PyExc_TypeError,
+		             "Manifest values must be a tuple of (node, flags).");
+		return -1;
+	}
+	if (PyString_AsStringAndSize(key, &path, &plen) == -1) {
+		return -1;
+	}
+
+	pyhash = PyTuple_GetItem(value, 0);
+	if (!PyString_Check(pyhash)) {
+		PyErr_Format(PyExc_TypeError,
+		             "node must be a 20-byte string");
+		return -1;
+	}
+	hlen = PyString_Size(pyhash);
+	/* Some parts of the codebase try and set 21 or 22
+	 * byte "hash" values in order to perturb things for
+	 * status. We have to preserve at least the 21st
+	 * byte. Sigh. If there's a 22nd byte, we drop it on
+	 * the floor, which works fine.
+	 */
+	if (hlen != 20 && hlen != 21 && hlen != 22) {
+		PyErr_Format(PyExc_TypeError,
+		             "node must be a 20-byte string");
+		return -1;
+	}
+	hash = PyString_AsString(pyhash);
+	hashextra = '\0';
+	if (hlen > 20) {
+		hashextra = hash[20];
+	}
+
+	pyflags = PyTuple_GetItem(value, 1);
+	if (!PyString_Check(pyflags) || PyString_Size(pyflags) > 1) {
+		PyErr_Format(PyExc_TypeError,
+		             "flags must a 0 or 1 byte string");
+		return -1;
+	}
+	if (PyString_AsStringAndSize(pyflags, &flags, &flen) == -1) {
+		return -1;
+	}
+	/* 3 -> two null bytes and one newline */
+	dlen = plen + 40 + flen + 3;
+	dest = malloc(dlen);
+	if (!dest) {
+		PyErr_NoMemory();
+		return -1;
+	}
+	memcpy(dest, path, plen + 1);
+	for (i = 0; i < 20; i++) {
+		sprintf(dest + plen + 1 + (i * 2), "%02hhx", hash[i]);
+	}
+	sprintf(dest + plen + 41, "%s\n", flags);
+	new.start = dest;
+	new.len = dlen - 1;
+	new.hash_suffix = hashextra;
+	new.from_malloc = true;     /* is `start` a pointer we allocated? */
+	new.deleted = false;        /* is this entry deleted? */
+	hit = bsearch(&new, self->lines, self->numlines,
+	              sizeof(line), &linecmp);
+	self->dirty = true;
+	if (hit) {
+		/* updating a line we already had */
+		if (hit->from_malloc) {
+			free(hit->start);
+		}
+		if (hit->deleted) {
+			self->livelines++;
+		}
+		*hit = new;
+	} else {
+		/* new line */
+		if (!realloc_if_full(self)) {
+			PyErr_NoMemory();
+			return -1;
+		}
+		self->lines[self->numlines++] = new;
+		self->livelines++;
+		/* TODO: do a binary search and insert rather than
+		 * append and qsort. Also offer a batch-insert
+		 * interface, which should be a nice little
+		 * performance win.
+		 */
+		qsort(self->lines, self->numlines, sizeof(line), &linecmp);
+	}
+	return 0;
+}
+
+static PyMappingMethods lazymanifest_mapping_methods = {
+	(lenfunc)lazymanifest_size,             /* mp_length */
+	(binaryfunc)lazymanifest_getitem,       /* mp_subscript */
+	(objobjargproc)lazymanifest_setitem,    /* mp_ass_subscript */
+};
+
+/* sequence methods (important or __contains__ builds an iterator */
+
+static int lazymanifest_contains(lazymanifest *self, PyObject *key)
+{
+	line needle;
+	line *hit;
+	if (!PyString_Check(key)) {
+		PyErr_Format(PyExc_TypeError,
+		             "contains: manifest keys must be a string.");
+		return 0;
+	}
+	needle.start = PyString_AsString(key);
+	needle.len = 0;
+	hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
+	              &linecmp);
+	if (!hit || hit->deleted) {
+		return 0;
+	}
+	return 1;
+}
+
+static PySequenceMethods lazymanifest_seq_meths = {
+	(lenfunc)lazymanifest_size, /* sq_length */
+	0, /* sq_concat */
+	0, /* sq_repeat */
+	0, /* sq_item */
+	0, /* sq_slice */
+	0, /* sq_ass_item */
+	0, /* sq_ass_slice */
+	(objobjproc)lazymanifest_contains, /* sq_contains */
+	0, /* sq_inplace_concat */
+	0, /* sq_inplace_repeat */
+};
+
+
+/* Other methods (copy, diff, etc) */
+static PyTypeObject lazymanifestType;
+
+/* If the manifest has changes, build the new manifest text and reindex it. */
+static int compact(lazymanifest *self) {
+	int i;
+	ssize_t need = 0;
+	char *data;
+	ssize_t copied = 0;
+	int real = 0;
+	line *src, *dst;
+	char *tofree;
+	ssize_t c;
+	PyObject *pydata;
+	char *realdest;
+	int err;
+	Py_ssize_t offset;
+	line new;
+	if (!self->dirty)
+		return 0;
+	for (i = 0; i < self->numlines; i++) {
+		if (!self->lines[i].deleted) {
+			need += self->lines[i].len;
+		}
+	}
+	pydata = PyString_FromStringAndSize(NULL, need);
+	if (!pydata)
+		return -1;
+	data = PyString_AsString(pydata);
+	if (!data) {
+		return -1;
+	}
+	src = self->lines;
+	dst = self->lines;
+	for (i = 0; i < self->numlines; i++, src++) {
+		tofree = NULL;
+		if (src->from_malloc) {
+			tofree = src->start;
+		}
+		if (!src->deleted) {
+			memcpy(data, src->start, src->len);
+			*dst = *src;
+			dst->start = data;
+			dst->from_malloc = false;
+			data += dst->len;
+			dst++;
+		}
+		free(tofree);
+	}
+	Py_DECREF(self->pydata);
+	self->pydata = pydata;
+	self->numlines = self->livelines;
+	self->dirty = false;
+	return 0;
+}
+
+static PyObject *lazymanifest_text(lazymanifest *self, PyObject *unused_args)
+{
+	if (compact(self) != 0) {
+		PyErr_NoMemory();
+		return NULL;
+	}
+	Py_INCREF(self->pydata);
+	return self->pydata;
+}
+
+static lazymanifest *lazymanifest_copy(
+	lazymanifest *self, PyObject *unused_args)
+{
+	lazymanifest *copy = NULL;
+	if (compact(self) != 0) {
+		goto nomem;
+	}
+	copy = PyObject_New(lazymanifest, &lazymanifestType);
+	if (!copy) {
+		goto nomem;
+	}
+	copy->numlines = self->numlines;
+	copy->livelines = self->livelines;
+	copy->dirty = false;
+	copy->lines = malloc(self->maxlines *sizeof(line));
+	if (!copy->lines) {
+		goto nomem;
+	}
+	memcpy(copy->lines, self->lines, self->numlines * sizeof(line));
+	copy->maxlines = self->maxlines;
+	copy->pydata = self->pydata;
+	Py_INCREF(copy->pydata);
+	return copy;
+ nomem:
+	PyErr_NoMemory();
+	Py_XDECREF(copy);
+	return NULL;
+}
+
+static lazymanifest *lazymanifest_filtercopy(
+	lazymanifest *self, PyObject *matchfn)
+{
+	lazymanifest *tmp;
+	PyObject *arg;
+	PyObject *arglist;
+	PyObject *result;
+	int i;
+	if (!PyCallable_Check(matchfn)) {
+		PyErr_SetString(PyExc_TypeError, "matchfn must be callable");
+		return NULL;
+	}
+	/* compact ourselves first to avoid double-frees later when we
+	 * compact tmp so that it doesn't have random pointers to our
+	 * underlying data */
+	compact(self);
+	tmp = PyObject_New(lazymanifest, &lazymanifestType);
+	tmp->lines = malloc(self->maxlines * sizeof(line));
+	tmp->maxlines = self->maxlines;
+	tmp->livelines = 0;
+	tmp->numlines = 0;
+	tmp->pydata = self->pydata;
+	Py_INCREF(self->pydata);
+	for (i = 0; i < self->numlines; i++) {
+		arg = PyString_FromString(self->lines[i].start);
+		arglist = PyTuple_Pack(1, arg);
+		result = PyObject_CallObject(matchfn, arglist);
+		/* if the callback raised an exception, just let it
+		 * through and give up */
+		if (!result) {
+			free(tmp->lines);
+			tmp->lines = NULL;
+			Py_DECREF(self->pydata);
+			return NULL;
+		}
+		if (PyObject_IsTrue(result)) {
+			assert(!(self->lines[i].from_malloc));
+			tmp->lines[tmp->numlines++] = self->lines[i];
+			tmp->livelines++;
+		}
+		Py_DECREF(arglist);
+		Py_DECREF(arg);
+		Py_DECREF(result);
+	}
+	compact(tmp);
+	return tmp;
+}
+
+static PyObject *lazymanifest_diff(lazymanifest *self, PyObject *args)
+{
+	lazymanifest *other;
+	PyObject *pyclean = NULL;
+	bool clean;
+	PyObject *emptyTup = NULL, *ret = NULL;
+	PyObject *es;
+	int sneedle = 0, oneedle = 0;
+	line *left;
+	line *right;
+	int result;
+	PyObject *key;
+	PyObject *tup;
+	PyObject *outer;
+	PyObject *r;
+	if (!PyArg_ParseTuple(args, "O!|O", &lazymanifestType, &other, &pyclean)) {
+		return NULL;
+	}
+	clean = (!pyclean) ? false : PyObject_IsTrue(pyclean);
+	es = PyString_FromString("");
+	if (!es) {
+		goto nomem;
+	}
+	emptyTup = PyTuple_Pack(2, Py_None, es);
+	Py_DECREF(es);
+	if (!emptyTup) {
+		goto nomem;
+	}
+	ret = PyDict_New();
+	if (!ret) {
+		goto nomem;
+	}
+	while (sneedle != self->numlines || oneedle != other->numlines) {
+		left = self->lines + sneedle;
+		right = other->lines + oneedle;
+		/* If we're looking at a deleted entry and it's not
+		 * the end of the manifest, just skip it. */
+		if (left->deleted && sneedle < self->numlines) {
+			sneedle++;
+			continue;
+		}
+		if (right->deleted && oneedle < other->numlines) {
+			oneedle++;
+			continue;
+		}
+		/* if we're at the end of either manifest, then we
+		 * know the remaining items are adds so we can skip
+		 * the strcmp. */
+		if (sneedle == self->numlines) {
+			result = 1;
+		} else if (oneedle == other->numlines) {
+			result = -1;
+		} else {
+			result = linecmp(left, right);
+		}
+		key = result <= 0 ?
+			PyString_FromString(left->start) :
+			PyString_FromString(right->start);
+		if (!key)
+			goto nomem;
+		if (result < 0) {
+			tup = hashflags(left);
+			if (!tup) {
+				goto nomem;
+			}
+			outer = PyTuple_Pack(2, tup, emptyTup);
+			Py_DECREF(tup);
+			if (!outer) {
+				goto nomem;
+			}
+			PyDict_SetItem(ret, key, outer);
+			Py_DECREF(outer);
+			sneedle++;
+		} else if (result > 0) {
+			tup = hashflags(right);
+			if (!tup) {
+				goto nomem;
+			}
+			outer = PyTuple_Pack(2, emptyTup, tup);
+			Py_DECREF(tup);
+			if (!outer) {
+				goto nomem;
+			}
+			PyDict_SetItem(ret, key, outer);
+			Py_DECREF(outer);
+			oneedle++;
+		} else {
+			/* file exists in both manifests */
+			if (left->len != right->len
+			    || memcmp(left->start, right->start, left->len)
+			    || left->hash_suffix != right->hash_suffix) {
+				PyObject *l = hashflags(left);
+				if (!l) {
+					goto nomem;
+				}
+				r = hashflags(right);
+				if (!r) {
+					Py_DECREF(l);
+					goto nomem;
+				}
+				outer = PyTuple_Pack(2, l, r);
+				Py_DECREF(l);
+				Py_DECREF(r);
+				if (!outer) {
+					goto nomem;
+				}
+				PyDict_SetItem(ret, key, outer);
+				Py_DECREF(outer);
+			} else if (clean) {
+				PyDict_SetItem(ret, key, Py_None);
+			}
+			sneedle++;
+			oneedle++;
+		}
+		Py_DECREF(key);
+	}
+	Py_DECREF(emptyTup);
+	return ret;
+ nomem:
+	PyErr_NoMemory();
+	Py_XDECREF(ret);
+	Py_XDECREF(emptyTup);
+	return NULL;
+}
+
+static PyMethodDef lazymanifest_methods[] = {
+	{"copy", (PyCFunction)lazymanifest_copy, METH_NOARGS,
+	 "Make a copy of this lazymanifest."},
+	{"filtercopy", (PyCFunction)lazymanifest_filtercopy, METH_O,
+	 "Make a copy of this manifest filtered by matchfn."},
+	{"diff", (PyCFunction)lazymanifest_diff, METH_VARARGS,
+	 "Compare this lazymanifest to another one."},
+	{"text", (PyCFunction)lazymanifest_text, METH_NOARGS,
+	 "Encode this manifest to text."},
+	{NULL},
+};
+
+static PyTypeObject lazymanifestType = {
+	PyObject_HEAD_INIT(NULL)
+	0,                                                /* ob_size */
+	"parsers.lazymanifest",                           /* tp_name */
+	sizeof(lazymanifest),                             /* tp_basicsize */
+	0,                                                /* tp_itemsize */
+	(destructor)lazymanifest_dealloc,                 /* tp_dealloc */
+	0,                                                /* tp_print */
+	0,                                                /* tp_getattr */
+	0,                                                /* tp_setattr */
+	0,                                                /* tp_compare */
+	0,                                                /* tp_repr */
+	0,                                                /* tp_as_number */
+	&lazymanifest_seq_meths,                          /* tp_as_sequence */
+	&lazymanifest_mapping_methods,                    /* tp_as_mapping */
+	0,                                                /* tp_hash */
+	0,                                                /* tp_call */
+	0,                                                /* tp_str */
+	0,                                                /* tp_getattro */
+	0,                                                /* tp_setattro */
+	0,                                                /* tp_as_buffer */
+	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_SEQUENCE_IN, /* tp_flags */
+	"TODO(augie)",                                    /* tp_doc */
+	0,                                                /* tp_traverse */
+	0,                                                /* tp_clear */
+	0,                                                /* tp_richcompare */
+	0,                                             /* tp_weaklistoffset */
+	(getiterfunc)lazymanifest_getiter,                /* tp_iter */
+	0,                                                /* tp_iternext */
+	lazymanifest_methods,                             /* tp_methods */
+	0,                                                /* tp_members */
+	0,                                                /* tp_getset */
+	0,                                                /* tp_base */
+	0,                                                /* tp_dict */
+	0,                                                /* tp_descr_get */
+	0,                                                /* tp_descr_set */
+	0,                                                /* tp_dictoffset */
+	(initproc)lazymanifest_init,                      /* tp_init */
+	0,                                                /* tp_alloc */
+};
+
+void manifest_module_init(PyObject * mod)
+{
+	lazymanifestType.tp_new = PyType_GenericNew;
+	if (PyType_Ready(&lazymanifestType) < 0)
+		return;
+	Py_INCREF(&lazymanifestType);
+
+	PyModule_AddObject(mod, "lazymanifest",
+	                   (PyObject *)&lazymanifestType);
+}
diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -71,7 +71,7 @@  static inline int hexdigit(const char *p
 /*
  * Turn a hex-encoded string into binary.
  */
-static PyObject *unhexlify(const char *str, int len)
+PyObject *unhexlify(const char *str, int len)
 {
 	PyObject *ret;
 	char *d;
@@ -2233,6 +2233,7 @@  static PyMethodDef methods[] = {
 };
 
 void dirs_module_init(PyObject *mod);
+void manifest_module_init(PyObject *mod);
 
 static void module_init(PyObject *mod)
 {
@@ -2247,6 +2248,7 @@  static void module_init(PyObject *mod)
 	PyModule_AddStringConstant(mod, "versionerrortext", versionerrortext);
 
 	dirs_module_init(mod);
+	manifest_module_init(mod);
 
 	indexType.tp_new = PyType_GenericNew;
 	if (PyType_Ready(&indexType) < 0 ||
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -491,6 +491,7 @@  extmodules = [
     Extension('mercurial.mpatch', ['mercurial/mpatch.c'],
               depends=common_depends),
     Extension('mercurial.parsers', ['mercurial/dirs.c',
+                                    'mercurial/manifest.c',
                                     'mercurial/parsers.c',
                                     'mercurial/pathencode.c'],
               depends=common_depends),
diff --git a/tests/test-manifest.py b/tests/test-manifest.py
new file mode 100644
--- /dev/null
+++ b/tests/test-manifest.py
@@ -0,0 +1,209 @@ 
+import binascii
+import unittest
+import itertools
+
+import silenttestrunner
+
+from mercurial import parsers
+
+HASH_1 = '1' * 40
+HASH_2 = 'f' * 40
+HASH_3 = '1234567890abcdef0987654321deadbeef0fcafe'
+A_SHORT_MANIFEST = (
+    'bar/baz/qux.py\0%(hash2)s%(flag2)s\n'
+    'foo\0%(hash1)s%(flag1)s\n'
+    ) % {'hash1': HASH_1,
+         'flag1': '',
+         'hash2': HASH_2,
+         'flag2': 'l',
+         }
+
+HUGE_MANIFEST_ENTRIES = 200001
+
+A_HUGE_MANIFEST = ''.join(sorted(
+    'file%d\0%s%s\n' % (i, h, f) for i, h, f in
+    itertools.izip(xrange(200001),
+                   itertools.cycle((HASH_1, HASH_2)),
+                   itertools.cycle(('', 'x', 'l')))))
+
+class testmanifest(unittest.TestCase):
+    def testEmptyManifest(self):
+        m = parsers.lazymanifest('')
+        self.assertEqual(0, len(m))
+        self.assertEqual([], list(m))
+
+    def testManifest(self):
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        want = [
+            ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
+            ('foo', binascii.unhexlify(HASH_1), ''),
+            ]
+        self.assertEqual(len(want), len(m))
+        self.assertEqual(want, list(m))
+        self.assertEqual((binascii.unhexlify(HASH_1), ''), m['foo'])
+        self.assertRaises(KeyError, lambda : m['wat'])
+        self.assertEqual((binascii.unhexlify(HASH_2), 'l'),
+                         m['bar/baz/qux.py'])
+
+    def testSetItem(self):
+        want = binascii.unhexlify(HASH_1), ''
+
+        m = parsers.lazymanifest('')
+        m['a'] = want
+        self.assertIn('a', m)
+        self.assertEqual(want, m['a'])
+        self.assertEqual('a\0' + HASH_1 + '\n', m.text())
+
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m['a'] = want
+        self.assertEqual(want, m['a'])
+        self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST,
+                         m.text())
+        m2 = m.copy()
+        del m
+        del m2 # make sure we don't double free() anything
+
+    def testCompaction(self):
+        unhex = binascii.unhexlify
+        h1, h2 = unhex(HASH_1), unhex(HASH_2)
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m['alpha'] = h1, ''
+        m['beta'] = h2, ''
+        del m['foo']
+        want = 'alpha\0%s\nbar/baz/qux.py\0%sl\nbeta\0%s\n' % (
+            HASH_1, HASH_2, HASH_2)
+        self.assertEqual(want, m.text())
+        self.assertEqual(3, len(m))
+        self.assertEqual((h1, ''), m['alpha'])
+        self.assertEqual((h2, ''), m['beta'])
+        self.assertRaises(KeyError, lambda : m['foo'])
+        w = [('alpha', h1, ''), ('bar/baz/qux.py', h2, 'l'), ('beta', h2, '')]
+        self.assertEqual(w, list(m))
+
+    def testSetGetNodeSuffix(self):
+        clean = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        h, f = m['foo']
+        want = h + 'a', f
+        # Merge code wants to set 21-byte fake hashes at times
+        m['foo'] = want
+        self.assertEqual(want, m['foo'])
+        self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
+                          ('foo', binascii.unhexlify(HASH_1) + 'a', '')],
+                         list(m))
+        # Sometimes it even tries a 22-byte fake hash, but we can
+        # return 21 and it'll work out
+        m['foo'] = want[0] + '+', f
+        self.assertEqual(want, m['foo'])
+        # make sure the suffix survives a copy
+        m2 = m.filtercopy(lambda x: x == 'foo')
+        self.assertEqual(want, m2['foo'])
+        m2 = m.copy()
+        self.assertEqual(want, m2['foo'])
+        # suffix with iteration
+        self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
+                          ('foo', want[0], '')], list(m))
+        # shows up in diff
+        self.assertEqual({'foo': (want, (h, ''))}, m.diff(clean))
+        self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m))
+
+    def testFilterCopyException(self):
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        def filt(path):
+            if path == 'foo':
+                assert False
+            return True
+        self.assertRaises(AssertionError, m.filtercopy, filt)
+
+    def testRemoveItem(self):
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        del m['foo']
+        self.assertRaises(KeyError, lambda : m['foo'])
+        self.assertEqual(1, len(m))
+        self.assertEqual(1, len(list(m)))
+
+    def testManifestDiff(self):
+        MISSING = (None, '')
+        addl = 'z-only-in-left\0' + HASH_1 + '\n'
+        addr = 'z-only-in-right\0' + HASH_2 + 'x\n'
+        left = parsers.lazymanifest(
+            A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl)
+        right = parsers.lazymanifest(A_SHORT_MANIFEST + addr)
+        want = {
+            'foo': ((binascii.unhexlify(HASH_3), 'x'),
+                    (binascii.unhexlify(HASH_1), '')),
+            'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
+            'z-only-in-right': (MISSING, (binascii.unhexlify(HASH_2), 'x')),
+            }
+        self.assertEqual(want, left.diff(right))
+
+        want = {
+            'bar/baz/qux.py': (MISSING, (binascii.unhexlify(HASH_2), 'l')),
+            'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')),
+            'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')),
+            }
+        self.assertEqual(want, parsers.lazymanifest('').diff(left))
+
+        want = {
+            'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'), MISSING),
+            'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING),
+            'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
+            }
+        self.assertEqual(want, left.diff(parsers.lazymanifest('')))
+        copy = right.copy()
+        del copy['z-only-in-right']
+        del right['foo']
+        want = {
+            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
+            'z-only-in-right': ((binascii.unhexlify(HASH_2), 'x'), MISSING),
+            }
+        self.assertEqual(want, right.diff(copy))
+
+        short = parsers.lazymanifest(A_SHORT_MANIFEST)
+        pruned = short.copy()
+        del pruned['foo']
+        want = {
+            'foo': ((binascii.unhexlify(HASH_1), ''), MISSING),
+            }
+        self.assertEqual(want, short.diff(pruned))
+        want = {
+            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
+            }
+        self.assertEqual(want, pruned.diff(short))
+        want = {
+            'bar/baz/qux.py': None,
+            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
+            }
+        self.assertEqual(want, pruned.diff(short, True))
+
+    def testReversedLines(self):
+        backwards = ''.join(
+            l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if l)
+        try:
+            parsers.lazymanifest(backwards)
+            self.fail('Should have raised ValueError')
+        except ValueError, v:
+            self.assertEqual('Manifest lines not in sorted order.', v.message)
+
+    def testNoTerminalNewline(self):
+        try:
+            parsers.lazymanifest(A_SHORT_MANIFEST + 'wat')
+            self.fail('Should have raised ValueError')
+        except ValueError, v:
+            self.assertEqual('Manifest did not end in a newline.', v.message)
+
+    def testNoNewLineAtAll(self):
+        try:
+            parsers.lazymanifest('wat')
+            self.fail('Should have raised ValueError')
+        except ValueError, v:
+            self.assertEqual('Manifest did not end in a newline.', v.message)
+
+    def testHugeManifest(self):
+        m = parsers.lazymanifest(A_HUGE_MANIFEST)
+        self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m))
+        self.assertEqual(len(m), len(list(m)))
+
+
+if __name__ == '__main__':
+    silenttestrunner.main(__name__)