Patchwork [1,of,4,v4,lazy-manifest] manifest.c: new extension code to lazily parse manifests

login
register
mail settings
Submitter Augie Fackler
Date March 3, 2015, 11:05 p.m.
Message ID <575515f3cdc7bc8cb826.1425423934@198.17.16.172.in-addr.arpa>
Download mbox | patch
Permalink /patch/7890/
State Superseded
Headers show

Comments

Augie Fackler - March 3, 2015, 11:05 p.m.
# HG changeset patch
# User Augie Fackler <augie@google.com>
# Date 1421188298 28800
#      Tue Jan 13 14:31:38 2015 -0800
# Node ID 575515f3cdc7bc8cb826ea17fa365d1f359ab88f
# Parent  390410a6545d9088dc84392009f51500e3935a1c
manifest.c: new extension code to lazily parse manifests

This lets us iterate manifests in order, but do a _lot_ less work in
the common case when we only care about a few manifest entries.

Many thanks to Mike Edgar for reviewing this in advance of it going
out to the list, which caught many things I missed.

This version of the patch includes C89 fixes from Sean Farley and
many correctness/efficiency cleanups from Martin von
Zweigbergk. Thanks to both!
Martin von Zweigbergk - March 4, 2015, 7:13 a.m.
Besides the comments below, I assume this hasn't changed much, so it still
looks good to me.

On Tue, Mar 3, 2015 at 3:08 PM Augie Fackler <raf@durin42.com> wrote:

> # HG changeset patch
> # User Augie Fackler <augie@google.com>
> # Date 1421188298 28800
> #      Tue Jan 13 14:31:38 2015 -0800
> # Node ID 575515f3cdc7bc8cb826ea17fa365d1f359ab88f
> # Parent  390410a6545d9088dc84392009f51500e3935a1c
> manifest.c: new extension code to lazily parse manifests
>
> This lets us iterate manifests in order, but do a _lot_ less work in
> the common case when we only care about a few manifest entries.
>
> Many thanks to Mike Edgar for reviewing this in advance of it going
> out to the list, which caught many things I missed.
>
> This version of the patch includes C89 fixes from Sean Farley and
> many correctness/efficiency cleanups from Martin von
> Zweigbergk. Thanks to both!
>
> diff --git a/mercurial/manifest.c b/mercurial/manifest.c
> new file mode 100644
> --- /dev/null
> +++ b/mercurial/manifest.c
> @@ -0,0 +1,841 @@
> +/*
> + * manifest.c - manifest type that does on-demand parsing.
> + *
> + * Copyright 2015, Google Inc.
> + *
> + * This software may be used and distributed according to the terms of
> + * the GNU General Public License, incorporated herein by reference.
> + */
> +#include <assert.h>
> +#include <string.h>
> +#include <stdlib.h>
> +
> +#include <Python.h>
> +
> +/* VC9 doesn't include bool and lacks stdbool.h based on my searching */
> +#ifdef _MSC_VER
> +#define true 1
> +#define false 0
> +typedef unsigned char bool;
> +#else
> +#include <stdbool.h>
> +#endif
> +
> +#define DEFAULT_LINES 100000
> +
> +typedef struct {
> +       char *start;
> +       Py_ssize_t len; /* length of line including terminal newline */
> +       char hash_suffix;
> +       bool from_malloc;
> +       bool deleted;
> +} line;
> +
> +typedef struct {
> +       PyObject_HEAD
> +       PyObject *pydata;
> +       line *lines;
> +       int numlines; /* number of line entries */
> +       int livelines; /* number of non-deleted lines */
> +       int maxlines; /* allocated number of lines */
> +       bool dirty;
> +} lazymanifest;
> +
> +#define MANIFEST_OOM -1
> +#define MANIFEST_NOT_SORTED -2
> +#define MANIFEST_MALFORMED -3
>

I think I suggested on an earlier version that these could be replaced by
only -1 and calls to PyErr_* before -1 is returned instead of having the
caller (of find_lines()) call PyErr_*.


> +
> +/* defined in parsers.c */
> +PyObject *unhexlify(const char *str, int len);
> +
> +/* get the length of the path for a line */
> +static size_t pathlen(line *l) {
> +       return strlen(l->start);
> +}
> +
> +/* get the node value of a single line */
> +static PyObject *nodeof(line *l) {
> +       char *s = l->start;
> +       ssize_t llen = pathlen(l);
> +       PyObject *hash = unhexlify(s + llen + 1, 40);
> +       if (!hash) {
> +               return NULL;
> +       }
> +       if (l->hash_suffix != '\0') {
> +               char newhash[21];
> +               memcpy(newhash, PyString_AsString(hash), 20);
> +               Py_DECREF(hash);
> +               newhash[20] = l->hash_suffix;
> +               hash = PyString_FromStringAndSize(newhash, 21);
> +       }
> +       return hash;
> +}
> +
> +/* get the node hash and flags of a line as a tuple */
> +static PyObject *hashflags(line *l)
> +{
> +       char *s = l->start;
> +       size_t plen = pathlen(l);
> +       PyObject *hash = nodeof(l);
> +
> +       /* 40 for hash, 1 for null byte, 1 for newline */
> +       size_t hplen = plen + 42;
> +       Py_ssize_t flen = l->len - hplen;
> +       PyObject *flags;
> +       PyObject *tup;
> +
> +       if (!hash)
> +               return NULL;
> +       flags = PyString_FromStringAndSize(s + hplen - 1, flen);
> +       if (!flags) {
> +               Py_DECREF(hash);
> +               return NULL;
> +       }
> +       tup = PyTuple_Pack(2, hash, flags);
> +       Py_DECREF(flags);
> +       Py_DECREF(hash);
> +       return tup;
> +}
> +
> +/* if we're about to run out of space in the line index, add more */
> +static bool realloc_if_full(lazymanifest *self)
> +{
> +       if (self->numlines == self->maxlines) {
> +               self->maxlines *= 2;
> +               self->lines = realloc(self->lines, self->maxlines *
> sizeof(line));
> +       }
> +       return self->lines;
> +}
> +
> +/*
> + * Find the line boundaries in the manifest that 'data' points to and
> store
> + * information about each line in 'self'.
> + */
> +static int find_lines(lazymanifest *self, char *data, Py_ssize_t len)
> +{
> +       char *prev = NULL;
> +       while (len > 0) {
> +               line *l;
> +               char *next = memchr(data, '\n', len);
> +               if (!next) {
> +                       return MANIFEST_MALFORMED;
> +               }
> +               next++; /* advance past newline */
> +               if (!realloc_if_full(self)) {
> +                       return MANIFEST_OOM; /* no memory */
> +               }
> +               if (prev && strcmp(prev, data) > -1) {
> +                       /* This data isn't sorted, so we have to abort. */
> +                       return MANIFEST_NOT_SORTED;
> +               }
> +               l = self->lines + ((self->numlines)++);
> +               l->start = data;
> +               l->len = next - data;
> +               l->hash_suffix = '\0';
> +               l->from_malloc = false;
> +               l->deleted = false;
> +               len = len - l->len;
> +               prev = data;
> +               data = next;
> +       }
> +       self->livelines = self->numlines;
> +       return 0;
> +}
> +
> +static int lazymanifest_init(lazymanifest *self, PyObject *args)
> +{
> +       char *data;
> +       Py_ssize_t len;
> +       int err, ret;
> +       PyObject *pydata;
> +       if (!PyArg_ParseTuple(args, "S", &pydata)) {
> +               return -1;
> +       }
> +       err = PyString_AsStringAndSize(pydata, &data, &len);
> +
> +       self->dirty = false;
> +       if (err == -1)
> +               return -1;
> +       self->pydata = pydata;
> +       Py_INCREF(self->pydata);
> +       Py_BEGIN_ALLOW_THREADS
> +       self->lines = malloc(DEFAULT_LINES * sizeof(line));
> +       self->maxlines = DEFAULT_LINES;
> +       self->numlines = 0;
> +       if (!self->lines)
> +               ret = -2;
>

-2 means "not sorted", no? Shouldn't this be OOM? Call PyErr_NoMemory()
here if you agree with my suggestion above.


> +       else
> +               ret = find_lines(self, data, len);
> +       Py_END_ALLOW_THREADS
> +       switch (ret) {
> +       case 0:
> +               break;
> +       case MANIFEST_OOM:
> +               PyErr_NoMemory();
> +               break;
> +       case MANIFEST_NOT_SORTED:
> +               PyErr_Format(PyExc_ValueError,
> +                            "Manifest lines not in sorted order.");
> +               break;
> +       case MANIFEST_MALFORMED:
> +               PyErr_Format(PyExc_ValueError,
> +                            "Manifest did not end in a newline.");
> +               break;
> +       default:
> +               PyErr_Format(PyExc_ValueError,
> +                            "Unknown problem parsing manifest.");
> +       }
> +       return ret == 0 ? 0 : -1;
> +}
> +
> +static void lazymanifest_dealloc(lazymanifest *self)
> +{
> +       /* free any extra lines we had to allocate */
> +       int i;
> +       for (i = 0; i < self->numlines; i++) {
> +               if (self->lines[i].from_malloc) {
> +                       free(self->lines[i].start);
> +               }
> +       }
> +       if (self->lines) {
> +               free(self->lines);
> +               self->lines = NULL;
> +       }
> +       if (self->pydata) {
> +               Py_DECREF(self->pydata);
> +               self->pydata = NULL;
> +       }
> +       PyObject_Del(self);
> +}
> +
> +/* iteration support */
> +
> +typedef struct {
> +       PyObject_HEAD lazymanifest *m;
> +       Py_ssize_t pos;
> +} lmIter;
> +
> +static void lmiter_dealloc(PyObject *o)
> +{
> +       lmIter *self = (lmIter *)o;
> +       Py_DECREF(self->m);
> +       PyObject_Del(self);
> +}
> +
> +static PyObject *lmiter_iternext(PyObject *o)
> +{
> +       size_t pl;
> +       line *l;
> +       Py_ssize_t consumed;
> +       PyObject *ret = NULL, *path = NULL, *hash = NULL, *flags = NULL;
> +       lmIter *self = (lmIter *)o;
> +       do {
> +               self->pos++;
> +               if (self->pos >= self->m->numlines) {
> +                       goto bail;
> +               }
> +               /* skip over deleted manifest entries */
> +       } while (self->m->lines[self->pos].deleted);
> +       l = self->m->lines + self->pos;
> +       pl = pathlen(l);
> +       path = PyString_FromStringAndSize(l->start, pl);
> +       hash = nodeof(l);
> +       consumed = pl + 41;
> +       flags = PyString_FromStringAndSize(l->start + consumed,
> +
> l->len - consumed - 1);
> +       if (!flags) {
> +               goto bail;
> +       }
> +       ret = PyTuple_Pack(3, path, hash, flags);
> + bail:
> +       Py_XDECREF(path);
> +       Py_XDECREF(hash);
> +       Py_XDECREF(flags);
> +       return ret;
> +}
> +
> +static PyTypeObject lazymanifestIterator = {
> +       PyObject_HEAD_INIT(NULL)
> +       0,                               /*ob_size */
> +       "parsers.lazymanifest.iterator", /*tp_name */
> +       sizeof(lmIter),                  /*tp_basicsize */
> +       0,                               /*tp_itemsize */
> +       lmiter_dealloc,                  /*tp_dealloc */
> +       0,                               /*tp_print */
> +       0,                               /*tp_getattr */
> +       0,                               /*tp_setattr */
> +       0,                               /*tp_compare */
> +       0,                               /*tp_repr */
> +       0,                               /*tp_as_number */
> +       0,                               /*tp_as_sequence */
> +       0,                               /*tp_as_mapping */
> +       0,                               /*tp_hash */
> +       0,                               /*tp_call */
> +       0,                               /*tp_str */
> +       0,                               /*tp_getattro */
> +       0,                               /*tp_setattro */
> +       0,                               /*tp_as_buffer */
> +       /* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to
> +          use tp_iter and tp_iternext fields. */
> +       Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER,
> +       "Iterator for a lazymanifest.",  /* tp_doc */
> +       0,                               /* tp_traverse */
> +       0,                               /* tp_clear */
> +       0,                               /* tp_richcompare */
> +       0,                               /* tp_weaklistoffset */
> +       PyObject_SelfIter,               /* tp_iter: __iter__() method */
> +       lmiter_iternext,                 /* tp_iternext: next() method */
> +};
> +
> +static lazymanifest *lazymanifest_copy(lazymanifest *self);
> +
> +static PyObject *lazymanifest_getiter(lazymanifest *self)
> +{
> +       lmIter *i = NULL;
> +       lazymanifest *t = lazymanifest_copy(self);
> +       if (!t) {
> +               PyErr_NoMemory();
> +               return NULL;
> +       }
> +       i = PyObject_New(lmIter, &lazymanifestIterator);
> +       if (i) {
> +               i->m = t;
> +               i->pos = -1;
> +       } else {
> +               Py_DECREF(t);
> +               PyErr_NoMemory();
> +       }
> +       return (PyObject *)i;
> +}
> +
> +/* __getitem__ and __setitem__ support */
> +
> +static Py_ssize_t lazymanifest_size(lazymanifest *self)
> +{
> +       return self->livelines;
> +}
> +
> +static int linecmp(const void *left, const void *right)
> +{
> +       return strcmp(((const line *)left)->start,
> +                     ((const line *)right)->start);
> +}
> +
> +static PyObject *lazymanifest_getitem(lazymanifest *self, PyObject *key)
> +{
> +       line needle;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "getitem: manifest keys must be a string.");
> +               return NULL;
> +       }
> +       needle.start = PyString_AsString(key);
> +       hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
> +                     &linecmp);
> +       if (!hit || hit->deleted) {
> +               PyErr_Format(PyExc_KeyError, "No such manifest entry.");
> +               return NULL;
> +       }
> +       return hashflags(hit);
> +}
> +
> +static int lazymanifest_delitem(lazymanifest *self, PyObject *key)
> +{
> +       line needle;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "delitem: manifest keys must be a string.");
> +               return -1;
> +       }
> +       needle.start = PyString_AsString(key);
> +       hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
> +                     &linecmp);
> +       if (!hit || hit->deleted) {
> +               PyErr_Format(PyExc_KeyError,
> +                            "Tried to delete nonexistent manifest
> entry.");
> +               return -1;
> +       }
> +       self->dirty = true;
> +       hit->deleted = true;
> +       self->livelines--;
> +       return 0;
> +}
> +
> +static int lazymanifest_setitem(
> +       lazymanifest *self, PyObject *key, PyObject *value)
> +{
> +       char *path;
> +       Py_ssize_t plen;
> +       PyObject *pyhash;
> +       Py_ssize_t hlen;
> +       char *hash;
> +       PyObject *pyflags;
> +       char *flags;
> +       Py_ssize_t flen;
> +       size_t dlen;
> +       char *dest;
> +       int i;
> +       line new;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "setitem: manifest keys must be a string.");
> +               return -1;
> +       }
> +       if (!value) {
> +               return lazymanifest_delitem(self, key);
> +       }
> +       if (!PyTuple_Check(value) || PyTuple_Size(value) != 2) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "Manifest values must be a tuple of (node,
> flags).");
> +               return -1;
> +       }
> +       if (PyString_AsStringAndSize(key, &path, &plen) == -1) {
> +               return -1;
> +       }
> +
> +       pyhash = PyTuple_GetItem(value, 0);
> +       if (!PyString_Check(pyhash)) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "node must be a 20-byte string");
> +               return -1;
> +       }
> +       hlen = PyString_Size(pyhash);
> +       /* Some parts of the codebase try and set 21 or 22
> +        * byte "hash" values in order to perturb things for
> +        * status. We have to preserve at least the 21st
> +        * byte. Sigh. If there's a 22nd byte, we drop it on
> +        * the floor, which works fine.
> +        */
> +       if (hlen != 20 && hlen != 21 && hlen != 22) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "node must be a 20-byte string");
> +               return -1;
> +       }
> +       hash = PyString_AsString(pyhash);
> +
> +       pyflags = PyTuple_GetItem(value, 1);
> +       if (!PyString_Check(pyflags) || PyString_Size(pyflags) > 1) {
> +               PyErr_Format(PyExc_TypeError,
> +                            "flags must a 0 or 1 byte string");
> +               return -1;
> +       }
> +       if (PyString_AsStringAndSize(pyflags, &flags, &flen) == -1) {
> +               return -1;
> +       }
> +       /* one null byte and one newline */
> +       dlen = plen + 41 + flen + 1;
> +       dest = malloc(dlen);
> +       if (!dest) {
> +               PyErr_NoMemory();
> +               return -1;
> +       }
> +       memcpy(dest, path, plen + 1);
> +       for (i = 0; i < 20; i++) {
> +               sprintf(dest + plen + 1 + (i * 2), "%02hhx", hash[i]);
> +       }
> +       memcpy(dest + plen + 41, flags, flen);
> +       dest[plen + 41 + flen] = '\n';
> +       new.start = dest;
> +       new.len = dlen;
> +       new.hash_suffix = '\0';
> +       if (hlen > 20) {
> +               new.hash_suffix = hash[20];
> +       }
> +       new.from_malloc = true;     /* is `start` a pointer we allocated?
> */
> +       new.deleted = false;        /* is this entry deleted? */
> +       hit = bsearch(&new, self->lines, self->numlines,
> +                     sizeof(line), &linecmp);
> +       self->dirty = true;
> +       if (hit) {
> +               /* updating a line we already had */
> +               if (hit->from_malloc) {
> +                       free(hit->start);
> +               }
> +               if (hit->deleted) {
> +                       self->livelines++;
> +               }
> +               *hit = new;
> +       } else {
> +               /* new line */
> +               if (!realloc_if_full(self)) {
> +                       PyErr_NoMemory();
> +                       return -1;
> +               }
> +               self->lines[self->numlines++] = new;
> +               self->livelines++;
> +               /* TODO: do a binary search and insert rather than
> +                * append and qsort. Also offer a batch-insert
> +                * interface, which should be a nice little
> +                * performance win.
> +                */
> +               qsort(self->lines, self->numlines, sizeof(line), &linecmp);
> +       }
> +       return 0;
> +}
> +
> +static PyMappingMethods lazymanifest_mapping_methods = {
> +       (lenfunc)lazymanifest_size,             /* mp_length */
> +       (binaryfunc)lazymanifest_getitem,       /* mp_subscript */
> +       (objobjargproc)lazymanifest_setitem,    /* mp_ass_subscript */
> +};
> +
> +/* sequence methods (important or __contains__ builds an iterator */
> +
> +static int lazymanifest_contains(lazymanifest *self, PyObject *key)
> +{
> +       line needle;
> +       line *hit;
> +       if (!PyString_Check(key)) {
> +               /* Our keys are always strings, so if the contains
> +                * check is for a non-string, just return false. */
> +               return 0;
> +       }
> +       needle.start = PyString_AsString(key);
> +       hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
> +                     &linecmp);
> +       if (!hit || hit->deleted) {
> +               return 0;
> +       }
> +       return 1;
> +}
> +
> +static PySequenceMethods lazymanifest_seq_meths = {
> +       (lenfunc)lazymanifest_size, /* sq_length */
> +       0, /* sq_concat */
> +       0, /* sq_repeat */
> +       0, /* sq_item */
> +       0, /* sq_slice */
> +       0, /* sq_ass_item */
> +       0, /* sq_ass_slice */
> +       (objobjproc)lazymanifest_contains, /* sq_contains */
> +       0, /* sq_inplace_concat */
> +       0, /* sq_inplace_repeat */
> +};
> +
> +
> +/* Other methods (copy, diff, etc) */
> +static PyTypeObject lazymanifestType;
> +
> +/* If the manifest has changes, build the new manifest text and reindex
> it. */
> +static int compact(lazymanifest *self) {
> +       int i;
> +       ssize_t need = 0;
> +       char *data;
> +       line *src, *dst;
> +       PyObject *pydata;
> +       if (!self->dirty)
> +               return 0;
> +       for (i = 0; i < self->numlines; i++) {
> +               if (!self->lines[i].deleted) {
> +                       need += self->lines[i].len;
> +               }
> +       }
> +       pydata = PyString_FromStringAndSize(NULL, need);
> +       if (!pydata)
> +               return -1;
> +       data = PyString_AsString(pydata);
> +       if (!data) {
> +               return -1;
> +       }
> +       src = self->lines;
> +       dst = self->lines;
> +       for (i = 0; i < self->numlines; i++, src++) {
> +               char *tofree = NULL;
> +               if (src->from_malloc) {
> +                       tofree = src->start;
> +               }
> +               if (!src->deleted) {
> +                       memcpy(data, src->start, src->len);
> +                       *dst = *src;
> +                       dst->start = data;
> +                       dst->from_malloc = false;
> +                       data += dst->len;
> +                       dst++;
> +               }
> +               free(tofree);
> +       }
> +       Py_DECREF(self->pydata);
> +       self->pydata = pydata;
> +       self->numlines = self->livelines;
> +       self->dirty = false;
> +       return 0;
> +}
> +
> +static PyObject *lazymanifest_text(lazymanifest *self)
> +{
> +       if (compact(self) != 0) {
> +               PyErr_NoMemory();
> +               return NULL;
> +       }
> +       Py_INCREF(self->pydata);
> +       return self->pydata;
> +}
> +
> +static lazymanifest *lazymanifest_copy(lazymanifest *self)
> +{
> +       lazymanifest *copy = NULL;
> +       if (compact(self) != 0) {
> +               goto nomem;
> +       }
> +       copy = PyObject_New(lazymanifest, &lazymanifestType);
> +       if (!copy) {
> +               goto nomem;
> +       }
> +       copy->numlines = self->numlines;
> +       copy->livelines = self->livelines;
> +       copy->dirty = false;
> +       copy->lines = malloc(self->maxlines *sizeof(line));
> +       if (!copy->lines) {
> +               goto nomem;
> +       }
> +       memcpy(copy->lines, self->lines, self->numlines * sizeof(line));
> +       copy->maxlines = self->maxlines;
> +       copy->pydata = self->pydata;
> +       Py_INCREF(copy->pydata);
> +       return copy;
> + nomem:
> +       PyErr_NoMemory();
> +       Py_XDECREF(copy);
> +       return NULL;
> +}
> +
> +static lazymanifest *lazymanifest_filtercopy(
> +       lazymanifest *self, PyObject *matchfn)
> +{
> +       lazymanifest *copy = NULL;
> +       int i;
> +       if (!PyCallable_Check(matchfn)) {
> +               PyErr_SetString(PyExc_TypeError, "matchfn must be
> callable");
> +               return NULL;
> +       }
> +       /* compact ourselves first to avoid double-frees later when we
> +        * compact tmp so that it doesn't have random pointers to our
> +        * underlying from_malloc-data (self->pydata is safe) */
> +       if (compact(self) != 0) {
> +               goto nomem;
> +       }
> +       copy = PyObject_New(lazymanifest, &lazymanifestType);
> +       copy->dirty = true;
> +       copy->lines = malloc(self->maxlines * sizeof(line));
> +       if (!copy->lines) {
> +               goto nomem;
> +       }
> +       copy->maxlines = self->maxlines;
> +       copy->numlines = 0;
> +       copy->pydata = self->pydata;
> +       Py_INCREF(self->pydata);
> +       for (i = 0; i < self->numlines; i++) {
> +               PyObject *arg = PyString_FromString(self->lines[i].start);
> +               PyObject *arglist = PyTuple_Pack(1, arg);
> +               PyObject *result = PyObject_CallObject(matchfn, arglist);
> +               Py_DECREF(arglist);
> +               Py_DECREF(arg);
> +               /* if the callback raised an exception, just let it
> +                * through and give up */
> +               if (!result) {
> +                       free(copy->lines);
> +                       Py_DECREF(self->pydata);
> +                       return NULL;
> +               }
> +               if (PyObject_IsTrue(result)) {
> +                       assert(!(self->lines[i].from_malloc));
> +                       copy->lines[copy->numlines++] = self->lines[i];
> +               }
> +               Py_DECREF(result);
> +       }
> +       copy->livelines = copy->numlines;
> +       return copy;
> + nomem:
> +       PyErr_NoMemory();
> +       Py_XDECREF(copy);
> +       return NULL;
> +}
> +
> +static PyObject *lazymanifest_diff(lazymanifest *self, PyObject *args)
> +{
> +       lazymanifest *other;
> +       PyObject *pyclean = NULL;
> +       bool listclean;
> +       PyObject *emptyTup = NULL, *ret = NULL;
> +       PyObject *es;
> +       int sneedle = 0, oneedle = 0;
> +       if (!PyArg_ParseTuple(args, "O!|O", &lazymanifestType, &other,
> &pyclean)) {
> +               return NULL;
> +       }
> +       listclean = (!pyclean) ? false : PyObject_IsTrue(pyclean);
> +       es = PyString_FromString("");
> +       if (!es) {
> +               goto nomem;
> +       }
> +       emptyTup = PyTuple_Pack(2, Py_None, es);
> +       Py_DECREF(es);
> +       if (!emptyTup) {
> +               goto nomem;
> +       }
> +       ret = PyDict_New();
> +       if (!ret) {
> +               goto nomem;
> +       }
> +       while (sneedle != self->numlines || oneedle != other->numlines) {
> +               line *left = self->lines + sneedle;
> +               line *right = other->lines + oneedle;
> +               int result;
> +               PyObject *key;
> +               PyObject *outer;
> +               /* If we're looking at a deleted entry and it's not
> +                * the end of the manifest, just skip it. */
> +               if (left->deleted && sneedle < self->numlines) {
> +                       sneedle++;
> +                       continue;
> +               }
> +               if (right->deleted && oneedle < other->numlines) {
> +                       oneedle++;
> +                       continue;
> +               }
> +               /* if we're at the end of either manifest, then we
> +                * know the remaining items are adds so we can skip
> +                * the strcmp. */
> +               if (sneedle == self->numlines) {
> +                       result = 1;
> +               } else if (oneedle == other->numlines) {
> +                       result = -1;
> +               } else {
> +                       result = linecmp(left, right);
> +               }
> +               key = result <= 0 ?
> +                       PyString_FromString(left->start) :
> +                       PyString_FromString(right->start);
> +               if (!key)
> +                       goto nomem;
> +               if (result < 0) {
> +                       PyObject *l = hashflags(left);
> +                       if (!l) {
> +                               goto nomem;
> +                       }
> +                       outer = PyTuple_Pack(2, l, emptyTup);
> +                       Py_DECREF(l);
> +                       if (!outer) {
> +                               goto nomem;
> +                       }
> +                       PyDict_SetItem(ret, key, outer);
> +                       Py_DECREF(outer);
> +                       sneedle++;
> +               } else if (result > 0) {
> +                       PyObject *r = hashflags(right);
> +                       if (!r) {
> +                               goto nomem;
> +                       }
> +                       outer = PyTuple_Pack(2, emptyTup, r);
> +                       Py_DECREF(r);
> +                       if (!outer) {
> +                               goto nomem;
> +                       }
> +                       PyDict_SetItem(ret, key, outer);
> +                       Py_DECREF(outer);
> +                       oneedle++;
> +               } else {
> +                       /* file exists in both manifests */
> +                       if (left->len != right->len
> +                           || memcmp(left->start, right->start, left->len)
> +                           || left->hash_suffix != right->hash_suffix) {
> +                               PyObject *l = hashflags(left);
> +                               PyObject *r;
> +                               if (!l) {
> +                                       goto nomem;
> +                               }
> +                               r = hashflags(right);
> +                               if (!r) {
> +                                       Py_DECREF(l);
> +                                       goto nomem;
> +                               }
> +                               outer = PyTuple_Pack(2, l, r);
> +                               Py_DECREF(l);
> +                               Py_DECREF(r);
> +                               if (!outer) {
> +                                       goto nomem;
> +                               }
> +                               PyDict_SetItem(ret, key, outer);
> +                               Py_DECREF(outer);
> +                       } else if (listclean) {
> +                               PyDict_SetItem(ret, key, Py_None);
> +                       }
> +                       sneedle++;
> +                       oneedle++;
> +               }
> +               Py_DECREF(key);
> +       }
> +       Py_DECREF(emptyTup);
> +       return ret;
> + nomem:
> +       PyErr_NoMemory();
> +       Py_XDECREF(ret);
> +       Py_XDECREF(emptyTup);
> +       return NULL;
> +}
> +
> +static PyMethodDef lazymanifest_methods[] = {
> +       {"copy", (PyCFunction)lazymanifest_copy, METH_NOARGS,
> +        "Make a copy of this lazymanifest."},
> +       {"filtercopy", (PyCFunction)lazymanifest_filtercopy, METH_O,
> +        "Make a copy of this manifest filtered by matchfn."},
> +       {"diff", (PyCFunction)lazymanifest_diff, METH_VARARGS,
> +        "Compare this lazymanifest to another one."},
> +       {"text", (PyCFunction)lazymanifest_text, METH_NOARGS,
> +        "Encode this manifest to text."},
> +       {NULL},
> +};
> +
> +static PyTypeObject lazymanifestType = {
> +       PyObject_HEAD_INIT(NULL)
> +       0,                                                /* ob_size */
> +       "parsers.lazymanifest",                           /* tp_name */
> +       sizeof(lazymanifest),                             /* tp_basicsize
> */
> +       0,                                                /* tp_itemsize */
> +       (destructor)lazymanifest_dealloc,                 /* tp_dealloc */
> +       0,                                                /* tp_print */
> +       0,                                                /* tp_getattr */
> +       0,                                                /* tp_setattr */
> +       0,                                                /* tp_compare */
> +       0,                                                /* tp_repr */
> +       0,                                                /* tp_as_number
> */
> +       &lazymanifest_seq_meths,                          /*
> tp_as_sequence */
> +       &lazymanifest_mapping_methods,                    /* tp_as_mapping
> */
> +       0,                                                /* tp_hash */
> +       0,                                                /* tp_call */
> +       0,                                                /* tp_str */
> +       0,                                                /* tp_getattro */
> +       0,                                                /* tp_setattro */
> +       0,                                                /* tp_as_buffer
> */
> +       Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_SEQUENCE_IN, /* tp_flags */
> +       "TODO(augie)",                                    /* tp_doc */
> +       0,                                                /* tp_traverse */
> +       0,                                                /* tp_clear */
> +       0,                                                /*
> tp_richcompare */
> +       0,                                             /*
> tp_weaklistoffset */
> +       (getiterfunc)lazymanifest_getiter,                /* tp_iter */
> +       0,                                                /* tp_iternext */
> +       lazymanifest_methods,                             /* tp_methods */
> +       0,                                                /* tp_members */
> +       0,                                                /* tp_getset */
> +       0,                                                /* tp_base */
> +       0,                                                /* tp_dict */
> +       0,                                                /* tp_descr_get
> */
> +       0,                                                /* tp_descr_set
> */
> +       0,                                                /* tp_dictoffset
> */
> +       (initproc)lazymanifest_init,                      /* tp_init */
> +       0,                                                /* tp_alloc */
> +};
> +
> +void manifest_module_init(PyObject * mod)
> +{
> +       lazymanifestType.tp_new = PyType_GenericNew;
> +       if (PyType_Ready(&lazymanifestType) < 0)
> +               return;
> +       Py_INCREF(&lazymanifestType);
> +
> +       PyModule_AddObject(mod, "lazymanifest",
> +                          (PyObject *)&lazymanifestType);
> +}
> diff --git a/mercurial/parsers.c b/mercurial/parsers.c
> --- a/mercurial/parsers.c
> +++ b/mercurial/parsers.c
> @@ -71,7 +71,7 @@ static inline int hexdigit(const char *p
>  /*
>   * Turn a hex-encoded string into binary.
>   */
> -static PyObject *unhexlify(const char *str, int len)
> +PyObject *unhexlify(const char *str, int len)
>  {
>         PyObject *ret;
>         char *d;
> @@ -2318,6 +2318,7 @@ static PyMethodDef methods[] = {
>  };
>
>  void dirs_module_init(PyObject *mod);
> +void manifest_module_init(PyObject *mod);
>
>  static void module_init(PyObject *mod)
>  {
> @@ -2332,6 +2333,7 @@ static void module_init(PyObject *mod)
>         PyModule_AddStringConstant(mod, "versionerrortext",
> versionerrortext);
>
>         dirs_module_init(mod);
> +       manifest_module_init(mod);
>
>         indexType.tp_new = PyType_GenericNew;
>         if (PyType_Ready(&indexType) < 0 ||
> diff --git a/setup.py b/setup.py
> --- a/setup.py
> +++ b/setup.py
> @@ -493,6 +493,7 @@ extmodules = [
>      Extension('mercurial.mpatch', ['mercurial/mpatch.c'],
>                depends=common_depends),
>      Extension('mercurial.parsers', ['mercurial/dirs.c',
> +                                    'mercurial/manifest.c',
>                                      'mercurial/parsers.c',
>                                      'mercurial/pathencode.c'],
>                depends=common_depends),
> diff --git a/tests/test-manifest.py b/tests/test-manifest.py
> new file mode 100644
> --- /dev/null
> +++ b/tests/test-manifest.py
> @@ -0,0 +1,221 @@
> +import binascii
> +import unittest
> +import itertools
> +
> +import silenttestrunner
> +
> +from mercurial import parsers
> +
> +HASH_1 = '1' * 40
> +HASH_2 = 'f' * 40
> +HASH_3 = '1234567890abcdef0987654321deadbeef0fcafe'
> +A_SHORT_MANIFEST = (
> +    'bar/baz/qux.py\0%(hash2)s%(flag2)s\n'
> +    'foo\0%(hash1)s%(flag1)s\n'
> +    ) % {'hash1': HASH_1,
> +         'flag1': '',
> +         'hash2': HASH_2,
> +         'flag2': 'l',
> +         }
> +
> +HUGE_MANIFEST_ENTRIES = 200001
> +
> +A_HUGE_MANIFEST = ''.join(sorted(
> +    'file%d\0%s%s\n' % (i, h, f) for i, h, f in
> +    itertools.izip(xrange(200001),
> +                   itertools.cycle((HASH_1, HASH_2)),
> +                   itertools.cycle(('', 'x', 'l')))))
> +
> +class testmanifest(unittest.TestCase):
> +
> +    def assertIn(self, thing, container, msg=None):
> +        # assertIn new in 2.7, use it if available, otherwise polyfill
> +        sup = getattr(unittest.TestCase, 'assertIn', False)
> +        if sup:
> +            return sup(self, thing, container, msg=msg)
> +        if not msg:
> +            msg = 'Expected %r in %r' % (thing, container)
> +        self.assert_(thing in container, msg)
> +
> +    def testEmptyManifest(self):
> +        m = parsers.lazymanifest('')
> +        self.assertEqual(0, len(m))
> +        self.assertEqual([], list(m))
> +
> +    def testManifest(self):
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        want = [
> +            ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
> +            ('foo', binascii.unhexlify(HASH_1), ''),
> +            ]
> +        self.assertEqual(len(want), len(m))
> +        self.assertEqual(want, list(m))
> +        self.assertEqual((binascii.unhexlify(HASH_1), ''), m['foo'])
> +        self.assertRaises(KeyError, lambda : m['wat'])
> +        self.assertEqual((binascii.unhexlify(HASH_2), 'l'),
> +                         m['bar/baz/qux.py'])
> +
> +    def testSetItem(self):
> +        want = binascii.unhexlify(HASH_1), ''
> +
> +        m = parsers.lazymanifest('')
> +        m['a'] = want
> +        self.assertIn('a', m)
> +        self.assertEqual(want, m['a'])
> +        self.assertEqual('a\0' + HASH_1 + '\n', m.text())
> +
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m['a'] = want
> +        self.assertEqual(want, m['a'])
> +        self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST,
> +                         m.text())
> +        m2 = m.copy()
> +        del m
> +        del m2 # make sure we don't double free() anything
> +
> +    def testCompaction(self):
> +        unhex = binascii.unhexlify
> +        h1, h2 = unhex(HASH_1), unhex(HASH_2)
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m['alpha'] = h1, ''
> +        m['beta'] = h2, ''
> +        del m['foo']
> +        want = 'alpha\0%s\nbar/baz/qux.py\0%sl\nbeta\0%s\n' % (
> +            HASH_1, HASH_2, HASH_2)
> +        self.assertEqual(want, m.text())
> +        self.assertEqual(3, len(m))
> +        self.assertEqual((h1, ''), m['alpha'])
> +        self.assertEqual((h2, ''), m['beta'])
> +        self.assertRaises(KeyError, lambda : m['foo'])
> +        w = [('alpha', h1, ''), ('bar/baz/qux.py', h2, 'l'), ('beta', h2,
> '')]
> +        self.assertEqual(w, list(m))
> +
> +    def testSetGetNodeSuffix(self):
> +        clean = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        h, f = m['foo']
> +        want = h + 'a', f
> +        # Merge code wants to set 21-byte fake hashes at times
> +        m['foo'] = want
> +        self.assertEqual(want, m['foo'])
> +        self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2),
> 'l'),
> +                          ('foo', binascii.unhexlify(HASH_1) + 'a', '')],
> +                         list(m))
> +        # Sometimes it even tries a 22-byte fake hash, but we can
> +        # return 21 and it'll work out
> +        m['foo'] = want[0] + '+', f
> +        self.assertEqual(want, m['foo'])
> +        # make sure the suffix survives a copy
> +        m2 = m.filtercopy(lambda x: x == 'foo')
> +        self.assertEqual(want, m2['foo'])
> +        self.assertEqual(1, len(m2))
> +        self.assertEqual(('foo\0%s\n' % HASH_1), m2.text())
> +        m2 = m.copy()
> +        self.assertEqual(want, m2['foo'])
> +        # suffix with iteration
> +        self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2),
> 'l'),
> +                          ('foo', want[0], '')], list(m))
> +        # shows up in diff
> +        self.assertEqual({'foo': (want, (h, ''))}, m.diff(clean))
> +        self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m))
> +
> +    def testFilterCopyException(self):
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        def filt(path):
> +            if path == 'foo':
> +                assert False
> +            return True
> +        self.assertRaises(AssertionError, m.filtercopy, filt)
> +
> +    def testRemoveItem(self):
> +        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        del m['foo']
> +        self.assertRaises(KeyError, lambda : m['foo'])
> +        self.assertEqual(1, len(m))
> +        self.assertEqual(1, len(list(m)))
> +
> +    def testManifestDiff(self):
> +        MISSING = (None, '')
> +        addl = 'z-only-in-left\0' + HASH_1 + '\n'
> +        addr = 'z-only-in-right\0' + HASH_2 + 'x\n'
> +        left = parsers.lazymanifest(
> +            A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl)
> +        right = parsers.lazymanifest(A_SHORT_MANIFEST + addr)
> +        want = {
> +            'foo': ((binascii.unhexlify(HASH_3), 'x'),
> +                    (binascii.unhexlify(HASH_1), '')),
> +            'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
> +            'z-only-in-right': (MISSING, (binascii.unhexlify(HASH_2),
> 'x')),
> +            }
> +        self.assertEqual(want, left.diff(right))
> +
> +        want = {
> +            'bar/baz/qux.py': (MISSING, (binascii.unhexlify(HASH_2),
> 'l')),
> +            'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')),
> +            'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            }
> +        self.assertEqual(want, parsers.lazymanifest('').diff(left))
> +
> +        want = {
> +            'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'),
> MISSING),
> +            'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING),
> +            'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
> +            }
> +        self.assertEqual(want, left.diff(parsers.lazymanifest('')))
> +        copy = right.copy()
> +        del copy['z-only-in-right']
> +        del right['foo']
> +        want = {
> +            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            'z-only-in-right': ((binascii.unhexlify(HASH_2), 'x'),
> MISSING),
> +            }
> +        self.assertEqual(want, right.diff(copy))
> +
> +        short = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        pruned = short.copy()
> +        del pruned['foo']
> +        want = {
> +            'foo': ((binascii.unhexlify(HASH_1), ''), MISSING),
> +            }
> +        self.assertEqual(want, short.diff(pruned))
> +        want = {
> +            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            }
> +        self.assertEqual(want, pruned.diff(short))
> +        want = {
> +            'bar/baz/qux.py': None,
> +            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
> +            }
> +        self.assertEqual(want, pruned.diff(short, True))
> +
> +    def testReversedLines(self):
> +        backwards = ''.join(
> +            l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if
> l)
> +        try:
> +            parsers.lazymanifest(backwards)
> +            self.fail('Should have raised ValueError')
> +        except ValueError, v:
> +            self.assertIn('Manifest lines not in sorted order.', str(v))
> +
> +    def testNoTerminalNewline(self):
> +        try:
> +            parsers.lazymanifest(A_SHORT_MANIFEST + 'wat')
> +            self.fail('Should have raised ValueError')
> +        except ValueError, v:
> +            self.assertIn('Manifest did not end in a newline.', str(v))
> +
> +    def testNoNewLineAtAll(self):
> +        try:
> +            parsers.lazymanifest('wat')
> +            self.fail('Should have raised ValueError')
> +        except ValueError, v:
> +            self.assertIn('Manifest did not end in a newline.', str(v))
> +
> +    def testHugeManifest(self):
> +        m = parsers.lazymanifest(A_HUGE_MANIFEST)
> +        self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m))
> +        self.assertEqual(len(m), len(list(m)))
> +
> +
> +if __name__ == '__main__':
> +    silenttestrunner.main(__name__)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
>

Patch

diff --git a/mercurial/manifest.c b/mercurial/manifest.c
new file mode 100644
--- /dev/null
+++ b/mercurial/manifest.c
@@ -0,0 +1,841 @@ 
+/*
+ * manifest.c - manifest type that does on-demand parsing.
+ *
+ * Copyright 2015, Google Inc.
+ *
+ * This software may be used and distributed according to the terms of
+ * the GNU General Public License, incorporated herein by reference.
+ */
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <Python.h>
+
+/* VC9 doesn't include bool and lacks stdbool.h based on my searching */
+#ifdef _MSC_VER
+#define true 1
+#define false 0
+typedef unsigned char bool;
+#else
+#include <stdbool.h>
+#endif
+
+#define DEFAULT_LINES 100000
+
+typedef struct {
+	char *start;
+	Py_ssize_t len; /* length of line including terminal newline */
+	char hash_suffix;
+	bool from_malloc;
+	bool deleted;
+} line;
+
+typedef struct {
+	PyObject_HEAD
+	PyObject *pydata;
+	line *lines;
+	int numlines; /* number of line entries */
+	int livelines; /* number of non-deleted lines */
+	int maxlines; /* allocated number of lines */
+	bool dirty;
+} lazymanifest;
+
+#define MANIFEST_OOM -1
+#define MANIFEST_NOT_SORTED -2
+#define MANIFEST_MALFORMED -3
+
+/* defined in parsers.c */
+PyObject *unhexlify(const char *str, int len);
+
+/* get the length of the path for a line */
+static size_t pathlen(line *l) {
+	return strlen(l->start);
+}
+
+/* get the node value of a single line */
+static PyObject *nodeof(line *l) {
+	char *s = l->start;
+	ssize_t llen = pathlen(l);
+	PyObject *hash = unhexlify(s + llen + 1, 40);
+	if (!hash) {
+		return NULL;
+	}
+	if (l->hash_suffix != '\0') {
+		char newhash[21];
+		memcpy(newhash, PyString_AsString(hash), 20);
+		Py_DECREF(hash);
+		newhash[20] = l->hash_suffix;
+		hash = PyString_FromStringAndSize(newhash, 21);
+	}
+	return hash;
+}
+
+/* get the node hash and flags of a line as a tuple */
+static PyObject *hashflags(line *l)
+{
+	char *s = l->start;
+	size_t plen = pathlen(l);
+	PyObject *hash = nodeof(l);
+
+	/* 40 for hash, 1 for null byte, 1 for newline */
+	size_t hplen = plen + 42;
+	Py_ssize_t flen = l->len - hplen;
+	PyObject *flags;
+	PyObject *tup;
+
+	if (!hash)
+		return NULL;
+	flags = PyString_FromStringAndSize(s + hplen - 1, flen);
+	if (!flags) {
+		Py_DECREF(hash);
+		return NULL;
+	}
+	tup = PyTuple_Pack(2, hash, flags);
+	Py_DECREF(flags);
+	Py_DECREF(hash);
+	return tup;
+}
+
+/* if we're about to run out of space in the line index, add more */
+static bool realloc_if_full(lazymanifest *self)
+{
+	if (self->numlines == self->maxlines) {
+		self->maxlines *= 2;
+		self->lines = realloc(self->lines, self->maxlines * sizeof(line));
+	}
+	return self->lines;
+}
+
+/*
+ * Find the line boundaries in the manifest that 'data' points to and store
+ * information about each line in 'self'.
+ */
+static int find_lines(lazymanifest *self, char *data, Py_ssize_t len)
+{
+	char *prev = NULL;
+	while (len > 0) {
+		line *l;
+		char *next = memchr(data, '\n', len);
+		if (!next) {
+			return MANIFEST_MALFORMED;
+		}
+		next++; /* advance past newline */
+		if (!realloc_if_full(self)) {
+			return MANIFEST_OOM; /* no memory */
+		}
+		if (prev && strcmp(prev, data) > -1) {
+			/* This data isn't sorted, so we have to abort. */
+			return MANIFEST_NOT_SORTED;
+		}
+		l = self->lines + ((self->numlines)++);
+		l->start = data;
+		l->len = next - data;
+		l->hash_suffix = '\0';
+		l->from_malloc = false;
+		l->deleted = false;
+		len = len - l->len;
+		prev = data;
+		data = next;
+	}
+	self->livelines = self->numlines;
+	return 0;
+}
+
+static int lazymanifest_init(lazymanifest *self, PyObject *args)
+{
+	char *data;
+	Py_ssize_t len;
+	int err, ret;
+	PyObject *pydata;
+	if (!PyArg_ParseTuple(args, "S", &pydata)) {
+		return -1;
+	}
+	err = PyString_AsStringAndSize(pydata, &data, &len);
+
+	self->dirty = false;
+	if (err == -1)
+		return -1;
+	self->pydata = pydata;
+	Py_INCREF(self->pydata);
+	Py_BEGIN_ALLOW_THREADS
+	self->lines = malloc(DEFAULT_LINES * sizeof(line));
+	self->maxlines = DEFAULT_LINES;
+	self->numlines = 0;
+	if (!self->lines)
+		ret = -2;
+	else
+		ret = find_lines(self, data, len);
+	Py_END_ALLOW_THREADS
+	switch (ret) {
+	case 0:
+		break;
+	case MANIFEST_OOM:
+		PyErr_NoMemory();
+		break;
+	case MANIFEST_NOT_SORTED:
+		PyErr_Format(PyExc_ValueError,
+			     "Manifest lines not in sorted order.");
+		break;
+	case MANIFEST_MALFORMED:
+		PyErr_Format(PyExc_ValueError,
+			     "Manifest did not end in a newline.");
+		break;
+	default:
+		PyErr_Format(PyExc_ValueError,
+			     "Unknown problem parsing manifest.");
+	}
+	return ret == 0 ? 0 : -1;
+}
+
+static void lazymanifest_dealloc(lazymanifest *self)
+{
+	/* free any extra lines we had to allocate */
+	int i;
+	for (i = 0; i < self->numlines; i++) {
+		if (self->lines[i].from_malloc) {
+			free(self->lines[i].start);
+		}
+	}
+	if (self->lines) {
+		free(self->lines);
+		self->lines = NULL;
+	}
+	if (self->pydata) {
+		Py_DECREF(self->pydata);
+		self->pydata = NULL;
+	}
+	PyObject_Del(self);
+}
+
+/* iteration support */
+
+typedef struct {
+	PyObject_HEAD lazymanifest *m;
+	Py_ssize_t pos;
+} lmIter;
+
+static void lmiter_dealloc(PyObject *o)
+{
+	lmIter *self = (lmIter *)o;
+	Py_DECREF(self->m);
+	PyObject_Del(self);
+}
+
+static PyObject *lmiter_iternext(PyObject *o)
+{
+	size_t pl;
+	line *l;
+	Py_ssize_t consumed;
+	PyObject *ret = NULL, *path = NULL, *hash = NULL, *flags = NULL;
+	lmIter *self = (lmIter *)o;
+	do {
+		self->pos++;
+		if (self->pos >= self->m->numlines) {
+			goto bail;
+		}
+		/* skip over deleted manifest entries */
+	} while (self->m->lines[self->pos].deleted);
+	l = self->m->lines + self->pos;
+	pl = pathlen(l);
+	path = PyString_FromStringAndSize(l->start, pl);
+	hash = nodeof(l);
+	consumed = pl + 41;
+	flags = PyString_FromStringAndSize(l->start + consumed,
+									   l->len - consumed - 1);
+	if (!flags) {
+		goto bail;
+	}
+	ret = PyTuple_Pack(3, path, hash, flags);
+ bail:
+	Py_XDECREF(path);
+	Py_XDECREF(hash);
+	Py_XDECREF(flags);
+	return ret;
+}
+
+static PyTypeObject lazymanifestIterator = {
+	PyObject_HEAD_INIT(NULL)
+	0,                               /*ob_size */
+	"parsers.lazymanifest.iterator", /*tp_name */
+	sizeof(lmIter),                  /*tp_basicsize */
+	0,                               /*tp_itemsize */
+	lmiter_dealloc,                  /*tp_dealloc */
+	0,                               /*tp_print */
+	0,                               /*tp_getattr */
+	0,                               /*tp_setattr */
+	0,                               /*tp_compare */
+	0,                               /*tp_repr */
+	0,                               /*tp_as_number */
+	0,                               /*tp_as_sequence */
+	0,                               /*tp_as_mapping */
+	0,                               /*tp_hash */
+	0,                               /*tp_call */
+	0,                               /*tp_str */
+	0,                               /*tp_getattro */
+	0,                               /*tp_setattro */
+	0,                               /*tp_as_buffer */
+	/* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to
+	   use tp_iter and tp_iternext fields. */
+	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER,
+	"Iterator for a lazymanifest.",  /* tp_doc */
+	0,                               /* tp_traverse */
+	0,                               /* tp_clear */
+	0,                               /* tp_richcompare */
+	0,                               /* tp_weaklistoffset */
+	PyObject_SelfIter,               /* tp_iter: __iter__() method */
+	lmiter_iternext,                 /* tp_iternext: next() method */
+};
+
+static lazymanifest *lazymanifest_copy(lazymanifest *self);
+
+static PyObject *lazymanifest_getiter(lazymanifest *self)
+{
+	lmIter *i = NULL;
+	lazymanifest *t = lazymanifest_copy(self);
+	if (!t) {
+		PyErr_NoMemory();
+		return NULL;
+	}
+	i = PyObject_New(lmIter, &lazymanifestIterator);
+	if (i) {
+		i->m = t;
+		i->pos = -1;
+	} else {
+		Py_DECREF(t);
+		PyErr_NoMemory();
+	}
+	return (PyObject *)i;
+}
+
+/* __getitem__ and __setitem__ support */
+
+static Py_ssize_t lazymanifest_size(lazymanifest *self)
+{
+	return self->livelines;
+}
+
+static int linecmp(const void *left, const void *right)
+{
+	return strcmp(((const line *)left)->start,
+		      ((const line *)right)->start);
+}
+
+static PyObject *lazymanifest_getitem(lazymanifest *self, PyObject *key)
+{
+	line needle;
+	line *hit;
+	if (!PyString_Check(key)) {
+		PyErr_Format(PyExc_TypeError,
+			     "getitem: manifest keys must be a string.");
+		return NULL;
+	}
+	needle.start = PyString_AsString(key);
+	hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
+		      &linecmp);
+	if (!hit || hit->deleted) {
+		PyErr_Format(PyExc_KeyError, "No such manifest entry.");
+		return NULL;
+	}
+	return hashflags(hit);
+}
+
+static int lazymanifest_delitem(lazymanifest *self, PyObject *key)
+{
+	line needle;
+	line *hit;
+	if (!PyString_Check(key)) {
+		PyErr_Format(PyExc_TypeError,
+			     "delitem: manifest keys must be a string.");
+		return -1;
+	}
+	needle.start = PyString_AsString(key);
+	hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
+		      &linecmp);
+	if (!hit || hit->deleted) {
+		PyErr_Format(PyExc_KeyError,
+			     "Tried to delete nonexistent manifest entry.");
+		return -1;
+	}
+	self->dirty = true;
+	hit->deleted = true;
+	self->livelines--;
+	return 0;
+}
+
+static int lazymanifest_setitem(
+	lazymanifest *self, PyObject *key, PyObject *value)
+{
+	char *path;
+	Py_ssize_t plen;
+	PyObject *pyhash;
+	Py_ssize_t hlen;
+	char *hash;
+	PyObject *pyflags;
+	char *flags;
+	Py_ssize_t flen;
+	size_t dlen;
+	char *dest;
+	int i;
+	line new;
+	line *hit;
+	if (!PyString_Check(key)) {
+		PyErr_Format(PyExc_TypeError,
+			     "setitem: manifest keys must be a string.");
+		return -1;
+	}
+	if (!value) {
+		return lazymanifest_delitem(self, key);
+	}
+	if (!PyTuple_Check(value) || PyTuple_Size(value) != 2) {
+		PyErr_Format(PyExc_TypeError,
+			     "Manifest values must be a tuple of (node, flags).");
+		return -1;
+	}
+	if (PyString_AsStringAndSize(key, &path, &plen) == -1) {
+		return -1;
+	}
+
+	pyhash = PyTuple_GetItem(value, 0);
+	if (!PyString_Check(pyhash)) {
+		PyErr_Format(PyExc_TypeError,
+			     "node must be a 20-byte string");
+		return -1;
+	}
+	hlen = PyString_Size(pyhash);
+	/* Some parts of the codebase try and set 21 or 22
+	 * byte "hash" values in order to perturb things for
+	 * status. We have to preserve at least the 21st
+	 * byte. Sigh. If there's a 22nd byte, we drop it on
+	 * the floor, which works fine.
+	 */
+	if (hlen != 20 && hlen != 21 && hlen != 22) {
+		PyErr_Format(PyExc_TypeError,
+			     "node must be a 20-byte string");
+		return -1;
+	}
+	hash = PyString_AsString(pyhash);
+
+	pyflags = PyTuple_GetItem(value, 1);
+	if (!PyString_Check(pyflags) || PyString_Size(pyflags) > 1) {
+		PyErr_Format(PyExc_TypeError,
+			     "flags must a 0 or 1 byte string");
+		return -1;
+	}
+	if (PyString_AsStringAndSize(pyflags, &flags, &flen) == -1) {
+		return -1;
+	}
+	/* one null byte and one newline */
+	dlen = plen + 41 + flen + 1;
+	dest = malloc(dlen);
+	if (!dest) {
+		PyErr_NoMemory();
+		return -1;
+	}
+	memcpy(dest, path, plen + 1);
+	for (i = 0; i < 20; i++) {
+		sprintf(dest + plen + 1 + (i * 2), "%02hhx", hash[i]);
+	}
+	memcpy(dest + plen + 41, flags, flen);
+	dest[plen + 41 + flen] = '\n';
+	new.start = dest;
+	new.len = dlen;
+	new.hash_suffix = '\0';
+	if (hlen > 20) {
+		new.hash_suffix = hash[20];
+	}
+	new.from_malloc = true;     /* is `start` a pointer we allocated? */
+	new.deleted = false;        /* is this entry deleted? */
+	hit = bsearch(&new, self->lines, self->numlines,
+		      sizeof(line), &linecmp);
+	self->dirty = true;
+	if (hit) {
+		/* updating a line we already had */
+		if (hit->from_malloc) {
+			free(hit->start);
+		}
+		if (hit->deleted) {
+			self->livelines++;
+		}
+		*hit = new;
+	} else {
+		/* new line */
+		if (!realloc_if_full(self)) {
+			PyErr_NoMemory();
+			return -1;
+		}
+		self->lines[self->numlines++] = new;
+		self->livelines++;
+		/* TODO: do a binary search and insert rather than
+		 * append and qsort. Also offer a batch-insert
+		 * interface, which should be a nice little
+		 * performance win.
+		 */
+		qsort(self->lines, self->numlines, sizeof(line), &linecmp);
+	}
+	return 0;
+}
+
+static PyMappingMethods lazymanifest_mapping_methods = {
+	(lenfunc)lazymanifest_size,             /* mp_length */
+	(binaryfunc)lazymanifest_getitem,       /* mp_subscript */
+	(objobjargproc)lazymanifest_setitem,    /* mp_ass_subscript */
+};
+
+/* sequence methods (important or __contains__ builds an iterator */
+
+static int lazymanifest_contains(lazymanifest *self, PyObject *key)
+{
+	line needle;
+	line *hit;
+	if (!PyString_Check(key)) {
+		/* Our keys are always strings, so if the contains
+		 * check is for a non-string, just return false. */
+		return 0;
+	}
+	needle.start = PyString_AsString(key);
+	hit = bsearch(&needle, self->lines, self->numlines, sizeof(line),
+		      &linecmp);
+	if (!hit || hit->deleted) {
+		return 0;
+	}
+	return 1;
+}
+
+static PySequenceMethods lazymanifest_seq_meths = {
+	(lenfunc)lazymanifest_size, /* sq_length */
+	0, /* sq_concat */
+	0, /* sq_repeat */
+	0, /* sq_item */
+	0, /* sq_slice */
+	0, /* sq_ass_item */
+	0, /* sq_ass_slice */
+	(objobjproc)lazymanifest_contains, /* sq_contains */
+	0, /* sq_inplace_concat */
+	0, /* sq_inplace_repeat */
+};
+
+
+/* Other methods (copy, diff, etc) */
+static PyTypeObject lazymanifestType;
+
+/* If the manifest has changes, build the new manifest text and reindex it. */
+static int compact(lazymanifest *self) {
+	int i;
+	ssize_t need = 0;
+	char *data;
+	line *src, *dst;
+	PyObject *pydata;
+	if (!self->dirty)
+		return 0;
+	for (i = 0; i < self->numlines; i++) {
+		if (!self->lines[i].deleted) {
+			need += self->lines[i].len;
+		}
+	}
+	pydata = PyString_FromStringAndSize(NULL, need);
+	if (!pydata)
+		return -1;
+	data = PyString_AsString(pydata);
+	if (!data) {
+		return -1;
+	}
+	src = self->lines;
+	dst = self->lines;
+	for (i = 0; i < self->numlines; i++, src++) {
+		char *tofree = NULL;
+		if (src->from_malloc) {
+			tofree = src->start;
+		}
+		if (!src->deleted) {
+			memcpy(data, src->start, src->len);
+			*dst = *src;
+			dst->start = data;
+			dst->from_malloc = false;
+			data += dst->len;
+			dst++;
+		}
+		free(tofree);
+	}
+	Py_DECREF(self->pydata);
+	self->pydata = pydata;
+	self->numlines = self->livelines;
+	self->dirty = false;
+	return 0;
+}
+
+static PyObject *lazymanifest_text(lazymanifest *self)
+{
+	if (compact(self) != 0) {
+		PyErr_NoMemory();
+		return NULL;
+	}
+	Py_INCREF(self->pydata);
+	return self->pydata;
+}
+
+static lazymanifest *lazymanifest_copy(lazymanifest *self)
+{
+	lazymanifest *copy = NULL;
+	if (compact(self) != 0) {
+		goto nomem;
+	}
+	copy = PyObject_New(lazymanifest, &lazymanifestType);
+	if (!copy) {
+		goto nomem;
+	}
+	copy->numlines = self->numlines;
+	copy->livelines = self->livelines;
+	copy->dirty = false;
+	copy->lines = malloc(self->maxlines *sizeof(line));
+	if (!copy->lines) {
+		goto nomem;
+	}
+	memcpy(copy->lines, self->lines, self->numlines * sizeof(line));
+	copy->maxlines = self->maxlines;
+	copy->pydata = self->pydata;
+	Py_INCREF(copy->pydata);
+	return copy;
+ nomem:
+	PyErr_NoMemory();
+	Py_XDECREF(copy);
+	return NULL;
+}
+
+static lazymanifest *lazymanifest_filtercopy(
+	lazymanifest *self, PyObject *matchfn)
+{
+	lazymanifest *copy = NULL;
+	int i;
+	if (!PyCallable_Check(matchfn)) {
+		PyErr_SetString(PyExc_TypeError, "matchfn must be callable");
+		return NULL;
+	}
+	/* compact ourselves first to avoid double-frees later when we
+	 * compact tmp so that it doesn't have random pointers to our
+	 * underlying from_malloc-data (self->pydata is safe) */
+	if (compact(self) != 0) {
+		goto nomem;
+	}
+	copy = PyObject_New(lazymanifest, &lazymanifestType);
+	copy->dirty = true;
+	copy->lines = malloc(self->maxlines * sizeof(line));
+	if (!copy->lines) {
+		goto nomem;
+	}
+	copy->maxlines = self->maxlines;
+	copy->numlines = 0;
+	copy->pydata = self->pydata;
+	Py_INCREF(self->pydata);
+	for (i = 0; i < self->numlines; i++) {
+		PyObject *arg = PyString_FromString(self->lines[i].start);
+		PyObject *arglist = PyTuple_Pack(1, arg);
+		PyObject *result = PyObject_CallObject(matchfn, arglist);
+		Py_DECREF(arglist);
+		Py_DECREF(arg);
+		/* if the callback raised an exception, just let it
+		 * through and give up */
+		if (!result) {
+			free(copy->lines);
+			Py_DECREF(self->pydata);
+			return NULL;
+		}
+		if (PyObject_IsTrue(result)) {
+			assert(!(self->lines[i].from_malloc));
+			copy->lines[copy->numlines++] = self->lines[i];
+		}
+		Py_DECREF(result);
+	}
+	copy->livelines = copy->numlines;
+	return copy;
+ nomem:
+	PyErr_NoMemory();
+	Py_XDECREF(copy);
+	return NULL;
+}
+
+static PyObject *lazymanifest_diff(lazymanifest *self, PyObject *args)
+{
+	lazymanifest *other;
+	PyObject *pyclean = NULL;
+	bool listclean;
+	PyObject *emptyTup = NULL, *ret = NULL;
+	PyObject *es;
+	int sneedle = 0, oneedle = 0;
+	if (!PyArg_ParseTuple(args, "O!|O", &lazymanifestType, &other, &pyclean)) {
+		return NULL;
+	}
+	listclean = (!pyclean) ? false : PyObject_IsTrue(pyclean);
+	es = PyString_FromString("");
+	if (!es) {
+		goto nomem;
+	}
+	emptyTup = PyTuple_Pack(2, Py_None, es);
+	Py_DECREF(es);
+	if (!emptyTup) {
+		goto nomem;
+	}
+	ret = PyDict_New();
+	if (!ret) {
+		goto nomem;
+	}
+	while (sneedle != self->numlines || oneedle != other->numlines) {
+		line *left = self->lines + sneedle;
+		line *right = other->lines + oneedle;
+		int result;
+		PyObject *key;
+		PyObject *outer;
+		/* If we're looking at a deleted entry and it's not
+		 * the end of the manifest, just skip it. */
+		if (left->deleted && sneedle < self->numlines) {
+			sneedle++;
+			continue;
+		}
+		if (right->deleted && oneedle < other->numlines) {
+			oneedle++;
+			continue;
+		}
+		/* if we're at the end of either manifest, then we
+		 * know the remaining items are adds so we can skip
+		 * the strcmp. */
+		if (sneedle == self->numlines) {
+			result = 1;
+		} else if (oneedle == other->numlines) {
+			result = -1;
+		} else {
+			result = linecmp(left, right);
+		}
+		key = result <= 0 ?
+			PyString_FromString(left->start) :
+			PyString_FromString(right->start);
+		if (!key)
+			goto nomem;
+		if (result < 0) {
+			PyObject *l = hashflags(left);
+			if (!l) {
+				goto nomem;
+			}
+			outer = PyTuple_Pack(2, l, emptyTup);
+			Py_DECREF(l);
+			if (!outer) {
+				goto nomem;
+			}
+			PyDict_SetItem(ret, key, outer);
+			Py_DECREF(outer);
+			sneedle++;
+		} else if (result > 0) {
+			PyObject *r = hashflags(right);
+			if (!r) {
+				goto nomem;
+			}
+			outer = PyTuple_Pack(2, emptyTup, r);
+			Py_DECREF(r);
+			if (!outer) {
+				goto nomem;
+			}
+			PyDict_SetItem(ret, key, outer);
+			Py_DECREF(outer);
+			oneedle++;
+		} else {
+			/* file exists in both manifests */
+			if (left->len != right->len
+			    || memcmp(left->start, right->start, left->len)
+			    || left->hash_suffix != right->hash_suffix) {
+				PyObject *l = hashflags(left);
+				PyObject *r;
+				if (!l) {
+					goto nomem;
+				}
+				r = hashflags(right);
+				if (!r) {
+					Py_DECREF(l);
+					goto nomem;
+				}
+				outer = PyTuple_Pack(2, l, r);
+				Py_DECREF(l);
+				Py_DECREF(r);
+				if (!outer) {
+					goto nomem;
+				}
+				PyDict_SetItem(ret, key, outer);
+				Py_DECREF(outer);
+			} else if (listclean) {
+				PyDict_SetItem(ret, key, Py_None);
+			}
+			sneedle++;
+			oneedle++;
+		}
+		Py_DECREF(key);
+	}
+	Py_DECREF(emptyTup);
+	return ret;
+ nomem:
+	PyErr_NoMemory();
+	Py_XDECREF(ret);
+	Py_XDECREF(emptyTup);
+	return NULL;
+}
+
+static PyMethodDef lazymanifest_methods[] = {
+	{"copy", (PyCFunction)lazymanifest_copy, METH_NOARGS,
+	 "Make a copy of this lazymanifest."},
+	{"filtercopy", (PyCFunction)lazymanifest_filtercopy, METH_O,
+	 "Make a copy of this manifest filtered by matchfn."},
+	{"diff", (PyCFunction)lazymanifest_diff, METH_VARARGS,
+	 "Compare this lazymanifest to another one."},
+	{"text", (PyCFunction)lazymanifest_text, METH_NOARGS,
+	 "Encode this manifest to text."},
+	{NULL},
+};
+
+static PyTypeObject lazymanifestType = {
+	PyObject_HEAD_INIT(NULL)
+	0,                                                /* ob_size */
+	"parsers.lazymanifest",                           /* tp_name */
+	sizeof(lazymanifest),                             /* tp_basicsize */
+	0,                                                /* tp_itemsize */
+	(destructor)lazymanifest_dealloc,                 /* tp_dealloc */
+	0,                                                /* tp_print */
+	0,                                                /* tp_getattr */
+	0,                                                /* tp_setattr */
+	0,                                                /* tp_compare */
+	0,                                                /* tp_repr */
+	0,                                                /* tp_as_number */
+	&lazymanifest_seq_meths,                          /* tp_as_sequence */
+	&lazymanifest_mapping_methods,                    /* tp_as_mapping */
+	0,                                                /* tp_hash */
+	0,                                                /* tp_call */
+	0,                                                /* tp_str */
+	0,                                                /* tp_getattro */
+	0,                                                /* tp_setattro */
+	0,                                                /* tp_as_buffer */
+	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_SEQUENCE_IN, /* tp_flags */
+	"TODO(augie)",                                    /* tp_doc */
+	0,                                                /* tp_traverse */
+	0,                                                /* tp_clear */
+	0,                                                /* tp_richcompare */
+	0,                                             /* tp_weaklistoffset */
+	(getiterfunc)lazymanifest_getiter,                /* tp_iter */
+	0,                                                /* tp_iternext */
+	lazymanifest_methods,                             /* tp_methods */
+	0,                                                /* tp_members */
+	0,                                                /* tp_getset */
+	0,                                                /* tp_base */
+	0,                                                /* tp_dict */
+	0,                                                /* tp_descr_get */
+	0,                                                /* tp_descr_set */
+	0,                                                /* tp_dictoffset */
+	(initproc)lazymanifest_init,                      /* tp_init */
+	0,                                                /* tp_alloc */
+};
+
+void manifest_module_init(PyObject * mod)
+{
+	lazymanifestType.tp_new = PyType_GenericNew;
+	if (PyType_Ready(&lazymanifestType) < 0)
+		return;
+	Py_INCREF(&lazymanifestType);
+
+	PyModule_AddObject(mod, "lazymanifest",
+			   (PyObject *)&lazymanifestType);
+}
diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -71,7 +71,7 @@  static inline int hexdigit(const char *p
 /*
  * Turn a hex-encoded string into binary.
  */
-static PyObject *unhexlify(const char *str, int len)
+PyObject *unhexlify(const char *str, int len)
 {
 	PyObject *ret;
 	char *d;
@@ -2318,6 +2318,7 @@  static PyMethodDef methods[] = {
 };
 
 void dirs_module_init(PyObject *mod);
+void manifest_module_init(PyObject *mod);
 
 static void module_init(PyObject *mod)
 {
@@ -2332,6 +2333,7 @@  static void module_init(PyObject *mod)
 	PyModule_AddStringConstant(mod, "versionerrortext", versionerrortext);
 
 	dirs_module_init(mod);
+	manifest_module_init(mod);
 
 	indexType.tp_new = PyType_GenericNew;
 	if (PyType_Ready(&indexType) < 0 ||
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -493,6 +493,7 @@  extmodules = [
     Extension('mercurial.mpatch', ['mercurial/mpatch.c'],
               depends=common_depends),
     Extension('mercurial.parsers', ['mercurial/dirs.c',
+                                    'mercurial/manifest.c',
                                     'mercurial/parsers.c',
                                     'mercurial/pathencode.c'],
               depends=common_depends),
diff --git a/tests/test-manifest.py b/tests/test-manifest.py
new file mode 100644
--- /dev/null
+++ b/tests/test-manifest.py
@@ -0,0 +1,221 @@ 
+import binascii
+import unittest
+import itertools
+
+import silenttestrunner
+
+from mercurial import parsers
+
+HASH_1 = '1' * 40
+HASH_2 = 'f' * 40
+HASH_3 = '1234567890abcdef0987654321deadbeef0fcafe'
+A_SHORT_MANIFEST = (
+    'bar/baz/qux.py\0%(hash2)s%(flag2)s\n'
+    'foo\0%(hash1)s%(flag1)s\n'
+    ) % {'hash1': HASH_1,
+         'flag1': '',
+         'hash2': HASH_2,
+         'flag2': 'l',
+         }
+
+HUGE_MANIFEST_ENTRIES = 200001
+
+A_HUGE_MANIFEST = ''.join(sorted(
+    'file%d\0%s%s\n' % (i, h, f) for i, h, f in
+    itertools.izip(xrange(200001),
+                   itertools.cycle((HASH_1, HASH_2)),
+                   itertools.cycle(('', 'x', 'l')))))
+
+class testmanifest(unittest.TestCase):
+
+    def assertIn(self, thing, container, msg=None):
+        # assertIn new in 2.7, use it if available, otherwise polyfill
+        sup = getattr(unittest.TestCase, 'assertIn', False)
+        if sup:
+            return sup(self, thing, container, msg=msg)
+        if not msg:
+            msg = 'Expected %r in %r' % (thing, container)
+        self.assert_(thing in container, msg)
+
+    def testEmptyManifest(self):
+        m = parsers.lazymanifest('')
+        self.assertEqual(0, len(m))
+        self.assertEqual([], list(m))
+
+    def testManifest(self):
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        want = [
+            ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
+            ('foo', binascii.unhexlify(HASH_1), ''),
+            ]
+        self.assertEqual(len(want), len(m))
+        self.assertEqual(want, list(m))
+        self.assertEqual((binascii.unhexlify(HASH_1), ''), m['foo'])
+        self.assertRaises(KeyError, lambda : m['wat'])
+        self.assertEqual((binascii.unhexlify(HASH_2), 'l'),
+                         m['bar/baz/qux.py'])
+
+    def testSetItem(self):
+        want = binascii.unhexlify(HASH_1), ''
+
+        m = parsers.lazymanifest('')
+        m['a'] = want
+        self.assertIn('a', m)
+        self.assertEqual(want, m['a'])
+        self.assertEqual('a\0' + HASH_1 + '\n', m.text())
+
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m['a'] = want
+        self.assertEqual(want, m['a'])
+        self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST,
+                         m.text())
+        m2 = m.copy()
+        del m
+        del m2 # make sure we don't double free() anything
+
+    def testCompaction(self):
+        unhex = binascii.unhexlify
+        h1, h2 = unhex(HASH_1), unhex(HASH_2)
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m['alpha'] = h1, ''
+        m['beta'] = h2, ''
+        del m['foo']
+        want = 'alpha\0%s\nbar/baz/qux.py\0%sl\nbeta\0%s\n' % (
+            HASH_1, HASH_2, HASH_2)
+        self.assertEqual(want, m.text())
+        self.assertEqual(3, len(m))
+        self.assertEqual((h1, ''), m['alpha'])
+        self.assertEqual((h2, ''), m['beta'])
+        self.assertRaises(KeyError, lambda : m['foo'])
+        w = [('alpha', h1, ''), ('bar/baz/qux.py', h2, 'l'), ('beta', h2, '')]
+        self.assertEqual(w, list(m))
+
+    def testSetGetNodeSuffix(self):
+        clean = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        h, f = m['foo']
+        want = h + 'a', f
+        # Merge code wants to set 21-byte fake hashes at times
+        m['foo'] = want
+        self.assertEqual(want, m['foo'])
+        self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
+                          ('foo', binascii.unhexlify(HASH_1) + 'a', '')],
+                         list(m))
+        # Sometimes it even tries a 22-byte fake hash, but we can
+        # return 21 and it'll work out
+        m['foo'] = want[0] + '+', f
+        self.assertEqual(want, m['foo'])
+        # make sure the suffix survives a copy
+        m2 = m.filtercopy(lambda x: x == 'foo')
+        self.assertEqual(want, m2['foo'])
+        self.assertEqual(1, len(m2))
+        self.assertEqual(('foo\0%s\n' % HASH_1), m2.text())
+        m2 = m.copy()
+        self.assertEqual(want, m2['foo'])
+        # suffix with iteration
+        self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
+                          ('foo', want[0], '')], list(m))
+        # shows up in diff
+        self.assertEqual({'foo': (want, (h, ''))}, m.diff(clean))
+        self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m))
+
+    def testFilterCopyException(self):
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        def filt(path):
+            if path == 'foo':
+                assert False
+            return True
+        self.assertRaises(AssertionError, m.filtercopy, filt)
+
+    def testRemoveItem(self):
+        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        del m['foo']
+        self.assertRaises(KeyError, lambda : m['foo'])
+        self.assertEqual(1, len(m))
+        self.assertEqual(1, len(list(m)))
+
+    def testManifestDiff(self):
+        MISSING = (None, '')
+        addl = 'z-only-in-left\0' + HASH_1 + '\n'
+        addr = 'z-only-in-right\0' + HASH_2 + 'x\n'
+        left = parsers.lazymanifest(
+            A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl)
+        right = parsers.lazymanifest(A_SHORT_MANIFEST + addr)
+        want = {
+            'foo': ((binascii.unhexlify(HASH_3), 'x'),
+                    (binascii.unhexlify(HASH_1), '')),
+            'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
+            'z-only-in-right': (MISSING, (binascii.unhexlify(HASH_2), 'x')),
+            }
+        self.assertEqual(want, left.diff(right))
+
+        want = {
+            'bar/baz/qux.py': (MISSING, (binascii.unhexlify(HASH_2), 'l')),
+            'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')),
+            'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')),
+            }
+        self.assertEqual(want, parsers.lazymanifest('').diff(left))
+
+        want = {
+            'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'), MISSING),
+            'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING),
+            'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
+            }
+        self.assertEqual(want, left.diff(parsers.lazymanifest('')))
+        copy = right.copy()
+        del copy['z-only-in-right']
+        del right['foo']
+        want = {
+            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
+            'z-only-in-right': ((binascii.unhexlify(HASH_2), 'x'), MISSING),
+            }
+        self.assertEqual(want, right.diff(copy))
+
+        short = parsers.lazymanifest(A_SHORT_MANIFEST)
+        pruned = short.copy()
+        del pruned['foo']
+        want = {
+            'foo': ((binascii.unhexlify(HASH_1), ''), MISSING),
+            }
+        self.assertEqual(want, short.diff(pruned))
+        want = {
+            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
+            }
+        self.assertEqual(want, pruned.diff(short))
+        want = {
+            'bar/baz/qux.py': None,
+            'foo': (MISSING, (binascii.unhexlify(HASH_1), '')),
+            }
+        self.assertEqual(want, pruned.diff(short, True))
+
+    def testReversedLines(self):
+        backwards = ''.join(
+            l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if l)
+        try:
+            parsers.lazymanifest(backwards)
+            self.fail('Should have raised ValueError')
+        except ValueError, v:
+            self.assertIn('Manifest lines not in sorted order.', str(v))
+
+    def testNoTerminalNewline(self):
+        try:
+            parsers.lazymanifest(A_SHORT_MANIFEST + 'wat')
+            self.fail('Should have raised ValueError')
+        except ValueError, v:
+            self.assertIn('Manifest did not end in a newline.', str(v))
+
+    def testNoNewLineAtAll(self):
+        try:
+            parsers.lazymanifest('wat')
+            self.fail('Should have raised ValueError')
+        except ValueError, v:
+            self.assertIn('Manifest did not end in a newline.', str(v))
+
+    def testHugeManifest(self):
+        m = parsers.lazymanifest(A_HUGE_MANIFEST)
+        self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m))
+        self.assertEqual(len(m), len(list(m)))
+
+
+if __name__ == '__main__':
+    silenttestrunner.main(__name__)