Submitter | Augie Fackler |
---|---|
Date | March 3, 2015, 11:05 p.m. |
Message ID | <575515f3cdc7bc8cb826.1425423934@198.17.16.172.in-addr.arpa> |
Download | mbox | patch |
Permalink | /patch/7890/ |
State | Superseded |
Headers | show |
Comments
Besides the comments below, I assume this hasn't changed much, so it still looks good to me. On Tue, Mar 3, 2015 at 3:08 PM Augie Fackler <raf@durin42.com> wrote: > # HG changeset patch > # User Augie Fackler <augie@google.com> > # Date 1421188298 28800 > # Tue Jan 13 14:31:38 2015 -0800 > # Node ID 575515f3cdc7bc8cb826ea17fa365d1f359ab88f > # Parent 390410a6545d9088dc84392009f51500e3935a1c > manifest.c: new extension code to lazily parse manifests > > This lets us iterate manifests in order, but do a _lot_ less work in > the common case when we only care about a few manifest entries. > > Many thanks to Mike Edgar for reviewing this in advance of it going > out to the list, which caught many things I missed. > > This version of the patch includes C89 fixes from Sean Farley and > many correctness/efficiency cleanups from Martin von > Zweigbergk. Thanks to both! > > diff --git a/mercurial/manifest.c b/mercurial/manifest.c > new file mode 100644 > --- /dev/null > +++ b/mercurial/manifest.c > @@ -0,0 +1,841 @@ > +/* > + * manifest.c - manifest type that does on-demand parsing. > + * > + * Copyright 2015, Google Inc. > + * > + * This software may be used and distributed according to the terms of > + * the GNU General Public License, incorporated herein by reference. > + */ > +#include <assert.h> > +#include <string.h> > +#include <stdlib.h> > + > +#include <Python.h> > + > +/* VC9 doesn't include bool and lacks stdbool.h based on my searching */ > +#ifdef _MSC_VER > +#define true 1 > +#define false 0 > +typedef unsigned char bool; > +#else > +#include <stdbool.h> > +#endif > + > +#define DEFAULT_LINES 100000 > + > +typedef struct { > + char *start; > + Py_ssize_t len; /* length of line including terminal newline */ > + char hash_suffix; > + bool from_malloc; > + bool deleted; > +} line; > + > +typedef struct { > + PyObject_HEAD > + PyObject *pydata; > + line *lines; > + int numlines; /* number of line entries */ > + int livelines; /* number of non-deleted lines */ > + int maxlines; /* allocated number of lines */ > + bool dirty; > +} lazymanifest; > + > +#define MANIFEST_OOM -1 > +#define MANIFEST_NOT_SORTED -2 > +#define MANIFEST_MALFORMED -3 > I think I suggested on an earlier version that these could be replaced by only -1 and calls to PyErr_* before -1 is returned instead of having the caller (of find_lines()) call PyErr_*. > + > +/* defined in parsers.c */ > +PyObject *unhexlify(const char *str, int len); > + > +/* get the length of the path for a line */ > +static size_t pathlen(line *l) { > + return strlen(l->start); > +} > + > +/* get the node value of a single line */ > +static PyObject *nodeof(line *l) { > + char *s = l->start; > + ssize_t llen = pathlen(l); > + PyObject *hash = unhexlify(s + llen + 1, 40); > + if (!hash) { > + return NULL; > + } > + if (l->hash_suffix != '\0') { > + char newhash[21]; > + memcpy(newhash, PyString_AsString(hash), 20); > + Py_DECREF(hash); > + newhash[20] = l->hash_suffix; > + hash = PyString_FromStringAndSize(newhash, 21); > + } > + return hash; > +} > + > +/* get the node hash and flags of a line as a tuple */ > +static PyObject *hashflags(line *l) > +{ > + char *s = l->start; > + size_t plen = pathlen(l); > + PyObject *hash = nodeof(l); > + > + /* 40 for hash, 1 for null byte, 1 for newline */ > + size_t hplen = plen + 42; > + Py_ssize_t flen = l->len - hplen; > + PyObject *flags; > + PyObject *tup; > + > + if (!hash) > + return NULL; > + flags = PyString_FromStringAndSize(s + hplen - 1, flen); > + if (!flags) { > + Py_DECREF(hash); > + return NULL; > + } > + tup = PyTuple_Pack(2, hash, flags); > + Py_DECREF(flags); > + Py_DECREF(hash); > + return tup; > +} > + > +/* if we're about to run out of space in the line index, add more */ > +static bool realloc_if_full(lazymanifest *self) > +{ > + if (self->numlines == self->maxlines) { > + self->maxlines *= 2; > + self->lines = realloc(self->lines, self->maxlines * > sizeof(line)); > + } > + return self->lines; > +} > + > +/* > + * Find the line boundaries in the manifest that 'data' points to and > store > + * information about each line in 'self'. > + */ > +static int find_lines(lazymanifest *self, char *data, Py_ssize_t len) > +{ > + char *prev = NULL; > + while (len > 0) { > + line *l; > + char *next = memchr(data, '\n', len); > + if (!next) { > + return MANIFEST_MALFORMED; > + } > + next++; /* advance past newline */ > + if (!realloc_if_full(self)) { > + return MANIFEST_OOM; /* no memory */ > + } > + if (prev && strcmp(prev, data) > -1) { > + /* This data isn't sorted, so we have to abort. */ > + return MANIFEST_NOT_SORTED; > + } > + l = self->lines + ((self->numlines)++); > + l->start = data; > + l->len = next - data; > + l->hash_suffix = '\0'; > + l->from_malloc = false; > + l->deleted = false; > + len = len - l->len; > + prev = data; > + data = next; > + } > + self->livelines = self->numlines; > + return 0; > +} > + > +static int lazymanifest_init(lazymanifest *self, PyObject *args) > +{ > + char *data; > + Py_ssize_t len; > + int err, ret; > + PyObject *pydata; > + if (!PyArg_ParseTuple(args, "S", &pydata)) { > + return -1; > + } > + err = PyString_AsStringAndSize(pydata, &data, &len); > + > + self->dirty = false; > + if (err == -1) > + return -1; > + self->pydata = pydata; > + Py_INCREF(self->pydata); > + Py_BEGIN_ALLOW_THREADS > + self->lines = malloc(DEFAULT_LINES * sizeof(line)); > + self->maxlines = DEFAULT_LINES; > + self->numlines = 0; > + if (!self->lines) > + ret = -2; > -2 means "not sorted", no? Shouldn't this be OOM? Call PyErr_NoMemory() here if you agree with my suggestion above. > + else > + ret = find_lines(self, data, len); > + Py_END_ALLOW_THREADS > + switch (ret) { > + case 0: > + break; > + case MANIFEST_OOM: > + PyErr_NoMemory(); > + break; > + case MANIFEST_NOT_SORTED: > + PyErr_Format(PyExc_ValueError, > + "Manifest lines not in sorted order."); > + break; > + case MANIFEST_MALFORMED: > + PyErr_Format(PyExc_ValueError, > + "Manifest did not end in a newline."); > + break; > + default: > + PyErr_Format(PyExc_ValueError, > + "Unknown problem parsing manifest."); > + } > + return ret == 0 ? 0 : -1; > +} > + > +static void lazymanifest_dealloc(lazymanifest *self) > +{ > + /* free any extra lines we had to allocate */ > + int i; > + for (i = 0; i < self->numlines; i++) { > + if (self->lines[i].from_malloc) { > + free(self->lines[i].start); > + } > + } > + if (self->lines) { > + free(self->lines); > + self->lines = NULL; > + } > + if (self->pydata) { > + Py_DECREF(self->pydata); > + self->pydata = NULL; > + } > + PyObject_Del(self); > +} > + > +/* iteration support */ > + > +typedef struct { > + PyObject_HEAD lazymanifest *m; > + Py_ssize_t pos; > +} lmIter; > + > +static void lmiter_dealloc(PyObject *o) > +{ > + lmIter *self = (lmIter *)o; > + Py_DECREF(self->m); > + PyObject_Del(self); > +} > + > +static PyObject *lmiter_iternext(PyObject *o) > +{ > + size_t pl; > + line *l; > + Py_ssize_t consumed; > + PyObject *ret = NULL, *path = NULL, *hash = NULL, *flags = NULL; > + lmIter *self = (lmIter *)o; > + do { > + self->pos++; > + if (self->pos >= self->m->numlines) { > + goto bail; > + } > + /* skip over deleted manifest entries */ > + } while (self->m->lines[self->pos].deleted); > + l = self->m->lines + self->pos; > + pl = pathlen(l); > + path = PyString_FromStringAndSize(l->start, pl); > + hash = nodeof(l); > + consumed = pl + 41; > + flags = PyString_FromStringAndSize(l->start + consumed, > + > l->len - consumed - 1); > + if (!flags) { > + goto bail; > + } > + ret = PyTuple_Pack(3, path, hash, flags); > + bail: > + Py_XDECREF(path); > + Py_XDECREF(hash); > + Py_XDECREF(flags); > + return ret; > +} > + > +static PyTypeObject lazymanifestIterator = { > + PyObject_HEAD_INIT(NULL) > + 0, /*ob_size */ > + "parsers.lazymanifest.iterator", /*tp_name */ > + sizeof(lmIter), /*tp_basicsize */ > + 0, /*tp_itemsize */ > + lmiter_dealloc, /*tp_dealloc */ > + 0, /*tp_print */ > + 0, /*tp_getattr */ > + 0, /*tp_setattr */ > + 0, /*tp_compare */ > + 0, /*tp_repr */ > + 0, /*tp_as_number */ > + 0, /*tp_as_sequence */ > + 0, /*tp_as_mapping */ > + 0, /*tp_hash */ > + 0, /*tp_call */ > + 0, /*tp_str */ > + 0, /*tp_getattro */ > + 0, /*tp_setattro */ > + 0, /*tp_as_buffer */ > + /* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to > + use tp_iter and tp_iternext fields. */ > + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER, > + "Iterator for a lazymanifest.", /* tp_doc */ > + 0, /* tp_traverse */ > + 0, /* tp_clear */ > + 0, /* tp_richcompare */ > + 0, /* tp_weaklistoffset */ > + PyObject_SelfIter, /* tp_iter: __iter__() method */ > + lmiter_iternext, /* tp_iternext: next() method */ > +}; > + > +static lazymanifest *lazymanifest_copy(lazymanifest *self); > + > +static PyObject *lazymanifest_getiter(lazymanifest *self) > +{ > + lmIter *i = NULL; > + lazymanifest *t = lazymanifest_copy(self); > + if (!t) { > + PyErr_NoMemory(); > + return NULL; > + } > + i = PyObject_New(lmIter, &lazymanifestIterator); > + if (i) { > + i->m = t; > + i->pos = -1; > + } else { > + Py_DECREF(t); > + PyErr_NoMemory(); > + } > + return (PyObject *)i; > +} > + > +/* __getitem__ and __setitem__ support */ > + > +static Py_ssize_t lazymanifest_size(lazymanifest *self) > +{ > + return self->livelines; > +} > + > +static int linecmp(const void *left, const void *right) > +{ > + return strcmp(((const line *)left)->start, > + ((const line *)right)->start); > +} > + > +static PyObject *lazymanifest_getitem(lazymanifest *self, PyObject *key) > +{ > + line needle; > + line *hit; > + if (!PyString_Check(key)) { > + PyErr_Format(PyExc_TypeError, > + "getitem: manifest keys must be a string."); > + return NULL; > + } > + needle.start = PyString_AsString(key); > + hit = bsearch(&needle, self->lines, self->numlines, sizeof(line), > + &linecmp); > + if (!hit || hit->deleted) { > + PyErr_Format(PyExc_KeyError, "No such manifest entry."); > + return NULL; > + } > + return hashflags(hit); > +} > + > +static int lazymanifest_delitem(lazymanifest *self, PyObject *key) > +{ > + line needle; > + line *hit; > + if (!PyString_Check(key)) { > + PyErr_Format(PyExc_TypeError, > + "delitem: manifest keys must be a string."); > + return -1; > + } > + needle.start = PyString_AsString(key); > + hit = bsearch(&needle, self->lines, self->numlines, sizeof(line), > + &linecmp); > + if (!hit || hit->deleted) { > + PyErr_Format(PyExc_KeyError, > + "Tried to delete nonexistent manifest > entry."); > + return -1; > + } > + self->dirty = true; > + hit->deleted = true; > + self->livelines--; > + return 0; > +} > + > +static int lazymanifest_setitem( > + lazymanifest *self, PyObject *key, PyObject *value) > +{ > + char *path; > + Py_ssize_t plen; > + PyObject *pyhash; > + Py_ssize_t hlen; > + char *hash; > + PyObject *pyflags; > + char *flags; > + Py_ssize_t flen; > + size_t dlen; > + char *dest; > + int i; > + line new; > + line *hit; > + if (!PyString_Check(key)) { > + PyErr_Format(PyExc_TypeError, > + "setitem: manifest keys must be a string."); > + return -1; > + } > + if (!value) { > + return lazymanifest_delitem(self, key); > + } > + if (!PyTuple_Check(value) || PyTuple_Size(value) != 2) { > + PyErr_Format(PyExc_TypeError, > + "Manifest values must be a tuple of (node, > flags)."); > + return -1; > + } > + if (PyString_AsStringAndSize(key, &path, &plen) == -1) { > + return -1; > + } > + > + pyhash = PyTuple_GetItem(value, 0); > + if (!PyString_Check(pyhash)) { > + PyErr_Format(PyExc_TypeError, > + "node must be a 20-byte string"); > + return -1; > + } > + hlen = PyString_Size(pyhash); > + /* Some parts of the codebase try and set 21 or 22 > + * byte "hash" values in order to perturb things for > + * status. We have to preserve at least the 21st > + * byte. Sigh. If there's a 22nd byte, we drop it on > + * the floor, which works fine. > + */ > + if (hlen != 20 && hlen != 21 && hlen != 22) { > + PyErr_Format(PyExc_TypeError, > + "node must be a 20-byte string"); > + return -1; > + } > + hash = PyString_AsString(pyhash); > + > + pyflags = PyTuple_GetItem(value, 1); > + if (!PyString_Check(pyflags) || PyString_Size(pyflags) > 1) { > + PyErr_Format(PyExc_TypeError, > + "flags must a 0 or 1 byte string"); > + return -1; > + } > + if (PyString_AsStringAndSize(pyflags, &flags, &flen) == -1) { > + return -1; > + } > + /* one null byte and one newline */ > + dlen = plen + 41 + flen + 1; > + dest = malloc(dlen); > + if (!dest) { > + PyErr_NoMemory(); > + return -1; > + } > + memcpy(dest, path, plen + 1); > + for (i = 0; i < 20; i++) { > + sprintf(dest + plen + 1 + (i * 2), "%02hhx", hash[i]); > + } > + memcpy(dest + plen + 41, flags, flen); > + dest[plen + 41 + flen] = '\n'; > + new.start = dest; > + new.len = dlen; > + new.hash_suffix = '\0'; > + if (hlen > 20) { > + new.hash_suffix = hash[20]; > + } > + new.from_malloc = true; /* is `start` a pointer we allocated? > */ > + new.deleted = false; /* is this entry deleted? */ > + hit = bsearch(&new, self->lines, self->numlines, > + sizeof(line), &linecmp); > + self->dirty = true; > + if (hit) { > + /* updating a line we already had */ > + if (hit->from_malloc) { > + free(hit->start); > + } > + if (hit->deleted) { > + self->livelines++; > + } > + *hit = new; > + } else { > + /* new line */ > + if (!realloc_if_full(self)) { > + PyErr_NoMemory(); > + return -1; > + } > + self->lines[self->numlines++] = new; > + self->livelines++; > + /* TODO: do a binary search and insert rather than > + * append and qsort. Also offer a batch-insert > + * interface, which should be a nice little > + * performance win. > + */ > + qsort(self->lines, self->numlines, sizeof(line), &linecmp); > + } > + return 0; > +} > + > +static PyMappingMethods lazymanifest_mapping_methods = { > + (lenfunc)lazymanifest_size, /* mp_length */ > + (binaryfunc)lazymanifest_getitem, /* mp_subscript */ > + (objobjargproc)lazymanifest_setitem, /* mp_ass_subscript */ > +}; > + > +/* sequence methods (important or __contains__ builds an iterator */ > + > +static int lazymanifest_contains(lazymanifest *self, PyObject *key) > +{ > + line needle; > + line *hit; > + if (!PyString_Check(key)) { > + /* Our keys are always strings, so if the contains > + * check is for a non-string, just return false. */ > + return 0; > + } > + needle.start = PyString_AsString(key); > + hit = bsearch(&needle, self->lines, self->numlines, sizeof(line), > + &linecmp); > + if (!hit || hit->deleted) { > + return 0; > + } > + return 1; > +} > + > +static PySequenceMethods lazymanifest_seq_meths = { > + (lenfunc)lazymanifest_size, /* sq_length */ > + 0, /* sq_concat */ > + 0, /* sq_repeat */ > + 0, /* sq_item */ > + 0, /* sq_slice */ > + 0, /* sq_ass_item */ > + 0, /* sq_ass_slice */ > + (objobjproc)lazymanifest_contains, /* sq_contains */ > + 0, /* sq_inplace_concat */ > + 0, /* sq_inplace_repeat */ > +}; > + > + > +/* Other methods (copy, diff, etc) */ > +static PyTypeObject lazymanifestType; > + > +/* If the manifest has changes, build the new manifest text and reindex > it. */ > +static int compact(lazymanifest *self) { > + int i; > + ssize_t need = 0; > + char *data; > + line *src, *dst; > + PyObject *pydata; > + if (!self->dirty) > + return 0; > + for (i = 0; i < self->numlines; i++) { > + if (!self->lines[i].deleted) { > + need += self->lines[i].len; > + } > + } > + pydata = PyString_FromStringAndSize(NULL, need); > + if (!pydata) > + return -1; > + data = PyString_AsString(pydata); > + if (!data) { > + return -1; > + } > + src = self->lines; > + dst = self->lines; > + for (i = 0; i < self->numlines; i++, src++) { > + char *tofree = NULL; > + if (src->from_malloc) { > + tofree = src->start; > + } > + if (!src->deleted) { > + memcpy(data, src->start, src->len); > + *dst = *src; > + dst->start = data; > + dst->from_malloc = false; > + data += dst->len; > + dst++; > + } > + free(tofree); > + } > + Py_DECREF(self->pydata); > + self->pydata = pydata; > + self->numlines = self->livelines; > + self->dirty = false; > + return 0; > +} > + > +static PyObject *lazymanifest_text(lazymanifest *self) > +{ > + if (compact(self) != 0) { > + PyErr_NoMemory(); > + return NULL; > + } > + Py_INCREF(self->pydata); > + return self->pydata; > +} > + > +static lazymanifest *lazymanifest_copy(lazymanifest *self) > +{ > + lazymanifest *copy = NULL; > + if (compact(self) != 0) { > + goto nomem; > + } > + copy = PyObject_New(lazymanifest, &lazymanifestType); > + if (!copy) { > + goto nomem; > + } > + copy->numlines = self->numlines; > + copy->livelines = self->livelines; > + copy->dirty = false; > + copy->lines = malloc(self->maxlines *sizeof(line)); > + if (!copy->lines) { > + goto nomem; > + } > + memcpy(copy->lines, self->lines, self->numlines * sizeof(line)); > + copy->maxlines = self->maxlines; > + copy->pydata = self->pydata; > + Py_INCREF(copy->pydata); > + return copy; > + nomem: > + PyErr_NoMemory(); > + Py_XDECREF(copy); > + return NULL; > +} > + > +static lazymanifest *lazymanifest_filtercopy( > + lazymanifest *self, PyObject *matchfn) > +{ > + lazymanifest *copy = NULL; > + int i; > + if (!PyCallable_Check(matchfn)) { > + PyErr_SetString(PyExc_TypeError, "matchfn must be > callable"); > + return NULL; > + } > + /* compact ourselves first to avoid double-frees later when we > + * compact tmp so that it doesn't have random pointers to our > + * underlying from_malloc-data (self->pydata is safe) */ > + if (compact(self) != 0) { > + goto nomem; > + } > + copy = PyObject_New(lazymanifest, &lazymanifestType); > + copy->dirty = true; > + copy->lines = malloc(self->maxlines * sizeof(line)); > + if (!copy->lines) { > + goto nomem; > + } > + copy->maxlines = self->maxlines; > + copy->numlines = 0; > + copy->pydata = self->pydata; > + Py_INCREF(self->pydata); > + for (i = 0; i < self->numlines; i++) { > + PyObject *arg = PyString_FromString(self->lines[i].start); > + PyObject *arglist = PyTuple_Pack(1, arg); > + PyObject *result = PyObject_CallObject(matchfn, arglist); > + Py_DECREF(arglist); > + Py_DECREF(arg); > + /* if the callback raised an exception, just let it > + * through and give up */ > + if (!result) { > + free(copy->lines); > + Py_DECREF(self->pydata); > + return NULL; > + } > + if (PyObject_IsTrue(result)) { > + assert(!(self->lines[i].from_malloc)); > + copy->lines[copy->numlines++] = self->lines[i]; > + } > + Py_DECREF(result); > + } > + copy->livelines = copy->numlines; > + return copy; > + nomem: > + PyErr_NoMemory(); > + Py_XDECREF(copy); > + return NULL; > +} > + > +static PyObject *lazymanifest_diff(lazymanifest *self, PyObject *args) > +{ > + lazymanifest *other; > + PyObject *pyclean = NULL; > + bool listclean; > + PyObject *emptyTup = NULL, *ret = NULL; > + PyObject *es; > + int sneedle = 0, oneedle = 0; > + if (!PyArg_ParseTuple(args, "O!|O", &lazymanifestType, &other, > &pyclean)) { > + return NULL; > + } > + listclean = (!pyclean) ? false : PyObject_IsTrue(pyclean); > + es = PyString_FromString(""); > + if (!es) { > + goto nomem; > + } > + emptyTup = PyTuple_Pack(2, Py_None, es); > + Py_DECREF(es); > + if (!emptyTup) { > + goto nomem; > + } > + ret = PyDict_New(); > + if (!ret) { > + goto nomem; > + } > + while (sneedle != self->numlines || oneedle != other->numlines) { > + line *left = self->lines + sneedle; > + line *right = other->lines + oneedle; > + int result; > + PyObject *key; > + PyObject *outer; > + /* If we're looking at a deleted entry and it's not > + * the end of the manifest, just skip it. */ > + if (left->deleted && sneedle < self->numlines) { > + sneedle++; > + continue; > + } > + if (right->deleted && oneedle < other->numlines) { > + oneedle++; > + continue; > + } > + /* if we're at the end of either manifest, then we > + * know the remaining items are adds so we can skip > + * the strcmp. */ > + if (sneedle == self->numlines) { > + result = 1; > + } else if (oneedle == other->numlines) { > + result = -1; > + } else { > + result = linecmp(left, right); > + } > + key = result <= 0 ? > + PyString_FromString(left->start) : > + PyString_FromString(right->start); > + if (!key) > + goto nomem; > + if (result < 0) { > + PyObject *l = hashflags(left); > + if (!l) { > + goto nomem; > + } > + outer = PyTuple_Pack(2, l, emptyTup); > + Py_DECREF(l); > + if (!outer) { > + goto nomem; > + } > + PyDict_SetItem(ret, key, outer); > + Py_DECREF(outer); > + sneedle++; > + } else if (result > 0) { > + PyObject *r = hashflags(right); > + if (!r) { > + goto nomem; > + } > + outer = PyTuple_Pack(2, emptyTup, r); > + Py_DECREF(r); > + if (!outer) { > + goto nomem; > + } > + PyDict_SetItem(ret, key, outer); > + Py_DECREF(outer); > + oneedle++; > + } else { > + /* file exists in both manifests */ > + if (left->len != right->len > + || memcmp(left->start, right->start, left->len) > + || left->hash_suffix != right->hash_suffix) { > + PyObject *l = hashflags(left); > + PyObject *r; > + if (!l) { > + goto nomem; > + } > + r = hashflags(right); > + if (!r) { > + Py_DECREF(l); > + goto nomem; > + } > + outer = PyTuple_Pack(2, l, r); > + Py_DECREF(l); > + Py_DECREF(r); > + if (!outer) { > + goto nomem; > + } > + PyDict_SetItem(ret, key, outer); > + Py_DECREF(outer); > + } else if (listclean) { > + PyDict_SetItem(ret, key, Py_None); > + } > + sneedle++; > + oneedle++; > + } > + Py_DECREF(key); > + } > + Py_DECREF(emptyTup); > + return ret; > + nomem: > + PyErr_NoMemory(); > + Py_XDECREF(ret); > + Py_XDECREF(emptyTup); > + return NULL; > +} > + > +static PyMethodDef lazymanifest_methods[] = { > + {"copy", (PyCFunction)lazymanifest_copy, METH_NOARGS, > + "Make a copy of this lazymanifest."}, > + {"filtercopy", (PyCFunction)lazymanifest_filtercopy, METH_O, > + "Make a copy of this manifest filtered by matchfn."}, > + {"diff", (PyCFunction)lazymanifest_diff, METH_VARARGS, > + "Compare this lazymanifest to another one."}, > + {"text", (PyCFunction)lazymanifest_text, METH_NOARGS, > + "Encode this manifest to text."}, > + {NULL}, > +}; > + > +static PyTypeObject lazymanifestType = { > + PyObject_HEAD_INIT(NULL) > + 0, /* ob_size */ > + "parsers.lazymanifest", /* tp_name */ > + sizeof(lazymanifest), /* tp_basicsize > */ > + 0, /* tp_itemsize */ > + (destructor)lazymanifest_dealloc, /* tp_dealloc */ > + 0, /* tp_print */ > + 0, /* tp_getattr */ > + 0, /* tp_setattr */ > + 0, /* tp_compare */ > + 0, /* tp_repr */ > + 0, /* tp_as_number > */ > + &lazymanifest_seq_meths, /* > tp_as_sequence */ > + &lazymanifest_mapping_methods, /* tp_as_mapping > */ > + 0, /* tp_hash */ > + 0, /* tp_call */ > + 0, /* tp_str */ > + 0, /* tp_getattro */ > + 0, /* tp_setattro */ > + 0, /* tp_as_buffer > */ > + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_SEQUENCE_IN, /* tp_flags */ > + "TODO(augie)", /* tp_doc */ > + 0, /* tp_traverse */ > + 0, /* tp_clear */ > + 0, /* > tp_richcompare */ > + 0, /* > tp_weaklistoffset */ > + (getiterfunc)lazymanifest_getiter, /* tp_iter */ > + 0, /* tp_iternext */ > + lazymanifest_methods, /* tp_methods */ > + 0, /* tp_members */ > + 0, /* tp_getset */ > + 0, /* tp_base */ > + 0, /* tp_dict */ > + 0, /* tp_descr_get > */ > + 0, /* tp_descr_set > */ > + 0, /* tp_dictoffset > */ > + (initproc)lazymanifest_init, /* tp_init */ > + 0, /* tp_alloc */ > +}; > + > +void manifest_module_init(PyObject * mod) > +{ > + lazymanifestType.tp_new = PyType_GenericNew; > + if (PyType_Ready(&lazymanifestType) < 0) > + return; > + Py_INCREF(&lazymanifestType); > + > + PyModule_AddObject(mod, "lazymanifest", > + (PyObject *)&lazymanifestType); > +} > diff --git a/mercurial/parsers.c b/mercurial/parsers.c > --- a/mercurial/parsers.c > +++ b/mercurial/parsers.c > @@ -71,7 +71,7 @@ static inline int hexdigit(const char *p > /* > * Turn a hex-encoded string into binary. > */ > -static PyObject *unhexlify(const char *str, int len) > +PyObject *unhexlify(const char *str, int len) > { > PyObject *ret; > char *d; > @@ -2318,6 +2318,7 @@ static PyMethodDef methods[] = { > }; > > void dirs_module_init(PyObject *mod); > +void manifest_module_init(PyObject *mod); > > static void module_init(PyObject *mod) > { > @@ -2332,6 +2333,7 @@ static void module_init(PyObject *mod) > PyModule_AddStringConstant(mod, "versionerrortext", > versionerrortext); > > dirs_module_init(mod); > + manifest_module_init(mod); > > indexType.tp_new = PyType_GenericNew; > if (PyType_Ready(&indexType) < 0 || > diff --git a/setup.py b/setup.py > --- a/setup.py > +++ b/setup.py > @@ -493,6 +493,7 @@ extmodules = [ > Extension('mercurial.mpatch', ['mercurial/mpatch.c'], > depends=common_depends), > Extension('mercurial.parsers', ['mercurial/dirs.c', > + 'mercurial/manifest.c', > 'mercurial/parsers.c', > 'mercurial/pathencode.c'], > depends=common_depends), > diff --git a/tests/test-manifest.py b/tests/test-manifest.py > new file mode 100644 > --- /dev/null > +++ b/tests/test-manifest.py > @@ -0,0 +1,221 @@ > +import binascii > +import unittest > +import itertools > + > +import silenttestrunner > + > +from mercurial import parsers > + > +HASH_1 = '1' * 40 > +HASH_2 = 'f' * 40 > +HASH_3 = '1234567890abcdef0987654321deadbeef0fcafe' > +A_SHORT_MANIFEST = ( > + 'bar/baz/qux.py\0%(hash2)s%(flag2)s\n' > + 'foo\0%(hash1)s%(flag1)s\n' > + ) % {'hash1': HASH_1, > + 'flag1': '', > + 'hash2': HASH_2, > + 'flag2': 'l', > + } > + > +HUGE_MANIFEST_ENTRIES = 200001 > + > +A_HUGE_MANIFEST = ''.join(sorted( > + 'file%d\0%s%s\n' % (i, h, f) for i, h, f in > + itertools.izip(xrange(200001), > + itertools.cycle((HASH_1, HASH_2)), > + itertools.cycle(('', 'x', 'l'))))) > + > +class testmanifest(unittest.TestCase): > + > + def assertIn(self, thing, container, msg=None): > + # assertIn new in 2.7, use it if available, otherwise polyfill > + sup = getattr(unittest.TestCase, 'assertIn', False) > + if sup: > + return sup(self, thing, container, msg=msg) > + if not msg: > + msg = 'Expected %r in %r' % (thing, container) > + self.assert_(thing in container, msg) > + > + def testEmptyManifest(self): > + m = parsers.lazymanifest('') > + self.assertEqual(0, len(m)) > + self.assertEqual([], list(m)) > + > + def testManifest(self): > + m = parsers.lazymanifest(A_SHORT_MANIFEST) > + want = [ > + ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'), > + ('foo', binascii.unhexlify(HASH_1), ''), > + ] > + self.assertEqual(len(want), len(m)) > + self.assertEqual(want, list(m)) > + self.assertEqual((binascii.unhexlify(HASH_1), ''), m['foo']) > + self.assertRaises(KeyError, lambda : m['wat']) > + self.assertEqual((binascii.unhexlify(HASH_2), 'l'), > + m['bar/baz/qux.py']) > + > + def testSetItem(self): > + want = binascii.unhexlify(HASH_1), '' > + > + m = parsers.lazymanifest('') > + m['a'] = want > + self.assertIn('a', m) > + self.assertEqual(want, m['a']) > + self.assertEqual('a\0' + HASH_1 + '\n', m.text()) > + > + m = parsers.lazymanifest(A_SHORT_MANIFEST) > + m['a'] = want > + self.assertEqual(want, m['a']) > + self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST, > + m.text()) > + m2 = m.copy() > + del m > + del m2 # make sure we don't double free() anything > + > + def testCompaction(self): > + unhex = binascii.unhexlify > + h1, h2 = unhex(HASH_1), unhex(HASH_2) > + m = parsers.lazymanifest(A_SHORT_MANIFEST) > + m['alpha'] = h1, '' > + m['beta'] = h2, '' > + del m['foo'] > + want = 'alpha\0%s\nbar/baz/qux.py\0%sl\nbeta\0%s\n' % ( > + HASH_1, HASH_2, HASH_2) > + self.assertEqual(want, m.text()) > + self.assertEqual(3, len(m)) > + self.assertEqual((h1, ''), m['alpha']) > + self.assertEqual((h2, ''), m['beta']) > + self.assertRaises(KeyError, lambda : m['foo']) > + w = [('alpha', h1, ''), ('bar/baz/qux.py', h2, 'l'), ('beta', h2, > '')] > + self.assertEqual(w, list(m)) > + > + def testSetGetNodeSuffix(self): > + clean = parsers.lazymanifest(A_SHORT_MANIFEST) > + m = parsers.lazymanifest(A_SHORT_MANIFEST) > + h, f = m['foo'] > + want = h + 'a', f > + # Merge code wants to set 21-byte fake hashes at times > + m['foo'] = want > + self.assertEqual(want, m['foo']) > + self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2), > 'l'), > + ('foo', binascii.unhexlify(HASH_1) + 'a', '')], > + list(m)) > + # Sometimes it even tries a 22-byte fake hash, but we can > + # return 21 and it'll work out > + m['foo'] = want[0] + '+', f > + self.assertEqual(want, m['foo']) > + # make sure the suffix survives a copy > + m2 = m.filtercopy(lambda x: x == 'foo') > + self.assertEqual(want, m2['foo']) > + self.assertEqual(1, len(m2)) > + self.assertEqual(('foo\0%s\n' % HASH_1), m2.text()) > + m2 = m.copy() > + self.assertEqual(want, m2['foo']) > + # suffix with iteration > + self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2), > 'l'), > + ('foo', want[0], '')], list(m)) > + # shows up in diff > + self.assertEqual({'foo': (want, (h, ''))}, m.diff(clean)) > + self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m)) > + > + def testFilterCopyException(self): > + m = parsers.lazymanifest(A_SHORT_MANIFEST) > + def filt(path): > + if path == 'foo': > + assert False > + return True > + self.assertRaises(AssertionError, m.filtercopy, filt) > + > + def testRemoveItem(self): > + m = parsers.lazymanifest(A_SHORT_MANIFEST) > + del m['foo'] > + self.assertRaises(KeyError, lambda : m['foo']) > + self.assertEqual(1, len(m)) > + self.assertEqual(1, len(list(m))) > + > + def testManifestDiff(self): > + MISSING = (None, '') > + addl = 'z-only-in-left\0' + HASH_1 + '\n' > + addr = 'z-only-in-right\0' + HASH_2 + 'x\n' > + left = parsers.lazymanifest( > + A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl) > + right = parsers.lazymanifest(A_SHORT_MANIFEST + addr) > + want = { > + 'foo': ((binascii.unhexlify(HASH_3), 'x'), > + (binascii.unhexlify(HASH_1), '')), > + 'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING), > + 'z-only-in-right': (MISSING, (binascii.unhexlify(HASH_2), > 'x')), > + } > + self.assertEqual(want, left.diff(right)) > + > + want = { > + 'bar/baz/qux.py': (MISSING, (binascii.unhexlify(HASH_2), > 'l')), > + 'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')), > + 'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')), > + } > + self.assertEqual(want, parsers.lazymanifest('').diff(left)) > + > + want = { > + 'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'), > MISSING), > + 'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING), > + 'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING), > + } > + self.assertEqual(want, left.diff(parsers.lazymanifest(''))) > + copy = right.copy() > + del copy['z-only-in-right'] > + del right['foo'] > + want = { > + 'foo': (MISSING, (binascii.unhexlify(HASH_1), '')), > + 'z-only-in-right': ((binascii.unhexlify(HASH_2), 'x'), > MISSING), > + } > + self.assertEqual(want, right.diff(copy)) > + > + short = parsers.lazymanifest(A_SHORT_MANIFEST) > + pruned = short.copy() > + del pruned['foo'] > + want = { > + 'foo': ((binascii.unhexlify(HASH_1), ''), MISSING), > + } > + self.assertEqual(want, short.diff(pruned)) > + want = { > + 'foo': (MISSING, (binascii.unhexlify(HASH_1), '')), > + } > + self.assertEqual(want, pruned.diff(short)) > + want = { > + 'bar/baz/qux.py': None, > + 'foo': (MISSING, (binascii.unhexlify(HASH_1), '')), > + } > + self.assertEqual(want, pruned.diff(short, True)) > + > + def testReversedLines(self): > + backwards = ''.join( > + l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if > l) > + try: > + parsers.lazymanifest(backwards) > + self.fail('Should have raised ValueError') > + except ValueError, v: > + self.assertIn('Manifest lines not in sorted order.', str(v)) > + > + def testNoTerminalNewline(self): > + try: > + parsers.lazymanifest(A_SHORT_MANIFEST + 'wat') > + self.fail('Should have raised ValueError') > + except ValueError, v: > + self.assertIn('Manifest did not end in a newline.', str(v)) > + > + def testNoNewLineAtAll(self): > + try: > + parsers.lazymanifest('wat') > + self.fail('Should have raised ValueError') > + except ValueError, v: > + self.assertIn('Manifest did not end in a newline.', str(v)) > + > + def testHugeManifest(self): > + m = parsers.lazymanifest(A_HUGE_MANIFEST) > + self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m)) > + self.assertEqual(len(m), len(list(m))) > + > + > +if __name__ == '__main__': > + silenttestrunner.main(__name__) > _______________________________________________ > Mercurial-devel mailing list > Mercurial-devel@selenic.com > http://selenic.com/mailman/listinfo/mercurial-devel >
Patch
diff --git a/mercurial/manifest.c b/mercurial/manifest.c new file mode 100644 --- /dev/null +++ b/mercurial/manifest.c @@ -0,0 +1,841 @@ +/* + * manifest.c - manifest type that does on-demand parsing. + * + * Copyright 2015, Google Inc. + * + * This software may be used and distributed according to the terms of + * the GNU General Public License, incorporated herein by reference. + */ +#include <assert.h> +#include <string.h> +#include <stdlib.h> + +#include <Python.h> + +/* VC9 doesn't include bool and lacks stdbool.h based on my searching */ +#ifdef _MSC_VER +#define true 1 +#define false 0 +typedef unsigned char bool; +#else +#include <stdbool.h> +#endif + +#define DEFAULT_LINES 100000 + +typedef struct { + char *start; + Py_ssize_t len; /* length of line including terminal newline */ + char hash_suffix; + bool from_malloc; + bool deleted; +} line; + +typedef struct { + PyObject_HEAD + PyObject *pydata; + line *lines; + int numlines; /* number of line entries */ + int livelines; /* number of non-deleted lines */ + int maxlines; /* allocated number of lines */ + bool dirty; +} lazymanifest; + +#define MANIFEST_OOM -1 +#define MANIFEST_NOT_SORTED -2 +#define MANIFEST_MALFORMED -3 + +/* defined in parsers.c */ +PyObject *unhexlify(const char *str, int len); + +/* get the length of the path for a line */ +static size_t pathlen(line *l) { + return strlen(l->start); +} + +/* get the node value of a single line */ +static PyObject *nodeof(line *l) { + char *s = l->start; + ssize_t llen = pathlen(l); + PyObject *hash = unhexlify(s + llen + 1, 40); + if (!hash) { + return NULL; + } + if (l->hash_suffix != '\0') { + char newhash[21]; + memcpy(newhash, PyString_AsString(hash), 20); + Py_DECREF(hash); + newhash[20] = l->hash_suffix; + hash = PyString_FromStringAndSize(newhash, 21); + } + return hash; +} + +/* get the node hash and flags of a line as a tuple */ +static PyObject *hashflags(line *l) +{ + char *s = l->start; + size_t plen = pathlen(l); + PyObject *hash = nodeof(l); + + /* 40 for hash, 1 for null byte, 1 for newline */ + size_t hplen = plen + 42; + Py_ssize_t flen = l->len - hplen; + PyObject *flags; + PyObject *tup; + + if (!hash) + return NULL; + flags = PyString_FromStringAndSize(s + hplen - 1, flen); + if (!flags) { + Py_DECREF(hash); + return NULL; + } + tup = PyTuple_Pack(2, hash, flags); + Py_DECREF(flags); + Py_DECREF(hash); + return tup; +} + +/* if we're about to run out of space in the line index, add more */ +static bool realloc_if_full(lazymanifest *self) +{ + if (self->numlines == self->maxlines) { + self->maxlines *= 2; + self->lines = realloc(self->lines, self->maxlines * sizeof(line)); + } + return self->lines; +} + +/* + * Find the line boundaries in the manifest that 'data' points to and store + * information about each line in 'self'. + */ +static int find_lines(lazymanifest *self, char *data, Py_ssize_t len) +{ + char *prev = NULL; + while (len > 0) { + line *l; + char *next = memchr(data, '\n', len); + if (!next) { + return MANIFEST_MALFORMED; + } + next++; /* advance past newline */ + if (!realloc_if_full(self)) { + return MANIFEST_OOM; /* no memory */ + } + if (prev && strcmp(prev, data) > -1) { + /* This data isn't sorted, so we have to abort. */ + return MANIFEST_NOT_SORTED; + } + l = self->lines + ((self->numlines)++); + l->start = data; + l->len = next - data; + l->hash_suffix = '\0'; + l->from_malloc = false; + l->deleted = false; + len = len - l->len; + prev = data; + data = next; + } + self->livelines = self->numlines; + return 0; +} + +static int lazymanifest_init(lazymanifest *self, PyObject *args) +{ + char *data; + Py_ssize_t len; + int err, ret; + PyObject *pydata; + if (!PyArg_ParseTuple(args, "S", &pydata)) { + return -1; + } + err = PyString_AsStringAndSize(pydata, &data, &len); + + self->dirty = false; + if (err == -1) + return -1; + self->pydata = pydata; + Py_INCREF(self->pydata); + Py_BEGIN_ALLOW_THREADS + self->lines = malloc(DEFAULT_LINES * sizeof(line)); + self->maxlines = DEFAULT_LINES; + self->numlines = 0; + if (!self->lines) + ret = -2; + else + ret = find_lines(self, data, len); + Py_END_ALLOW_THREADS + switch (ret) { + case 0: + break; + case MANIFEST_OOM: + PyErr_NoMemory(); + break; + case MANIFEST_NOT_SORTED: + PyErr_Format(PyExc_ValueError, + "Manifest lines not in sorted order."); + break; + case MANIFEST_MALFORMED: + PyErr_Format(PyExc_ValueError, + "Manifest did not end in a newline."); + break; + default: + PyErr_Format(PyExc_ValueError, + "Unknown problem parsing manifest."); + } + return ret == 0 ? 0 : -1; +} + +static void lazymanifest_dealloc(lazymanifest *self) +{ + /* free any extra lines we had to allocate */ + int i; + for (i = 0; i < self->numlines; i++) { + if (self->lines[i].from_malloc) { + free(self->lines[i].start); + } + } + if (self->lines) { + free(self->lines); + self->lines = NULL; + } + if (self->pydata) { + Py_DECREF(self->pydata); + self->pydata = NULL; + } + PyObject_Del(self); +} + +/* iteration support */ + +typedef struct { + PyObject_HEAD lazymanifest *m; + Py_ssize_t pos; +} lmIter; + +static void lmiter_dealloc(PyObject *o) +{ + lmIter *self = (lmIter *)o; + Py_DECREF(self->m); + PyObject_Del(self); +} + +static PyObject *lmiter_iternext(PyObject *o) +{ + size_t pl; + line *l; + Py_ssize_t consumed; + PyObject *ret = NULL, *path = NULL, *hash = NULL, *flags = NULL; + lmIter *self = (lmIter *)o; + do { + self->pos++; + if (self->pos >= self->m->numlines) { + goto bail; + } + /* skip over deleted manifest entries */ + } while (self->m->lines[self->pos].deleted); + l = self->m->lines + self->pos; + pl = pathlen(l); + path = PyString_FromStringAndSize(l->start, pl); + hash = nodeof(l); + consumed = pl + 41; + flags = PyString_FromStringAndSize(l->start + consumed, + l->len - consumed - 1); + if (!flags) { + goto bail; + } + ret = PyTuple_Pack(3, path, hash, flags); + bail: + Py_XDECREF(path); + Py_XDECREF(hash); + Py_XDECREF(flags); + return ret; +} + +static PyTypeObject lazymanifestIterator = { + PyObject_HEAD_INIT(NULL) + 0, /*ob_size */ + "parsers.lazymanifest.iterator", /*tp_name */ + sizeof(lmIter), /*tp_basicsize */ + 0, /*tp_itemsize */ + lmiter_dealloc, /*tp_dealloc */ + 0, /*tp_print */ + 0, /*tp_getattr */ + 0, /*tp_setattr */ + 0, /*tp_compare */ + 0, /*tp_repr */ + 0, /*tp_as_number */ + 0, /*tp_as_sequence */ + 0, /*tp_as_mapping */ + 0, /*tp_hash */ + 0, /*tp_call */ + 0, /*tp_str */ + 0, /*tp_getattro */ + 0, /*tp_setattro */ + 0, /*tp_as_buffer */ + /* tp_flags: Py_TPFLAGS_HAVE_ITER tells python to + use tp_iter and tp_iternext fields. */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_ITER, + "Iterator for a lazymanifest.", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + PyObject_SelfIter, /* tp_iter: __iter__() method */ + lmiter_iternext, /* tp_iternext: next() method */ +}; + +static lazymanifest *lazymanifest_copy(lazymanifest *self); + +static PyObject *lazymanifest_getiter(lazymanifest *self) +{ + lmIter *i = NULL; + lazymanifest *t = lazymanifest_copy(self); + if (!t) { + PyErr_NoMemory(); + return NULL; + } + i = PyObject_New(lmIter, &lazymanifestIterator); + if (i) { + i->m = t; + i->pos = -1; + } else { + Py_DECREF(t); + PyErr_NoMemory(); + } + return (PyObject *)i; +} + +/* __getitem__ and __setitem__ support */ + +static Py_ssize_t lazymanifest_size(lazymanifest *self) +{ + return self->livelines; +} + +static int linecmp(const void *left, const void *right) +{ + return strcmp(((const line *)left)->start, + ((const line *)right)->start); +} + +static PyObject *lazymanifest_getitem(lazymanifest *self, PyObject *key) +{ + line needle; + line *hit; + if (!PyString_Check(key)) { + PyErr_Format(PyExc_TypeError, + "getitem: manifest keys must be a string."); + return NULL; + } + needle.start = PyString_AsString(key); + hit = bsearch(&needle, self->lines, self->numlines, sizeof(line), + &linecmp); + if (!hit || hit->deleted) { + PyErr_Format(PyExc_KeyError, "No such manifest entry."); + return NULL; + } + return hashflags(hit); +} + +static int lazymanifest_delitem(lazymanifest *self, PyObject *key) +{ + line needle; + line *hit; + if (!PyString_Check(key)) { + PyErr_Format(PyExc_TypeError, + "delitem: manifest keys must be a string."); + return -1; + } + needle.start = PyString_AsString(key); + hit = bsearch(&needle, self->lines, self->numlines, sizeof(line), + &linecmp); + if (!hit || hit->deleted) { + PyErr_Format(PyExc_KeyError, + "Tried to delete nonexistent manifest entry."); + return -1; + } + self->dirty = true; + hit->deleted = true; + self->livelines--; + return 0; +} + +static int lazymanifest_setitem( + lazymanifest *self, PyObject *key, PyObject *value) +{ + char *path; + Py_ssize_t plen; + PyObject *pyhash; + Py_ssize_t hlen; + char *hash; + PyObject *pyflags; + char *flags; + Py_ssize_t flen; + size_t dlen; + char *dest; + int i; + line new; + line *hit; + if (!PyString_Check(key)) { + PyErr_Format(PyExc_TypeError, + "setitem: manifest keys must be a string."); + return -1; + } + if (!value) { + return lazymanifest_delitem(self, key); + } + if (!PyTuple_Check(value) || PyTuple_Size(value) != 2) { + PyErr_Format(PyExc_TypeError, + "Manifest values must be a tuple of (node, flags)."); + return -1; + } + if (PyString_AsStringAndSize(key, &path, &plen) == -1) { + return -1; + } + + pyhash = PyTuple_GetItem(value, 0); + if (!PyString_Check(pyhash)) { + PyErr_Format(PyExc_TypeError, + "node must be a 20-byte string"); + return -1; + } + hlen = PyString_Size(pyhash); + /* Some parts of the codebase try and set 21 or 22 + * byte "hash" values in order to perturb things for + * status. We have to preserve at least the 21st + * byte. Sigh. If there's a 22nd byte, we drop it on + * the floor, which works fine. + */ + if (hlen != 20 && hlen != 21 && hlen != 22) { + PyErr_Format(PyExc_TypeError, + "node must be a 20-byte string"); + return -1; + } + hash = PyString_AsString(pyhash); + + pyflags = PyTuple_GetItem(value, 1); + if (!PyString_Check(pyflags) || PyString_Size(pyflags) > 1) { + PyErr_Format(PyExc_TypeError, + "flags must a 0 or 1 byte string"); + return -1; + } + if (PyString_AsStringAndSize(pyflags, &flags, &flen) == -1) { + return -1; + } + /* one null byte and one newline */ + dlen = plen + 41 + flen + 1; + dest = malloc(dlen); + if (!dest) { + PyErr_NoMemory(); + return -1; + } + memcpy(dest, path, plen + 1); + for (i = 0; i < 20; i++) { + sprintf(dest + plen + 1 + (i * 2), "%02hhx", hash[i]); + } + memcpy(dest + plen + 41, flags, flen); + dest[plen + 41 + flen] = '\n'; + new.start = dest; + new.len = dlen; + new.hash_suffix = '\0'; + if (hlen > 20) { + new.hash_suffix = hash[20]; + } + new.from_malloc = true; /* is `start` a pointer we allocated? */ + new.deleted = false; /* is this entry deleted? */ + hit = bsearch(&new, self->lines, self->numlines, + sizeof(line), &linecmp); + self->dirty = true; + if (hit) { + /* updating a line we already had */ + if (hit->from_malloc) { + free(hit->start); + } + if (hit->deleted) { + self->livelines++; + } + *hit = new; + } else { + /* new line */ + if (!realloc_if_full(self)) { + PyErr_NoMemory(); + return -1; + } + self->lines[self->numlines++] = new; + self->livelines++; + /* TODO: do a binary search and insert rather than + * append and qsort. Also offer a batch-insert + * interface, which should be a nice little + * performance win. + */ + qsort(self->lines, self->numlines, sizeof(line), &linecmp); + } + return 0; +} + +static PyMappingMethods lazymanifest_mapping_methods = { + (lenfunc)lazymanifest_size, /* mp_length */ + (binaryfunc)lazymanifest_getitem, /* mp_subscript */ + (objobjargproc)lazymanifest_setitem, /* mp_ass_subscript */ +}; + +/* sequence methods (important or __contains__ builds an iterator */ + +static int lazymanifest_contains(lazymanifest *self, PyObject *key) +{ + line needle; + line *hit; + if (!PyString_Check(key)) { + /* Our keys are always strings, so if the contains + * check is for a non-string, just return false. */ + return 0; + } + needle.start = PyString_AsString(key); + hit = bsearch(&needle, self->lines, self->numlines, sizeof(line), + &linecmp); + if (!hit || hit->deleted) { + return 0; + } + return 1; +} + +static PySequenceMethods lazymanifest_seq_meths = { + (lenfunc)lazymanifest_size, /* sq_length */ + 0, /* sq_concat */ + 0, /* sq_repeat */ + 0, /* sq_item */ + 0, /* sq_slice */ + 0, /* sq_ass_item */ + 0, /* sq_ass_slice */ + (objobjproc)lazymanifest_contains, /* sq_contains */ + 0, /* sq_inplace_concat */ + 0, /* sq_inplace_repeat */ +}; + + +/* Other methods (copy, diff, etc) */ +static PyTypeObject lazymanifestType; + +/* If the manifest has changes, build the new manifest text and reindex it. */ +static int compact(lazymanifest *self) { + int i; + ssize_t need = 0; + char *data; + line *src, *dst; + PyObject *pydata; + if (!self->dirty) + return 0; + for (i = 0; i < self->numlines; i++) { + if (!self->lines[i].deleted) { + need += self->lines[i].len; + } + } + pydata = PyString_FromStringAndSize(NULL, need); + if (!pydata) + return -1; + data = PyString_AsString(pydata); + if (!data) { + return -1; + } + src = self->lines; + dst = self->lines; + for (i = 0; i < self->numlines; i++, src++) { + char *tofree = NULL; + if (src->from_malloc) { + tofree = src->start; + } + if (!src->deleted) { + memcpy(data, src->start, src->len); + *dst = *src; + dst->start = data; + dst->from_malloc = false; + data += dst->len; + dst++; + } + free(tofree); + } + Py_DECREF(self->pydata); + self->pydata = pydata; + self->numlines = self->livelines; + self->dirty = false; + return 0; +} + +static PyObject *lazymanifest_text(lazymanifest *self) +{ + if (compact(self) != 0) { + PyErr_NoMemory(); + return NULL; + } + Py_INCREF(self->pydata); + return self->pydata; +} + +static lazymanifest *lazymanifest_copy(lazymanifest *self) +{ + lazymanifest *copy = NULL; + if (compact(self) != 0) { + goto nomem; + } + copy = PyObject_New(lazymanifest, &lazymanifestType); + if (!copy) { + goto nomem; + } + copy->numlines = self->numlines; + copy->livelines = self->livelines; + copy->dirty = false; + copy->lines = malloc(self->maxlines *sizeof(line)); + if (!copy->lines) { + goto nomem; + } + memcpy(copy->lines, self->lines, self->numlines * sizeof(line)); + copy->maxlines = self->maxlines; + copy->pydata = self->pydata; + Py_INCREF(copy->pydata); + return copy; + nomem: + PyErr_NoMemory(); + Py_XDECREF(copy); + return NULL; +} + +static lazymanifest *lazymanifest_filtercopy( + lazymanifest *self, PyObject *matchfn) +{ + lazymanifest *copy = NULL; + int i; + if (!PyCallable_Check(matchfn)) { + PyErr_SetString(PyExc_TypeError, "matchfn must be callable"); + return NULL; + } + /* compact ourselves first to avoid double-frees later when we + * compact tmp so that it doesn't have random pointers to our + * underlying from_malloc-data (self->pydata is safe) */ + if (compact(self) != 0) { + goto nomem; + } + copy = PyObject_New(lazymanifest, &lazymanifestType); + copy->dirty = true; + copy->lines = malloc(self->maxlines * sizeof(line)); + if (!copy->lines) { + goto nomem; + } + copy->maxlines = self->maxlines; + copy->numlines = 0; + copy->pydata = self->pydata; + Py_INCREF(self->pydata); + for (i = 0; i < self->numlines; i++) { + PyObject *arg = PyString_FromString(self->lines[i].start); + PyObject *arglist = PyTuple_Pack(1, arg); + PyObject *result = PyObject_CallObject(matchfn, arglist); + Py_DECREF(arglist); + Py_DECREF(arg); + /* if the callback raised an exception, just let it + * through and give up */ + if (!result) { + free(copy->lines); + Py_DECREF(self->pydata); + return NULL; + } + if (PyObject_IsTrue(result)) { + assert(!(self->lines[i].from_malloc)); + copy->lines[copy->numlines++] = self->lines[i]; + } + Py_DECREF(result); + } + copy->livelines = copy->numlines; + return copy; + nomem: + PyErr_NoMemory(); + Py_XDECREF(copy); + return NULL; +} + +static PyObject *lazymanifest_diff(lazymanifest *self, PyObject *args) +{ + lazymanifest *other; + PyObject *pyclean = NULL; + bool listclean; + PyObject *emptyTup = NULL, *ret = NULL; + PyObject *es; + int sneedle = 0, oneedle = 0; + if (!PyArg_ParseTuple(args, "O!|O", &lazymanifestType, &other, &pyclean)) { + return NULL; + } + listclean = (!pyclean) ? false : PyObject_IsTrue(pyclean); + es = PyString_FromString(""); + if (!es) { + goto nomem; + } + emptyTup = PyTuple_Pack(2, Py_None, es); + Py_DECREF(es); + if (!emptyTup) { + goto nomem; + } + ret = PyDict_New(); + if (!ret) { + goto nomem; + } + while (sneedle != self->numlines || oneedle != other->numlines) { + line *left = self->lines + sneedle; + line *right = other->lines + oneedle; + int result; + PyObject *key; + PyObject *outer; + /* If we're looking at a deleted entry and it's not + * the end of the manifest, just skip it. */ + if (left->deleted && sneedle < self->numlines) { + sneedle++; + continue; + } + if (right->deleted && oneedle < other->numlines) { + oneedle++; + continue; + } + /* if we're at the end of either manifest, then we + * know the remaining items are adds so we can skip + * the strcmp. */ + if (sneedle == self->numlines) { + result = 1; + } else if (oneedle == other->numlines) { + result = -1; + } else { + result = linecmp(left, right); + } + key = result <= 0 ? + PyString_FromString(left->start) : + PyString_FromString(right->start); + if (!key) + goto nomem; + if (result < 0) { + PyObject *l = hashflags(left); + if (!l) { + goto nomem; + } + outer = PyTuple_Pack(2, l, emptyTup); + Py_DECREF(l); + if (!outer) { + goto nomem; + } + PyDict_SetItem(ret, key, outer); + Py_DECREF(outer); + sneedle++; + } else if (result > 0) { + PyObject *r = hashflags(right); + if (!r) { + goto nomem; + } + outer = PyTuple_Pack(2, emptyTup, r); + Py_DECREF(r); + if (!outer) { + goto nomem; + } + PyDict_SetItem(ret, key, outer); + Py_DECREF(outer); + oneedle++; + } else { + /* file exists in both manifests */ + if (left->len != right->len + || memcmp(left->start, right->start, left->len) + || left->hash_suffix != right->hash_suffix) { + PyObject *l = hashflags(left); + PyObject *r; + if (!l) { + goto nomem; + } + r = hashflags(right); + if (!r) { + Py_DECREF(l); + goto nomem; + } + outer = PyTuple_Pack(2, l, r); + Py_DECREF(l); + Py_DECREF(r); + if (!outer) { + goto nomem; + } + PyDict_SetItem(ret, key, outer); + Py_DECREF(outer); + } else if (listclean) { + PyDict_SetItem(ret, key, Py_None); + } + sneedle++; + oneedle++; + } + Py_DECREF(key); + } + Py_DECREF(emptyTup); + return ret; + nomem: + PyErr_NoMemory(); + Py_XDECREF(ret); + Py_XDECREF(emptyTup); + return NULL; +} + +static PyMethodDef lazymanifest_methods[] = { + {"copy", (PyCFunction)lazymanifest_copy, METH_NOARGS, + "Make a copy of this lazymanifest."}, + {"filtercopy", (PyCFunction)lazymanifest_filtercopy, METH_O, + "Make a copy of this manifest filtered by matchfn."}, + {"diff", (PyCFunction)lazymanifest_diff, METH_VARARGS, + "Compare this lazymanifest to another one."}, + {"text", (PyCFunction)lazymanifest_text, METH_NOARGS, + "Encode this manifest to text."}, + {NULL}, +}; + +static PyTypeObject lazymanifestType = { + PyObject_HEAD_INIT(NULL) + 0, /* ob_size */ + "parsers.lazymanifest", /* tp_name */ + sizeof(lazymanifest), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)lazymanifest_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + &lazymanifest_seq_meths, /* tp_as_sequence */ + &lazymanifest_mapping_methods, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_SEQUENCE_IN, /* tp_flags */ + "TODO(augie)", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + (getiterfunc)lazymanifest_getiter, /* tp_iter */ + 0, /* tp_iternext */ + lazymanifest_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)lazymanifest_init, /* tp_init */ + 0, /* tp_alloc */ +}; + +void manifest_module_init(PyObject * mod) +{ + lazymanifestType.tp_new = PyType_GenericNew; + if (PyType_Ready(&lazymanifestType) < 0) + return; + Py_INCREF(&lazymanifestType); + + PyModule_AddObject(mod, "lazymanifest", + (PyObject *)&lazymanifestType); +} diff --git a/mercurial/parsers.c b/mercurial/parsers.c --- a/mercurial/parsers.c +++ b/mercurial/parsers.c @@ -71,7 +71,7 @@ static inline int hexdigit(const char *p /* * Turn a hex-encoded string into binary. */ -static PyObject *unhexlify(const char *str, int len) +PyObject *unhexlify(const char *str, int len) { PyObject *ret; char *d; @@ -2318,6 +2318,7 @@ static PyMethodDef methods[] = { }; void dirs_module_init(PyObject *mod); +void manifest_module_init(PyObject *mod); static void module_init(PyObject *mod) { @@ -2332,6 +2333,7 @@ static void module_init(PyObject *mod) PyModule_AddStringConstant(mod, "versionerrortext", versionerrortext); dirs_module_init(mod); + manifest_module_init(mod); indexType.tp_new = PyType_GenericNew; if (PyType_Ready(&indexType) < 0 || diff --git a/setup.py b/setup.py --- a/setup.py +++ b/setup.py @@ -493,6 +493,7 @@ extmodules = [ Extension('mercurial.mpatch', ['mercurial/mpatch.c'], depends=common_depends), Extension('mercurial.parsers', ['mercurial/dirs.c', + 'mercurial/manifest.c', 'mercurial/parsers.c', 'mercurial/pathencode.c'], depends=common_depends), diff --git a/tests/test-manifest.py b/tests/test-manifest.py new file mode 100644 --- /dev/null +++ b/tests/test-manifest.py @@ -0,0 +1,221 @@ +import binascii +import unittest +import itertools + +import silenttestrunner + +from mercurial import parsers + +HASH_1 = '1' * 40 +HASH_2 = 'f' * 40 +HASH_3 = '1234567890abcdef0987654321deadbeef0fcafe' +A_SHORT_MANIFEST = ( + 'bar/baz/qux.py\0%(hash2)s%(flag2)s\n' + 'foo\0%(hash1)s%(flag1)s\n' + ) % {'hash1': HASH_1, + 'flag1': '', + 'hash2': HASH_2, + 'flag2': 'l', + } + +HUGE_MANIFEST_ENTRIES = 200001 + +A_HUGE_MANIFEST = ''.join(sorted( + 'file%d\0%s%s\n' % (i, h, f) for i, h, f in + itertools.izip(xrange(200001), + itertools.cycle((HASH_1, HASH_2)), + itertools.cycle(('', 'x', 'l'))))) + +class testmanifest(unittest.TestCase): + + def assertIn(self, thing, container, msg=None): + # assertIn new in 2.7, use it if available, otherwise polyfill + sup = getattr(unittest.TestCase, 'assertIn', False) + if sup: + return sup(self, thing, container, msg=msg) + if not msg: + msg = 'Expected %r in %r' % (thing, container) + self.assert_(thing in container, msg) + + def testEmptyManifest(self): + m = parsers.lazymanifest('') + self.assertEqual(0, len(m)) + self.assertEqual([], list(m)) + + def testManifest(self): + m = parsers.lazymanifest(A_SHORT_MANIFEST) + want = [ + ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'), + ('foo', binascii.unhexlify(HASH_1), ''), + ] + self.assertEqual(len(want), len(m)) + self.assertEqual(want, list(m)) + self.assertEqual((binascii.unhexlify(HASH_1), ''), m['foo']) + self.assertRaises(KeyError, lambda : m['wat']) + self.assertEqual((binascii.unhexlify(HASH_2), 'l'), + m['bar/baz/qux.py']) + + def testSetItem(self): + want = binascii.unhexlify(HASH_1), '' + + m = parsers.lazymanifest('') + m['a'] = want + self.assertIn('a', m) + self.assertEqual(want, m['a']) + self.assertEqual('a\0' + HASH_1 + '\n', m.text()) + + m = parsers.lazymanifest(A_SHORT_MANIFEST) + m['a'] = want + self.assertEqual(want, m['a']) + self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST, + m.text()) + m2 = m.copy() + del m + del m2 # make sure we don't double free() anything + + def testCompaction(self): + unhex = binascii.unhexlify + h1, h2 = unhex(HASH_1), unhex(HASH_2) + m = parsers.lazymanifest(A_SHORT_MANIFEST) + m['alpha'] = h1, '' + m['beta'] = h2, '' + del m['foo'] + want = 'alpha\0%s\nbar/baz/qux.py\0%sl\nbeta\0%s\n' % ( + HASH_1, HASH_2, HASH_2) + self.assertEqual(want, m.text()) + self.assertEqual(3, len(m)) + self.assertEqual((h1, ''), m['alpha']) + self.assertEqual((h2, ''), m['beta']) + self.assertRaises(KeyError, lambda : m['foo']) + w = [('alpha', h1, ''), ('bar/baz/qux.py', h2, 'l'), ('beta', h2, '')] + self.assertEqual(w, list(m)) + + def testSetGetNodeSuffix(self): + clean = parsers.lazymanifest(A_SHORT_MANIFEST) + m = parsers.lazymanifest(A_SHORT_MANIFEST) + h, f = m['foo'] + want = h + 'a', f + # Merge code wants to set 21-byte fake hashes at times + m['foo'] = want + self.assertEqual(want, m['foo']) + self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'), + ('foo', binascii.unhexlify(HASH_1) + 'a', '')], + list(m)) + # Sometimes it even tries a 22-byte fake hash, but we can + # return 21 and it'll work out + m['foo'] = want[0] + '+', f + self.assertEqual(want, m['foo']) + # make sure the suffix survives a copy + m2 = m.filtercopy(lambda x: x == 'foo') + self.assertEqual(want, m2['foo']) + self.assertEqual(1, len(m2)) + self.assertEqual(('foo\0%s\n' % HASH_1), m2.text()) + m2 = m.copy() + self.assertEqual(want, m2['foo']) + # suffix with iteration + self.assertEqual([('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'), + ('foo', want[0], '')], list(m)) + # shows up in diff + self.assertEqual({'foo': (want, (h, ''))}, m.diff(clean)) + self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m)) + + def testFilterCopyException(self): + m = parsers.lazymanifest(A_SHORT_MANIFEST) + def filt(path): + if path == 'foo': + assert False + return True + self.assertRaises(AssertionError, m.filtercopy, filt) + + def testRemoveItem(self): + m = parsers.lazymanifest(A_SHORT_MANIFEST) + del m['foo'] + self.assertRaises(KeyError, lambda : m['foo']) + self.assertEqual(1, len(m)) + self.assertEqual(1, len(list(m))) + + def testManifestDiff(self): + MISSING = (None, '') + addl = 'z-only-in-left\0' + HASH_1 + '\n' + addr = 'z-only-in-right\0' + HASH_2 + 'x\n' + left = parsers.lazymanifest( + A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl) + right = parsers.lazymanifest(A_SHORT_MANIFEST + addr) + want = { + 'foo': ((binascii.unhexlify(HASH_3), 'x'), + (binascii.unhexlify(HASH_1), '')), + 'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING), + 'z-only-in-right': (MISSING, (binascii.unhexlify(HASH_2), 'x')), + } + self.assertEqual(want, left.diff(right)) + + want = { + 'bar/baz/qux.py': (MISSING, (binascii.unhexlify(HASH_2), 'l')), + 'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')), + 'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')), + } + self.assertEqual(want, parsers.lazymanifest('').diff(left)) + + want = { + 'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'), MISSING), + 'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING), + 'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING), + } + self.assertEqual(want, left.diff(parsers.lazymanifest(''))) + copy = right.copy() + del copy['z-only-in-right'] + del right['foo'] + want = { + 'foo': (MISSING, (binascii.unhexlify(HASH_1), '')), + 'z-only-in-right': ((binascii.unhexlify(HASH_2), 'x'), MISSING), + } + self.assertEqual(want, right.diff(copy)) + + short = parsers.lazymanifest(A_SHORT_MANIFEST) + pruned = short.copy() + del pruned['foo'] + want = { + 'foo': ((binascii.unhexlify(HASH_1), ''), MISSING), + } + self.assertEqual(want, short.diff(pruned)) + want = { + 'foo': (MISSING, (binascii.unhexlify(HASH_1), '')), + } + self.assertEqual(want, pruned.diff(short)) + want = { + 'bar/baz/qux.py': None, + 'foo': (MISSING, (binascii.unhexlify(HASH_1), '')), + } + self.assertEqual(want, pruned.diff(short, True)) + + def testReversedLines(self): + backwards = ''.join( + l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if l) + try: + parsers.lazymanifest(backwards) + self.fail('Should have raised ValueError') + except ValueError, v: + self.assertIn('Manifest lines not in sorted order.', str(v)) + + def testNoTerminalNewline(self): + try: + parsers.lazymanifest(A_SHORT_MANIFEST + 'wat') + self.fail('Should have raised ValueError') + except ValueError, v: + self.assertIn('Manifest did not end in a newline.', str(v)) + + def testNoNewLineAtAll(self): + try: + parsers.lazymanifest('wat') + self.fail('Should have raised ValueError') + except ValueError, v: + self.assertIn('Manifest did not end in a newline.', str(v)) + + def testHugeManifest(self): + m = parsers.lazymanifest(A_HUGE_MANIFEST) + self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m)) + self.assertEqual(len(m), len(list(m))) + + +if __name__ == '__main__': + silenttestrunner.main(__name__)