Patchwork [rfc] manifest: write a more efficient version of lazymanifest, in pure python

login
register
mail settings
Submitter Maciej Fijalkowski
Date Aug. 20, 2016, 9:02 p.m.
Message ID <21b2401d468d6b24c165.1471726929@brick.arcode.com>
Download mbox | patch
Permalink /patch/16364/
State Changes Requested
Headers show

Comments

Maciej Fijalkowski - Aug. 20, 2016, 9:02 p.m.
# HG changeset patch
# User Maciej Fijalkowski <fijall@gmail.com>
# Date 1471726818 -7200
#      Sat Aug 20 23:00:18 2016 +0200
# Node ID 21b2401d468d6b24c1658468e4fc5ce8744f925b
# Parent  300f14ea21432face8d7e6cdcf92ba9d2f1f92dc
manifest: write a more efficient version of lazymanifest, in pure python

Questions outsdanding:
* who calls filtercopy? noone in tests at the very least
* are the performance tradeoffs ok here, notably __delitem__?
* should we use mmap instead of reading stuff from a file?
* current version does not support 21 or 22 long hashes, why are they necessary?
Augie Fackler - Aug. 25, 2016, 3:55 a.m.
> On Aug 20, 2016, at 5:02 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
> 
> # HG changeset patch
> # User Maciej Fijalkowski <fijall@gmail.com>
> # Date 1471726818 -7200
> #      Sat Aug 20 23:00:18 2016 +0200
> # Node ID 21b2401d468d6b24c1658468e4fc5ce8744f925b
> # Parent  300f14ea21432face8d7e6cdcf92ba9d2f1f92dc
> manifest: write a more efficient version of lazymanifest, in pure python
> 
> Questions outsdanding:
> * who calls filtercopy? noone in tests at the very least

manifest.py line 287 or thereabouts: it’s used for matches() on a manifest. test-status-rev.t will exercise that codepath if that helps.

> * are the performance tradeoffs ok here, notably __delitem__?

Probably. It looks very similar to the C version. Deleting a ton of entries from a large manifest is probably still tragic, since it’ll be a lot of copies, but for a first pass this is already so much better than the naive version that was there...

> * should we use mmap instead of reading stuff from a file?

This I can’t answer. Maybe?

> * current version does not support 21 or 22 long hashes, why are they necessary?

There are a handful of (weird) places that do something like poke a “+” on the end of a hash in the manifest to mark it as dirty, but that is never saved to disk.

> 
> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> --- a/mercurial/manifest.py
> +++ b/mercurial/manifest.py
> @@ -104,68 +104,277 @@
>     _checkforbidden(files)
>     return ''.join(lines)
> 
> -class _lazymanifest(dict):
> -    """This is the pure implementation of lazymanifest.
> -
> -    It has not been optimized *at all* and is not lazy.
> -    """
> +class LazyManifestIter(object):
> +    def __init__(self, lm):
> +        self.pos = 0
> +        self.lm = lm
> 
> -    def __init__(self, data):
> -        dict.__init__(self)
> -        for f, n, fl in _parse(data):
> -            self[f] = n, fl
> +    def next(self):
> +        try:
> +            data, pos = self.lm._get(self.pos)
> +        except IndexError:
> +            raise StopIteration
> +        if pos == -1:
> +            self.pos += 1
> +            return data[0]
> +        self.pos += 1
> +        zeropos = data.find('\x00', pos)
> +        return data[pos:zeropos]
> 
> -    def __setitem__(self, k, v):
> -        node, flag = v
> -        assert node is not None
> -        if len(node) > 21:
> -            node = node[:21] # match c implementation behavior
> -        dict.__setitem__(self, k, (node, flag))
> +class LazyManifestIterEntries(object):
> +    def __init__(self, lm):
> +        self.lm = lm
> +        self.pos = 0
> 
>     def __iter__(self):
> -        return iter(sorted(dict.keys(self)))
> +        return self
> 
> -    def iterkeys(self):
> -        return iter(sorted(dict.keys(self)))
> +    def next(self):
> +        try:
> +            data, pos = self.lm._get(self.pos)
> +        except IndexError:
> +            raise StopIteration
> +        if pos == -1:
> +            self.pos += 1
> +            return data
> +        zeropos = data.find('\x00', pos)
> +        hashval = unhexlify(data, self.lm.extrainfo[self.pos],
> +                            zeropos + 1, 40)
> +        flags = self.lm._getflags(data, self.pos, zeropos)
> +        self.pos += 1
> +        return (data[pos:zeropos], hashval, flags)
> 
> -    def iterentries(self):
> -        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
> +def unhexlify(data, extra, pos, length):
> +    s = data[pos:pos+length]
> +    if extra:
> +        s += extra & 0xff
> +    return s.decode("hex")
> +
> +class _lazymanifest(object):
> +    def __init__(self, data, positions=None, extrainfo=None, extradata=None):
> +        if positions is None:
> +            self.positions = self.find_lines(data)
> +            self.extrainfo = [0] * len(self.positions)
> +            self.data = data
> +            self.extradata = []
> +        else:
> +            self.positions = positions[:]
> +            self.extrainfo = extrainfo[:]
> +            self.extradata = extradata[:]
> +            self.data = data
> +
> +    def find_lines(self, data):
> +        if not data:
> +            return []
> +        pos = data.find("\n")
> +        positions = [0]
> +        while pos < len(data) - 1 and pos != -1:
> +            positions.append(pos + 1)
> +            pos = data.find("\n", pos + 1)
> +        return positions
> +
> +    def _get(self, index):
> +        # get the position encoded in pos:
> +        #   positive number is an index in 'data'
> +        #   negative number is in extrapieces
> +        pos = self.positions[index]
> +        if pos >= 0:
> +            return self.data, pos
> +        return self.extradata[-pos-1], -1
> +
> +    def _getkey(self, pos):
> +        if pos >= 0:
> +            return self.data[pos:self.data.find('\x00', pos + 1)]
> +        return self.extradata[-pos-1][0]
> +
> +    def bsearch(self, key):
> +        first = 0
> +        last = len(self.positions) - 1
> +
> +        while first <= last:
> +            midpoint = (first + last)//2
> +            nextpos = self.positions[midpoint]
> +            candidate = self._getkey(nextpos)
> +            r = cmp(key, candidate)
> +            if r == 0:
> +                return midpoint
> +            else:
> +                if r < 0:
> +                    last = midpoint - 1
> +                else:
> +                    first = midpoint + 1
> +        return -1
> +
> +    def bsearch2(self, key):
> +        # same as the above, but will always return the position
> +        # done for performance reasons
> +        first = 0
> +        last = len(self.positions) - 1
> +
> +        while first <= last:
> +            midpoint = (first + last)//2
> +            nextpos = self.positions[midpoint]
> +            candidate = self._getkey(nextpos)
> +            r = cmp(key, candidate)
> +            if r == 0:
> +                return (midpoint, True)
> +            else:
> +                if r < 0:
> +                    last = midpoint - 1
> +                else:
> +                    first = midpoint + 1
> +        return (first, False)
> +
> +    def __contains__(self, key):
> +        return self.bsearch(key) != -1
> +
> +    def _getflags(self, data, needle, pos):
> +        start = pos + 41
> +        if needle < 0 or needle == len(self.positions) - 1:
> +            end = len(data) - 1
> +        else:
> +            end = data.find("\n", start)
> +        if start == end:
> +            return ''
> +        return self.data[start:end]
> +
> +    def __getitem__(self, key):
> +        if not isinstance(key, str):
> +            raise TypeError("setitem: manifest keys must be a string.")
> +        needle = self.bsearch(key)
> +        if needle == -1:
> +            raise KeyError
> +        data, pos = self._get(needle)
> +        if pos == -1:
> +            return (data[1], data[2])
> +        zeropos = data.find('\x00', pos)
> +        assert 0 <= needle <= len(self.positions)
> +        assert len(self.extrainfo) == len(self.positions)
> +        hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
> +        flags = self._getflags(data, needle, zeropos)
> +        return (hashval, flags)
> +
> +    def __delitem__(self, key):
> +        needle, found = self.bsearch2(key)
> +        if not found:
> +            raise KeyError
> +        cur = self.positions[needle]
> +        self.positions = self.positions[:needle] + self.positions[needle + 1:]
> +        if cur >= 0:
> +            lastpos = self.data.find("\n", cur)
> +            if lastpos == -1:
> +                self.data = self.data[:cur]
> +            else:
> +                self.data = self.data[:cur] + self.data[lastpos:]
> +
> +    def __setitem__(self, key, value):
> +        if not isinstance(key, str):
> +            raise TypeError("setitem: manifest keys must be a string.")
> +        if not isinstance(value, tuple) or len(value) != 2:
> +            raise TypeError("Manifest values must be a tuple of (node, flags).")
> +        hashval = value[0]
> +        if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
> +            raise TypeError("node must be a 20-byte string")
> +        if len(hashval) != 20:
> +            raise TypeError
> +        flags = value[1]
> +        if not isinstance(flags, str) or len(flags) > 1:
> +            raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
> +        needle, found = self.bsearch2(key)
> +        if found:
> +            # put the item
> +            pos = self.positions[needle]
> +            if pos < 0:
> +                self.extradata[-pos-1] = (key, value[0], value[1])
> +            else:
> +                # just don't bother
> +                self.extradata.append((key, value[0], value[1]))
> +                self.positions[needle] = -len(self.extradata)
> +        else:
> +            # not found, put it in with extra positions
> +            self.extradata.append((key, value[0], value[1]))
> +            self.positions = self.positions[:needle] + [-len(self.extradata)] + self.positions[needle:]
> +            self.extrainfo = self.extrainfo[:needle] + [0] + self.extrainfo[needle:]
> 
>     def copy(self):
> -        c = _lazymanifest('')
> -        c.update(self)
> -        return c
> +        # XXX call _compact like in C?
> +        return _lazymanifest(self.data, self.positions, self.extrainfo,
> +            self.extradata)
> +
> +    def _compact(self):
> +        # hopefully not called TOO often
> +        def find_next_position(positions, start, lgt):
> +            i = start
> +            while i < len(positions):
> +                if positions[i] >= 0:
> +                    return positions[i]
> +                i += 1
> +            return lgt
> +
> +        if len(self.extradata) == 0:
> +            return
> +        l = []
> +        last_cut = 0
> +        i = 0
> +        offset = 0
> +        while i < len(self.positions):
> +            if self.positions[i] >= 0:
> +                cur = self.positions[i]
> +                last_cut = cur
> +                while True:
> +                    self.positions[i] = offset
> +                    i += 1
> +                    if i == len(self.positions) or self.positions[i] < 0:
> +                        break
> +                    offset += self.positions[i] - cur
> +                    cur = self.positions[i]
> +                end_cut = find_next_position(self.positions, i, len(self.data))
> +                offset += end_cut - cur
> +                l.append(self.data[last_cut:end_cut])
> +            else:
> +                while i < len(self.positions) and self.positions[i] < 0:
> +                    cur = self.positions[i]
> +                    l.append(self._pack(self.extradata[-cur-1]))
> +                    self.positions[i] = offset
> +                    offset += len(l[-1])
> +                    i += 1
> +        self.data = ''.join(l)
> +        self.extradata = []
> +        self.extrainfo = [0] * len(self.positions) # XXX not true that it's always full of zeros
> +
> +    def _pack(self, d):
> +        return d[0] + '\x00' + d[1].encode('hex') + d[2] + '\n'
> +
> +    def text(self):
> +        self._compact()
> +        return self.data
> 
>     def diff(self, m2, clean=False):
>         '''Finds changes between the current manifest and m2.'''
> +        # XXX think whether efficiency matters here
>         diff = {}
> 
> -        for fn, e1 in self.iteritems():
> +        for fn, e1, flags in self.iterentries():
>             if fn not in m2:
> -                diff[fn] = e1, (None, '')
> +                diff[fn] = (e1, flags), (None, '')
>             else:
>                 e2 = m2[fn]
>                 if e1 != e2:
> -                    diff[fn] = e1, e2
> +                    diff[fn] = (e1, flags), e2
>                 elif clean:
>                     diff[fn] = None
> 
> -        for fn, e2 in m2.iteritems():
> +        for fn, e2, flags in m2.iterentries():
>             if fn not in self:
> -                diff[fn] = (None, ''), e2
> +                diff[fn] = (None, ''), (e2, flags)
> 
>         return diff
> 
> -    def filtercopy(self, filterfn):
> -        c = _lazymanifest('')
> -        for f, n, fl in self.iterentries():
> -            if filterfn(f):
> -                c[f] = n, fl
> -        return c
> +    def iterentries(self):
> +        return LazyManifestIterEntries(self)
> 
> -    def text(self):
> -        """Get the full data of this manifest as a bytestring."""
> -        return _textv1(self.iterentries())
> +    def __iter__(self):
> +        return LazyManifestIter(self)
> 
> try:
>     _lazymanifest = parsers.lazymanifest
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Matt Mackall - Aug. 25, 2016, 11 p.m.
On Wed, 2016-08-24 at 23:55 -0400, Augie Fackler wrote:
> > 
> > On Aug 20, 2016, at 5:02 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
> > 
> > # HG changeset patch
> > # User Maciej Fijalkowski <fijall@gmail.com>
> > # Date 1471726818 -7200
> > #      Sat Aug 20 23:00:18 2016 +0200
> > # Node ID 21b2401d468d6b24c1658468e4fc5ce8744f925b
> > # Parent  300f14ea21432face8d7e6cdcf92ba9d2f1f92dc
> > manifest: write a more efficient version of lazymanifest, in pure python
> > 
> > Questions outsdanding:
> > * who calls filtercopy? noone in tests at the very least
> manifest.py line 287 or thereabouts: it’s used for matches() on a manifest.
> test-status-rev.t will exercise that codepath if that helps.
> 
> > 
> > * are the performance tradeoffs ok here, notably __delitem__?
> Probably. It looks very similar to the C version. Deleting a ton of entries
> from a large manifest is probably still tragic, since it’ll be a lot of
> copies, but for a first pass this is already so much better than the naive
> version that was there...
> 
> > 
> > * should we use mmap instead of reading stuff from a file?
> This I can’t answer. Maybe?
> 
> > 
> > * current version does not support 21 or 22 long hashes, why are they
> > necessary?
> There are a handful of (weird) places that do something like poke a “+” on the
> end of a hash in the manifest to mark it as dirty, but that is never saved to
> disk.

FWIW, we could probably replace the "+" hack with an "C"*40 hack. The SHA1 space
is big enough that we can punch some other small holes in it in addition to the
null hash.

-- 
Mathematics is the supreme nostalgia of our time.
Sean Farley - Aug. 26, 2016, 11:57 p.m.
Matt Mackall <mpm@selenic.com> writes:

> On Wed, 2016-08-24 at 23:55 -0400, Augie Fackler wrote:
>> > 
>> > On Aug 20, 2016, at 5:02 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
>> > 
>> > # HG changeset patch
>> > # User Maciej Fijalkowski <fijall@gmail.com>
>> > # Date 1471726818 -7200
>> > #      Sat Aug 20 23:00:18 2016 +0200
>> > # Node ID 21b2401d468d6b24c1658468e4fc5ce8744f925b
>> > # Parent  300f14ea21432face8d7e6cdcf92ba9d2f1f92dc
>> > manifest: write a more efficient version of lazymanifest, in pure python
>> > 
>> > Questions outsdanding:
>> > * who calls filtercopy? noone in tests at the very least
>> manifest.py line 287 or thereabouts: it’s used for matches() on a manifest.
>> test-status-rev.t will exercise that codepath if that helps.
>> 
>> > 
>> > * are the performance tradeoffs ok here, notably __delitem__?
>> Probably. It looks very similar to the C version. Deleting a ton of entries
>> from a large manifest is probably still tragic, since it’ll be a lot of
>> copies, but for a first pass this is already so much better than the naive
>> version that was there...
>> 
>> > 
>> > * should we use mmap instead of reading stuff from a file?
>> This I can’t answer. Maybe?
>> 
>> > 
>> > * current version does not support 21 or 22 long hashes, why are they
>> > necessary?
>> There are a handful of (weird) places that do something like poke a “+” on the
>> end of a hash in the manifest to mark it as dirty, but that is never saved to
>> disk.
>
> FWIW, we could probably replace the "+" hack with an "C"*40 hack. The SHA1 space
> is big enough that we can punch some other small holes in it in addition to the
> null hash.

I thought Yuya was working on adding "F"*40 (unless I missed something
special about "C"?)?
Yuya Nishihara - Aug. 27, 2016, 2:21 p.m.
On Fri, 26 Aug 2016 16:57:25 -0700, Sean Farley wrote:
> Matt Mackall <mpm@selenic.com> writes:
> > On Wed, 2016-08-24 at 23:55 -0400, Augie Fackler wrote:
> >> > On Aug 20, 2016, at 5:02 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
> >> > * current version does not support 21 or 22 long hashes, why are they
> >> > necessary?
> >> There are a handful of (weird) places that do something like poke a “+” on the
> >> end of a hash in the manifest to mark it as dirty, but that is never saved to
> >> disk.
> >
> > FWIW, we could probably replace the "+" hack with an "C"*40 hack. The SHA1 space
> > is big enough that we can punch some other small holes in it in addition to the
> > null hash.
> 
> I thought Yuya was working on adding "F"*40 (unless I missed something
> special about "C"?)?

Yes, "F"*40 for wdir revision. I have experimental patches. Is that related to
the "+" hack in manifest?
Yuya Nishihara - Aug. 27, 2016, 2:23 p.m.
On Wed, 24 Aug 2016 23:55:30 -0400, Augie Fackler wrote:
> > On Aug 20, 2016, at 5:02 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
> > * should we use mmap instead of reading stuff from a file?
> 
> This I can’t answer. Maybe?

mmap might cause a problem if manifest is stripped by another process, but
I don't know if it is true for lazymanifest.
Matt Mackall - Aug. 27, 2016, 11:08 p.m.
On Fri, 2016-08-26 at 16:57 -0700, Sean Farley wrote:
> Matt Mackall <mpm@selenic.com> writes:
> 
> > 
> > On Wed, 2016-08-24 at 23:55 -0400, Augie Fackler wrote:
> > > 
> > > > 
> > > > 
> > > > On Aug 20, 2016, at 5:02 PM, Maciej Fijalkowski <fijall@gmail.com>
> > > > wrote:
> > > > 
> > > > # HG changeset patch
> > > > # User Maciej Fijalkowski <fijall@gmail.com>
> > > > # Date 1471726818 -7200
> > > > #      Sat Aug 20 23:00:18 2016 +0200
> > > > # Node ID 21b2401d468d6b24c1658468e4fc5ce8744f925b
> > > > # Parent  300f14ea21432face8d7e6cdcf92ba9d2f1f92dc
> > > > manifest: write a more efficient version of lazymanifest, in pure python
> > > > 
> > > > Questions outsdanding:
> > > > * who calls filtercopy? noone in tests at the very least
> > > manifest.py line 287 or thereabouts: it’s used for matches() on a
> > > manifest.
> > > test-status-rev.t will exercise that codepath if that helps.
> > > 
> > > > 
> > > > 
> > > > * are the performance tradeoffs ok here, notably __delitem__?
> > > Probably. It looks very similar to the C version. Deleting a ton of
> > > entries
> > > from a large manifest is probably still tragic, since it’ll be a lot of
> > > copies, but for a first pass this is already so much better than the naive
> > > version that was there...
> > > 
> > > > 
> > > > 
> > > > * should we use mmap instead of reading stuff from a file?
> > > This I can’t answer. Maybe?
> > > 
> > > > 
> > > > 
> > > > * current version does not support 21 or 22 long hashes, why are they
> > > > necessary?
> > > There are a handful of (weird) places that do something like poke a “+” on
> > > the
> > > end of a hash in the manifest to mark it as dirty, but that is never saved
> > > to
> > > disk.
> > FWIW, we could probably replace the "+" hack with an "C"*40 hack. The SHA1
> > space
> > is big enough that we can punch some other small holes in it in addition to
> > the
> > null hash.
> I thought Yuya was working on adding "F"*40 (unless I missed something
> special about "C"?)?

It's just a number with a 2^-160 probability of collision with a real hash.
FFFF... was chosen as a pseudo-hash for the working copy pseudo-changeset to
contrast with the all-zeros representation of nullid. CCCCC... is simply a
(confusing) mnemonic for "changed" since M isn't available.

Having the pseudo-hashes for the file-level and changeset-level be the same is
slightly risky since we have nothing in Python that prevents us from ever
comparing these different types. But it has the upside that they have similar
interpretations.
Matt Mackall - Aug. 29, 2016, 7:46 p.m.
On Fri, 2016-08-26 at 16:57 -0700, Sean Farley wrote:
> Matt Mackall <mpm@selenic.com> writes:
> 
> > 
> > On Wed, 2016-08-24 at 23:55 -0400, Augie Fackler wrote:
> > > 
> > > > 
> > > > 
> > > > On Aug 20, 2016, at 5:02 PM, Maciej Fijalkowski <fijall@gmail.com>
> > > > wrote:
> > > > 
> > > > # HG changeset patch
> > > > # User Maciej Fijalkowski <fijall@gmail.com>
> > > > # Date 1471726818 -7200
> > > > #      Sat Aug 20 23:00:18 2016 +0200
> > > > # Node ID 21b2401d468d6b24c1658468e4fc5ce8744f925b
> > > > # Parent  300f14ea21432face8d7e6cdcf92ba9d2f1f92dc
> > > > manifest: write a more efficient version of lazymanifest, in pure python
> > > > 
> > > > Questions outsdanding:
> > > > * who calls filtercopy? noone in tests at the very least
> > > manifest.py line 287 or thereabouts: it’s used for matches() on a
> > > manifest.
> > > test-status-rev.t will exercise that codepath if that helps.
> > > 
> > > > 
> > > > 
> > > > * are the performance tradeoffs ok here, notably __delitem__?
> > > Probably. It looks very similar to the C version. Deleting a ton of
> > > entries
> > > from a large manifest is probably still tragic, since it’ll be a lot of
> > > copies, but for a first pass this is already so much better than the naive
> > > version that was there...
> > > 
> > > > 
> > > > 
> > > > * should we use mmap instead of reading stuff from a file?
> > > This I can’t answer. Maybe?
> > > 
> > > > 
> > > > 
> > > > * current version does not support 21 or 22 long hashes, why are they
> > > > necessary?
> > > There are a handful of (weird) places that do something like poke a “+” on
> > > the
> > > end of a hash in the manifest to mark it as dirty, but that is never saved
> > > to
> > > disk.
> > FWIW, we could probably replace the "+" hack with an "C"*40 hack. The SHA1
> > space
> > is big enough that we can punch some other small holes in it in addition to
> > the
> > null hash.
> I thought Yuya was working on adding "F"*40 (unless I missed something
> special about "C"?)?

It's just a number with a 2^-160 probability of collision with a real hash.
FFFF... was chosen as a pseudo-hash for the working copy pseudo-changeset to
contrast with the all-zeros representation of nullid. CCCCC... is simply a
(confusing) mnemonic for "changed" since M isn't available.

Having the pseudo-hashes for the file-level and changeset-level be the same is
slightly risky since we have nothing in Python that prevents us from ever
comparing these different types. But it has the upside that they have similar
interpretations.

Patch

diff --git a/mercurial/manifest.py b/mercurial/manifest.py
--- a/mercurial/manifest.py
+++ b/mercurial/manifest.py
@@ -104,68 +104,277 @@ 
     _checkforbidden(files)
     return ''.join(lines)
 
-class _lazymanifest(dict):
-    """This is the pure implementation of lazymanifest.
-
-    It has not been optimized *at all* and is not lazy.
-    """
+class LazyManifestIter(object):
+    def __init__(self, lm):
+        self.pos = 0
+        self.lm = lm
 
-    def __init__(self, data):
-        dict.__init__(self)
-        for f, n, fl in _parse(data):
-            self[f] = n, fl
+    def next(self):
+        try:
+            data, pos = self.lm._get(self.pos)
+        except IndexError:
+            raise StopIteration
+        if pos == -1:
+            self.pos += 1
+            return data[0]
+        self.pos += 1
+        zeropos = data.find('\x00', pos)
+        return data[pos:zeropos]
 
-    def __setitem__(self, k, v):
-        node, flag = v
-        assert node is not None
-        if len(node) > 21:
-            node = node[:21] # match c implementation behavior
-        dict.__setitem__(self, k, (node, flag))
+class LazyManifestIterEntries(object):
+    def __init__(self, lm):
+        self.lm = lm
+        self.pos = 0
 
     def __iter__(self):
-        return iter(sorted(dict.keys(self)))
+        return self
 
-    def iterkeys(self):
-        return iter(sorted(dict.keys(self)))
+    def next(self):
+        try:
+            data, pos = self.lm._get(self.pos)
+        except IndexError:
+            raise StopIteration
+        if pos == -1:
+            self.pos += 1
+            return data
+        zeropos = data.find('\x00', pos)
+        hashval = unhexlify(data, self.lm.extrainfo[self.pos],
+                            zeropos + 1, 40)
+        flags = self.lm._getflags(data, self.pos, zeropos)
+        self.pos += 1
+        return (data[pos:zeropos], hashval, flags)
 
-    def iterentries(self):
-        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
+def unhexlify(data, extra, pos, length):
+    s = data[pos:pos+length]
+    if extra:
+        s += extra & 0xff
+    return s.decode("hex")
+
+class _lazymanifest(object):
+    def __init__(self, data, positions=None, extrainfo=None, extradata=None):
+        if positions is None:
+            self.positions = self.find_lines(data)
+            self.extrainfo = [0] * len(self.positions)
+            self.data = data
+            self.extradata = []
+        else:
+            self.positions = positions[:]
+            self.extrainfo = extrainfo[:]
+            self.extradata = extradata[:]
+            self.data = data
+
+    def find_lines(self, data):
+        if not data:
+            return []
+        pos = data.find("\n")
+        positions = [0]
+        while pos < len(data) - 1 and pos != -1:
+            positions.append(pos + 1)
+            pos = data.find("\n", pos + 1)
+        return positions
+
+    def _get(self, index):
+        # get the position encoded in pos:
+        #   positive number is an index in 'data'
+        #   negative number is in extrapieces
+        pos = self.positions[index]
+        if pos >= 0:
+            return self.data, pos
+        return self.extradata[-pos-1], -1
+
+    def _getkey(self, pos):
+        if pos >= 0:
+            return self.data[pos:self.data.find('\x00', pos + 1)]
+        return self.extradata[-pos-1][0]
+
+    def bsearch(self, key):
+        first = 0
+        last = len(self.positions) - 1
+   
+        while first <= last:
+            midpoint = (first + last)//2
+            nextpos = self.positions[midpoint]
+            candidate = self._getkey(nextpos)
+            r = cmp(key, candidate)
+            if r == 0:
+                return midpoint
+            else:
+                if r < 0:
+                    last = midpoint - 1
+                else:
+                    first = midpoint + 1
+        return -1
+
+    def bsearch2(self, key):
+        # same as the above, but will always return the position
+        # done for performance reasons
+        first = 0
+        last = len(self.positions) - 1
+   
+        while first <= last:
+            midpoint = (first + last)//2
+            nextpos = self.positions[midpoint]
+            candidate = self._getkey(nextpos)
+            r = cmp(key, candidate)
+            if r == 0:
+                return (midpoint, True)
+            else:
+                if r < 0:
+                    last = midpoint - 1
+                else:
+                    first = midpoint + 1
+        return (first, False)
+
+    def __contains__(self, key):
+        return self.bsearch(key) != -1
+
+    def _getflags(self, data, needle, pos):
+        start = pos + 41
+        if needle < 0 or needle == len(self.positions) - 1:
+            end = len(data) - 1
+        else:
+            end = data.find("\n", start)
+        if start == end:
+            return ''
+        return self.data[start:end]
+
+    def __getitem__(self, key):
+        if not isinstance(key, str):
+            raise TypeError("setitem: manifest keys must be a string.")
+        needle = self.bsearch(key)
+        if needle == -1:
+            raise KeyError
+        data, pos = self._get(needle)
+        if pos == -1:
+            return (data[1], data[2])
+        zeropos = data.find('\x00', pos)
+        assert 0 <= needle <= len(self.positions)
+        assert len(self.extrainfo) == len(self.positions)
+        hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
+        flags = self._getflags(data, needle, zeropos)
+        return (hashval, flags)
+
+    def __delitem__(self, key):
+        needle, found = self.bsearch2(key)
+        if not found:
+            raise KeyError
+        cur = self.positions[needle]
+        self.positions = self.positions[:needle] + self.positions[needle + 1:]
+        if cur >= 0:
+            lastpos = self.data.find("\n", cur)
+            if lastpos == -1:
+                self.data = self.data[:cur]
+            else:
+                self.data = self.data[:cur] + self.data[lastpos:]
+
+    def __setitem__(self, key, value):
+        if not isinstance(key, str):
+            raise TypeError("setitem: manifest keys must be a string.")
+        if not isinstance(value, tuple) or len(value) != 2:
+            raise TypeError("Manifest values must be a tuple of (node, flags).")
+        hashval = value[0]
+        if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
+            raise TypeError("node must be a 20-byte string")
+        if len(hashval) != 20:
+            raise TypeError
+        flags = value[1]
+        if not isinstance(flags, str) or len(flags) > 1:
+            raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
+        needle, found = self.bsearch2(key)
+        if found:
+            # put the item
+            pos = self.positions[needle]
+            if pos < 0:
+                self.extradata[-pos-1] = (key, value[0], value[1])
+            else:
+                # just don't bother
+                self.extradata.append((key, value[0], value[1]))
+                self.positions[needle] = -len(self.extradata)
+        else:
+            # not found, put it in with extra positions
+            self.extradata.append((key, value[0], value[1]))
+            self.positions = self.positions[:needle] + [-len(self.extradata)] + self.positions[needle:]
+            self.extrainfo = self.extrainfo[:needle] + [0] + self.extrainfo[needle:]
 
     def copy(self):
-        c = _lazymanifest('')
-        c.update(self)
-        return c
+        # XXX call _compact like in C?
+        return _lazymanifest(self.data, self.positions, self.extrainfo,
+            self.extradata)
+
+    def _compact(self):
+        # hopefully not called TOO often
+        def find_next_position(positions, start, lgt):
+            i = start
+            while i < len(positions):
+                if positions[i] >= 0:
+                    return positions[i]
+                i += 1
+            return lgt
+
+        if len(self.extradata) == 0:
+            return
+        l = []
+        last_cut = 0
+        i = 0
+        offset = 0
+        while i < len(self.positions):
+            if self.positions[i] >= 0:
+                cur = self.positions[i]
+                last_cut = cur
+                while True:
+                    self.positions[i] = offset
+                    i += 1
+                    if i == len(self.positions) or self.positions[i] < 0:
+                        break
+                    offset += self.positions[i] - cur
+                    cur = self.positions[i]
+                end_cut = find_next_position(self.positions, i, len(self.data))
+                offset += end_cut - cur
+                l.append(self.data[last_cut:end_cut])
+            else:
+                while i < len(self.positions) and self.positions[i] < 0:
+                    cur = self.positions[i]
+                    l.append(self._pack(self.extradata[-cur-1]))
+                    self.positions[i] = offset
+                    offset += len(l[-1])
+                    i += 1
+        self.data = ''.join(l)
+        self.extradata = []
+        self.extrainfo = [0] * len(self.positions) # XXX not true that it's always full of zeros
+
+    def _pack(self, d):
+        return d[0] + '\x00' + d[1].encode('hex') + d[2] + '\n'
+
+    def text(self):
+        self._compact()
+        return self.data
 
     def diff(self, m2, clean=False):
         '''Finds changes between the current manifest and m2.'''
+        # XXX think whether efficiency matters here
         diff = {}
 
-        for fn, e1 in self.iteritems():
+        for fn, e1, flags in self.iterentries():
             if fn not in m2:
-                diff[fn] = e1, (None, '')
+                diff[fn] = (e1, flags), (None, '')
             else:
                 e2 = m2[fn]
                 if e1 != e2:
-                    diff[fn] = e1, e2
+                    diff[fn] = (e1, flags), e2
                 elif clean:
                     diff[fn] = None
 
-        for fn, e2 in m2.iteritems():
+        for fn, e2, flags in m2.iterentries():
             if fn not in self:
-                diff[fn] = (None, ''), e2
+                diff[fn] = (None, ''), (e2, flags)
 
         return diff
 
-    def filtercopy(self, filterfn):
-        c = _lazymanifest('')
-        for f, n, fl in self.iterentries():
-            if filterfn(f):
-                c[f] = n, fl
-        return c
+    def iterentries(self):
+        return LazyManifestIterEntries(self)
 
-    def text(self):
-        """Get the full data of this manifest as a bytestring."""
-        return _textv1(self.iterentries())
+    def __iter__(self):
+        return LazyManifestIter(self)
 
 try:
     _lazymanifest = parsers.lazymanifest