Patchwork [3,of,6,v6,lazy-manifest] manifest: split manifestdict into high-level and low-level logic

login
register
mail settings
Submitter Augie Fackler
Date March 7, 2015, 5:16 p.m.
Message ID <74e64852b07f9cfb5a7b.1425748578@imladris.local>
Download mbox | patch
Permalink /patch/7930/
State Accepted
Headers show

Comments

Augie Fackler - March 7, 2015, 5:16 p.m.
# HG changeset patch
# User Augie Fackler <augie@google.com>
# Date 1425747879 18000
#      Sat Mar 07 12:04:39 2015 -0500
# Node ID 74e64852b07f9cfb5a7b89d827dd9e1f01314b1b
# Parent  2524c117ce836e18402cac792361f0e44cac42c5
manifest: split manifestdict into high-level and low-level logic

The low-level logic type (_lazymanifest) matches the behavior of the C
implementation introduced in a5f1bccd. A future patch will use that
when available.
Martin von Zweigbergk - March 8, 2015, 6:43 a.m.
On Sat, Mar 7, 2015 at 9:25 AM Augie Fackler <raf@durin42.com> wrote:

> # HG changeset patch
> # User Augie Fackler <augie@google.com>
> # Date 1425747879 18000
> #      Sat Mar 07 12:04:39 2015 -0500
> # Node ID 74e64852b07f9cfb5a7b89d827dd9e1f01314b1b
> # Parent  2524c117ce836e18402cac792361f0e44cac42c5
> manifest: split manifestdict into high-level and low-level logic
>
> The low-level logic type (_lazymanifest) matches the behavior of the C
> implementation introduced in a5f1bccd. A future patch will use that
> when available.
>
> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> --- a/mercurial/manifest.py
> +++ b/mercurial/manifest.py
> @@ -9,53 +9,119 @@ from i18n import _
>  import mdiff, parsers, error, revlog, util
>  import array, struct
>
> -# Pure Python fallback
> -def _parsemanifest(mfdict, fdict, lines):
> -    bin = revlog.bin
> -    for l in lines.splitlines():
> -        f, n = l.split('\0')
> -        if len(n) > 40:
> -            fdict[f] = n[40:]
> -            mfdict[f] = bin(n[:40])
> -        else:
> -            mfdict[f] = bin(n)
>
> -def _parse(lines, mfdict, flags):
> -    try:
> -        parsers.parse_manifest(mfdict, flags, lines)
> -    except AttributeError:
> -        _parsemanifest(mfdict, flags, lines)
> -    return mfdict
> +class _lazymanifest(dict):
> +    """This is the pure implementation of lazymanifest.
>
> -class manifestdict(dict):
> -    def __init__(self, data=''):
> -        self._flags = {}
> -        _parse(data, self, self._flags)
> +    It has not been optimized *at all* and is not lazy.
> +    """
> +
> +    def __init__(self, data):
> +        # This init method does a little bit of excessive-looking
> +        # precondition checking. This is so that the behavior of this
> +        # class exactly matches its C counterpart to try and help
> +        # prevent surprise breakage for anyone that develops against
> +        # the pure version.
> +        if data and data[-1] != '\n':
> +            raise ValueError('Manifest did not end in a newline.')
> +        dict.__init__(self)
> +        prev = None
> +        for l in data.splitlines():
> +            if prev is not None and prev > l:
> +                raise ValueError('Manifest lines not in sorted order.')
> +            prev = l
> +            f, n = l.split('\0')
> +            if len(n) > 40:
> +                self[f] = revlog.bin(n[:40]), n[40:]
> +            else:
> +                self[f] = revlog.bin(n), ''
>
>      def __setitem__(self, k, v):
> -        assert v is not None
> -        dict.__setitem__(self, k, v)
> -    def flags(self, f):
> -        return self._flags.get(f, "")
> -    def setflag(self, f, flags):
> -        """Set the flags (symlink, executable) for path f."""
> -        self._flags[f] = flags
> +        node, flag = v
> +        assert node is not None
> +        if len(node) > 21:
> +            node = node[:21] # match c implementation behavior
> +        dict.__setitem__(self, k, (node, flag))
> +
> +    def __iter__(self):
> +        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
>

What's the reason for yielding (f, n, fl) instead of the more natural (f,
(n, fl))? The infamous effect tuples have on Python's GC? Also see
questions further down.

+
>      def copy(self):
> -        copy = manifestdict()
> -        dict.__init__(copy, self)
> -        copy._flags = dict.copy(self._flags)
> -        return copy
> +        c = _lazymanifest('')
> +        c.update(self)
> +        return c
> +
> +    def diff(self, m2, clean=False):
> +        '''Finds changes between the current manifest and m2.'''
> +        diff = {}
> +
> +        for fn, e1 in self.iteritems():
> +            if fn not in m2:
> +                diff[fn] = e1, (None, '')
> +            else:
> +                e2 = m2[fn]
> +                if e1 != e2:
> +                    diff[fn] = e1, e2
> +                elif clean:
> +                    diff[fn] = None
> +
> +        for fn, e2 in m2.iteritems():
> +            if fn not in self:
> +                diff[fn] = (None, ''), e2
> +
> +        return diff
> +
> +    def filtercopy(self, filterfn):
> +        c = _lazymanifest('')
> +        for f, n, fl in self:
> +            if filterfn(f):
> +                c[f] = n, fl
> +        return c
> +
> +    def text(self):
> +        """Get the full data of this manifest as a bytestring."""
> +        fl = sorted(self)
> +
> +        _hex = revlog.hex
> +        # if this is changed to support newlines in filenames,
> +        # be sure to check the templates/ dir again (especially
> *-raw.tmpl)
> +        return ''.join("%s\0%s%s\n" % (
> +            f, _hex(n[:20]), flag) for f, n, flag in fl)
> +
> +class manifestdict(object):
>

So _lazymanifest is not lazy, manifestdict is not a dict, but _lazymanifest
is a dict :-) Just a note, not asking for anything to change.


> +    def __init__(self, data=''):
> +        self._lm = _lazymanifest(data)
> +
> +    def __getitem__(self, key):
> +        return self._lm[key][0]
> +
> +    def __len__(self):
> +        return len(self._lm)
> +
> +    def __setitem__(self, key, node):
> +        self._lm[key] = node, self.flags(key, '')
> +
> +    def __contains__(self, key):
> +        return key in self._lm
> +
> +    def __delitem__(self, key):
> +        del self._lm[key]
> +
> +    def keys(self):
> +        return [x[0] for x in self._lm]
> +
> +    def iterkeys(self):
> +        return iter(self.keys())
> +
>      def intersectfiles(self, files):
> -        '''make a new manifestdict with the intersection of self with
> files
> +        '''make a new lazymanifest with the intersection of self with
> files
>
>          The algorithm assumes that files is much smaller than self.'''
>          ret = manifestdict()
> +        lm = self._lm
>          for fn in files:
> -            if fn in self:
> -                ret[fn] = self[fn]
> -                flags = self._flags.get(fn, None)
> -                if flags:
> -                    ret._flags[fn] = flags
> +            if fn in lm:
> +                ret._lm[fn] = self._lm[fn]
>          return ret
>
>      def filesnotin(self, m2):
> @@ -74,11 +140,9 @@ class manifestdict(dict):
>              (not match.anypats() and util.all(fn in self for fn in
> files))):
>              return self.intersectfiles(files)
>
> -        m = self.copy()
> -        for fn in m.keys():
> -            if not match(fn):
> -                del m[fn]
> -        return m
> +        lm = manifestdict('')
> +        lm._lm = self._lm.filtercopy(match)
> +        return lm
>
>      def diff(self, m2, clean=False):
>          '''Finds changes between the current manifest and m2.
> @@ -95,35 +159,36 @@ class manifestdict(dict):
>          the nodeid will be None and the flags will be the empty
>          string.
>          '''
> -        diff = {}
> +        return self._lm.diff(m2._lm, clean)
>
> -        for fn, n1 in self.iteritems():
> -            fl1 = self._flags.get(fn, '')
> -            n2 = m2.get(fn, None)
> -            fl2 = m2._flags.get(fn, '')
> -            if n2 is None:
> -                fl2 = ''
> -            if n1 != n2 or fl1 != fl2:
> -                diff[fn] = ((n1, fl1), (n2, fl2))
> -            elif clean:
> -                diff[fn] = None
> +    def setflag(self, key, flag):
> +        self._lm[key] = self[key], flag
>
> -        for fn, n2 in m2.iteritems():
> -            if fn not in self:
> -                fl2 = m2._flags.get(fn, '')
> -                diff[fn] = ((None, ''), (n2, fl2))
> +    def get(self, key, default=None):
> +        try:
> +            return self._lm[key][0]
> +        except KeyError:
> +            return default
>
> -        return diff
> +    def flags(self, key, default=''):
> +        try:
> +            return self._lm[key][1]
> +        except KeyError:
> +            return default
> +
> +    def __iter__(self):
> +        return (x[0] for x in self._lm)
>

If the answer to my question above was about GC, then wouldn't this (the
creation of tuples that are immediately thrown away) be an equally big
problem? If the low-level type's __iter__ was left untouched (and
iteritems() would be used where __iter__ is currently used), then there
wouldn't be any tuples involved here.


> +
> +    def copy(self):
> +        c = manifestdict('')
> +        c._lm = self._lm.copy()
> +        return c
> +
> +    def iteritems(self):
> +        return (x[:2] for x in self._lm)
>

Similar comment here: this creates a 3-tuple that gets converted into a
2-tuple. That does seem better than creating 2 2-tuples (f, (n, e)) that
converted into another one (f, n). Even better would be to add a new
filesandnodes() generator that yields exactly the wanted (f, n) tuples, no?
OTOH, is that really what we want in many places? I have a feeling that
most places that user iteritems() would actually prefer all three fields.
Yielding (f, (n, e)) seems cleaner to me, but if GC is a concern, then
perhaps we'd have to yield (f, n, e). What do you think?


>
>      def text(self):
> -        """Get the full data of this manifest as a bytestring."""
> -        fl = sorted(self)
> -        _checkforbidden(fl)
> -
> -        hex, flags = revlog.hex, self.flags
> -        # if this is changed to support newlines in filenames,
> -        # be sure to check the templates/ dir again (especially
> *-raw.tmpl)
> -        return ''.join("%s\0%s%s\n" % (f, hex(self[f]), flags(f)) for f
> in fl)
> +        return self._lm.text()
>
>      def fastdelta(self, base, changes):
>          """Given a base manifest text as an array.array and a list of
> changes
> @@ -143,7 +208,8 @@ class manifestdict(dict):
>              # bs will either be the index of the item or the insert point
>              start, end = _msearch(addbuf, f, start)
>              if not todelete:
> -                l = "%s\0%s%s\n" % (f, revlog.hex(self[f]), self.flags(f))
> +                h, fl = self._lm[f]
> +                l = "%s\0%s%s\n" % (f, revlog.hex(h), fl)
>              else:
>                  if start == end:
>                      # item we want to delete was not found, error out
> @@ -280,12 +346,10 @@ class manifest(revlog.revlog):
>              m = self._mancache[node][0]
>              return m.get(f), m.flags(f)
>          text = self.revision(node)
> -        start, end = _msearch(text, f)
> -        if start == end:
> +        try:
> +            return manifestdict(text)._lm[f]
> +        except KeyError:
>              return None, None
> -        l = text[start:end]
> -        f, n = l.split('\0')
> -        return revlog.bin(n[:40]), n[40:-1]
>
>      def add(self, m, transaction, link, p1, p2, added, removed):
>          if p1 in self._mancache:
> diff --git a/tests/test-manifest.py b/tests/test-manifest.py
> --- a/tests/test-manifest.py
> +++ b/tests/test-manifest.py
> @@ -4,7 +4,7 @@ import itertools
>
>  import silenttestrunner
>
> -from mercurial import parsers
> +from mercurial import manifest as manifestmod
>
>  HASH_1 = '1' * 40
>  HASH_2 = 'f' * 40
> @@ -38,12 +38,12 @@ class testmanifest(unittest.TestCase):
>          self.assert_(thing in container, msg)
>
>      def testEmptyManifest(self):
> -        m = parsers.lazymanifest('')
> +        m = manifestmod._lazymanifest('')
>          self.assertEqual(0, len(m))
>          self.assertEqual([], list(m))
>
>      def testManifest(self):
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          want = [
>              ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
>              ('foo', binascii.unhexlify(HASH_1), ''),
> @@ -58,13 +58,13 @@ class testmanifest(unittest.TestCase):
>      def testSetItem(self):
>          want = binascii.unhexlify(HASH_1), ''
>
> -        m = parsers.lazymanifest('')
> +        m = manifestmod._lazymanifest('')
>          m['a'] = want
>          self.assertIn('a', m)
>          self.assertEqual(want, m['a'])
>          self.assertEqual('a\0' + HASH_1 + '\n', m.text())
>
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          m['a'] = want
>          self.assertEqual(want, m['a'])
>          self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST,
> @@ -76,7 +76,7 @@ class testmanifest(unittest.TestCase):
>      def testCompaction(self):
>          unhex = binascii.unhexlify
>          h1, h2 = unhex(HASH_1), unhex(HASH_2)
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          m['alpha'] = h1, ''
>          m['beta'] = h2, ''
>          del m['foo']
> @@ -91,8 +91,8 @@ class testmanifest(unittest.TestCase):
>          self.assertEqual(w, list(m))
>
>      def testSetGetNodeSuffix(self):
> -        clean = parsers.lazymanifest(A_SHORT_MANIFEST)
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        clean = manifestmod._lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          h, f = m['foo']
>          want = h + 'a', f
>          # Merge code wants to set 21-byte fake hashes at times
> @@ -120,7 +120,7 @@ class testmanifest(unittest.TestCase):
>          self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m))
>
>      def testFilterCopyException(self):
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          def filt(path):
>              if path == 'foo':
>                  assert False
> @@ -128,7 +128,7 @@ class testmanifest(unittest.TestCase):
>          self.assertRaises(AssertionError, m.filtercopy, filt)
>
>      def testRemoveItem(self):
> -        m = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          del m['foo']
>          self.assertRaises(KeyError, lambda : m['foo'])
>          self.assertEqual(1, len(m))
> @@ -138,9 +138,9 @@ class testmanifest(unittest.TestCase):
>          MISSING = (None, '')
>          addl = 'z-only-in-left\0' + HASH_1 + '\n'
>          addr = 'z-only-in-right\0' + HASH_2 + 'x\n'
> -        left = parsers.lazymanifest(
> +        left = manifestmod._lazymanifest(
>              A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl)
> -        right = parsers.lazymanifest(A_SHORT_MANIFEST + addr)
> +        right = manifestmod._lazymanifest(A_SHORT_MANIFEST + addr)
>          want = {
>              'foo': ((binascii.unhexlify(HASH_3), 'x'),
>                      (binascii.unhexlify(HASH_1), '')),
> @@ -154,14 +154,14 @@ class testmanifest(unittest.TestCase):
>              'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')),
>              'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')),
>              }
> -        self.assertEqual(want, parsers.lazymanifest('').diff(left))
> +        self.assertEqual(want, manifestmod._lazymanifest('').diff(left))
>
>          want = {
>              'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'),
> MISSING),
>              'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING),
>              'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
>              }
> -        self.assertEqual(want, left.diff(parsers.lazymanifest('')))
> +        self.assertEqual(want, left.diff(manifestmod._lazymanifest('')))
>          copy = right.copy()
>          del copy['z-only-in-right']
>          del right['foo']
> @@ -171,7 +171,7 @@ class testmanifest(unittest.TestCase):
>              }
>          self.assertEqual(want, right.diff(copy))
>
> -        short = parsers.lazymanifest(A_SHORT_MANIFEST)
> +        short = manifestmod._lazymanifest(A_SHORT_MANIFEST)
>          pruned = short.copy()
>          del pruned['foo']
>          want = {
> @@ -192,30 +192,37 @@ class testmanifest(unittest.TestCase):
>          backwards = ''.join(
>              l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if
> l)
>          try:
> -            parsers.lazymanifest(backwards)
> +            manifestmod._lazymanifest(backwards)
>              self.fail('Should have raised ValueError')
>          except ValueError, v:
>              self.assertIn('Manifest lines not in sorted order.', str(v))
>
>      def testNoTerminalNewline(self):
>          try:
> -            parsers.lazymanifest(A_SHORT_MANIFEST + 'wat')
> +            manifestmod._lazymanifest(A_SHORT_MANIFEST + 'wat')
>              self.fail('Should have raised ValueError')
>          except ValueError, v:
>              self.assertIn('Manifest did not end in a newline.', str(v))
>
>      def testNoNewLineAtAll(self):
>          try:
> -            parsers.lazymanifest('wat')
> +            manifestmod._lazymanifest('wat')
>              self.fail('Should have raised ValueError')
>          except ValueError, v:
>              self.assertIn('Manifest did not end in a newline.', str(v))
>
>      def testHugeManifest(self):
> -        m = parsers.lazymanifest(A_HUGE_MANIFEST)
> +        m = manifestmod._lazymanifest(A_HUGE_MANIFEST)
>          self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m))
>          self.assertEqual(len(m), len(list(m)))
>
> +    def testIntersectFiles(self):
> +        m = manifestmod.manifestdict(A_HUGE_MANIFEST)
> +        m2 = m.intersectfiles(['file1', 'file200', 'file300'])
> +        w = ('file1\0%sx\n'
> +             'file200\0%sl\n'
> +             'file300\0%s\n') % (HASH_2, HASH_1, HASH_1)
> +        self.assertEqual(w, m2.text())
>
>  if __name__ == '__main__':
>      silenttestrunner.main(__name__)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
>
Augie Fackler - March 9, 2015, 1:19 p.m.
On Mar 8, 2015, at 1:43 AM, Martin von Zweigbergk <martinvonz@google.com> wrote:
> On Sat, Mar 7, 2015 at 9:25 AM Augie Fackler <raf@durin42.com> wrote:
>> # HG changeset patch
>> # User Augie Fackler <augie@google.com>
>> # Date 1425747879 18000
>> #      Sat Mar 07 12:04:39 2015 -0500
>> # Node ID 74e64852b07f9cfb5a7b89d827dd9e1f01314b1b
>> # Parent  2524c117ce836e18402cac792361f0e44cac42c5
>> manifest: split manifestdict into high-level and low-level logic
>> 
>> 
>> +    def __iter__(self):
>> +        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
>> 
> 
> What's the reason for yielding (f, n, fl) instead of the more natural (f, (n, fl))? The infamous effect tuples have on Python's GC? Also see questions further down.

It's mostly the GC concern, but it also simplified the implementation of the C type.

>> +
>> +class manifestdict(object):
>> 
> 
> So _lazymanifest is not lazy, manifestdict is not a dict, but _lazymanifest is a dict :-) Just a note, not asking for anything to change.

Yup.

>> +
>> +    def __iter__(self):
>> +        return (x[0] for x in self._lm)
>> 
> If the answer to my question above was about GC, then wouldn't this (the creation of tuples that are immediately thrown away) be an equally big problem? If the low-level type's __iter__ was left untouched (and iteritems() would be used where __iter__ is currently used), then there wouldn't be any tuples involved here.

It's probably worth benchmarking at some point in the future, and/or adding more iterator methods to the C type. For now, I was going for the simplest thing that could work.

>  
>> +
>> +    def copy(self):
>> +        c = manifestdict('')
>> +        c._lm = self._lm.copy()
>> +        return c
>> +
>> +    def iteritems(self):
>> +        return (x[:2] for x in self._lm)
> 
> Similar comment here: this creates a 3-tuple that gets converted into a 2-tuple. That does seem better than creating 2 2-tuples (f, (n, e)) that converted into another one (f, n). Even better would be to add a new filesandnodes() generator that yields exactly the wanted (f, n) tuples, no? OTOH, is that really what we want in many places? I have a feeling that most places that user iteritems() would actually prefer all three fields. Yielding (f, (n, e)) seems cleaner to me, but if GC is a concern, then perhaps we'd have to yield (f, n, e). What do you think?

That matches my guess too, but that feels like a followup patch to me. In general, I think I want to move to iterating over the 3-tuple everywhere just so things are more consistent, but that feels too invasive to do all at once.

Patch

diff --git a/mercurial/manifest.py b/mercurial/manifest.py
--- a/mercurial/manifest.py
+++ b/mercurial/manifest.py
@@ -9,53 +9,119 @@  from i18n import _
 import mdiff, parsers, error, revlog, util
 import array, struct
 
-# Pure Python fallback
-def _parsemanifest(mfdict, fdict, lines):
-    bin = revlog.bin
-    for l in lines.splitlines():
-        f, n = l.split('\0')
-        if len(n) > 40:
-            fdict[f] = n[40:]
-            mfdict[f] = bin(n[:40])
-        else:
-            mfdict[f] = bin(n)
 
-def _parse(lines, mfdict, flags):
-    try:
-        parsers.parse_manifest(mfdict, flags, lines)
-    except AttributeError:
-        _parsemanifest(mfdict, flags, lines)
-    return mfdict
+class _lazymanifest(dict):
+    """This is the pure implementation of lazymanifest.
 
-class manifestdict(dict):
-    def __init__(self, data=''):
-        self._flags = {}
-        _parse(data, self, self._flags)
+    It has not been optimized *at all* and is not lazy.
+    """
+
+    def __init__(self, data):
+        # This init method does a little bit of excessive-looking
+        # precondition checking. This is so that the behavior of this
+        # class exactly matches its C counterpart to try and help
+        # prevent surprise breakage for anyone that develops against
+        # the pure version.
+        if data and data[-1] != '\n':
+            raise ValueError('Manifest did not end in a newline.')
+        dict.__init__(self)
+        prev = None
+        for l in data.splitlines():
+            if prev is not None and prev > l:
+                raise ValueError('Manifest lines not in sorted order.')
+            prev = l
+            f, n = l.split('\0')
+            if len(n) > 40:
+                self[f] = revlog.bin(n[:40]), n[40:]
+            else:
+                self[f] = revlog.bin(n), ''
 
     def __setitem__(self, k, v):
-        assert v is not None
-        dict.__setitem__(self, k, v)
-    def flags(self, f):
-        return self._flags.get(f, "")
-    def setflag(self, f, flags):
-        """Set the flags (symlink, executable) for path f."""
-        self._flags[f] = flags
+        node, flag = v
+        assert node is not None
+        if len(node) > 21:
+            node = node[:21] # match c implementation behavior
+        dict.__setitem__(self, k, (node, flag))
+
+    def __iter__(self):
+        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
+
     def copy(self):
-        copy = manifestdict()
-        dict.__init__(copy, self)
-        copy._flags = dict.copy(self._flags)
-        return copy
+        c = _lazymanifest('')
+        c.update(self)
+        return c
+
+    def diff(self, m2, clean=False):
+        '''Finds changes between the current manifest and m2.'''
+        diff = {}
+
+        for fn, e1 in self.iteritems():
+            if fn not in m2:
+                diff[fn] = e1, (None, '')
+            else:
+                e2 = m2[fn]
+                if e1 != e2:
+                    diff[fn] = e1, e2
+                elif clean:
+                    diff[fn] = None
+
+        for fn, e2 in m2.iteritems():
+            if fn not in self:
+                diff[fn] = (None, ''), e2
+
+        return diff
+
+    def filtercopy(self, filterfn):
+        c = _lazymanifest('')
+        for f, n, fl in self:
+            if filterfn(f):
+                c[f] = n, fl
+        return c
+
+    def text(self):
+        """Get the full data of this manifest as a bytestring."""
+        fl = sorted(self)
+
+        _hex = revlog.hex
+        # if this is changed to support newlines in filenames,
+        # be sure to check the templates/ dir again (especially *-raw.tmpl)
+        return ''.join("%s\0%s%s\n" % (
+            f, _hex(n[:20]), flag) for f, n, flag in fl)
+
+class manifestdict(object):
+    def __init__(self, data=''):
+        self._lm = _lazymanifest(data)
+
+    def __getitem__(self, key):
+        return self._lm[key][0]
+
+    def __len__(self):
+        return len(self._lm)
+
+    def __setitem__(self, key, node):
+        self._lm[key] = node, self.flags(key, '')
+
+    def __contains__(self, key):
+        return key in self._lm
+
+    def __delitem__(self, key):
+        del self._lm[key]
+
+    def keys(self):
+        return [x[0] for x in self._lm]
+
+    def iterkeys(self):
+        return iter(self.keys())
+
     def intersectfiles(self, files):
-        '''make a new manifestdict with the intersection of self with files
+        '''make a new lazymanifest with the intersection of self with files
 
         The algorithm assumes that files is much smaller than self.'''
         ret = manifestdict()
+        lm = self._lm
         for fn in files:
-            if fn in self:
-                ret[fn] = self[fn]
-                flags = self._flags.get(fn, None)
-                if flags:
-                    ret._flags[fn] = flags
+            if fn in lm:
+                ret._lm[fn] = self._lm[fn]
         return ret
 
     def filesnotin(self, m2):
@@ -74,11 +140,9 @@  class manifestdict(dict):
             (not match.anypats() and util.all(fn in self for fn in files))):
             return self.intersectfiles(files)
 
-        m = self.copy()
-        for fn in m.keys():
-            if not match(fn):
-                del m[fn]
-        return m
+        lm = manifestdict('')
+        lm._lm = self._lm.filtercopy(match)
+        return lm
 
     def diff(self, m2, clean=False):
         '''Finds changes between the current manifest and m2.
@@ -95,35 +159,36 @@  class manifestdict(dict):
         the nodeid will be None and the flags will be the empty
         string.
         '''
-        diff = {}
+        return self._lm.diff(m2._lm, clean)
 
-        for fn, n1 in self.iteritems():
-            fl1 = self._flags.get(fn, '')
-            n2 = m2.get(fn, None)
-            fl2 = m2._flags.get(fn, '')
-            if n2 is None:
-                fl2 = ''
-            if n1 != n2 or fl1 != fl2:
-                diff[fn] = ((n1, fl1), (n2, fl2))
-            elif clean:
-                diff[fn] = None
+    def setflag(self, key, flag):
+        self._lm[key] = self[key], flag
 
-        for fn, n2 in m2.iteritems():
-            if fn not in self:
-                fl2 = m2._flags.get(fn, '')
-                diff[fn] = ((None, ''), (n2, fl2))
+    def get(self, key, default=None):
+        try:
+            return self._lm[key][0]
+        except KeyError:
+            return default
 
-        return diff
+    def flags(self, key, default=''):
+        try:
+            return self._lm[key][1]
+        except KeyError:
+            return default
+
+    def __iter__(self):
+        return (x[0] for x in self._lm)
+
+    def copy(self):
+        c = manifestdict('')
+        c._lm = self._lm.copy()
+        return c
+
+    def iteritems(self):
+        return (x[:2] for x in self._lm)
 
     def text(self):
-        """Get the full data of this manifest as a bytestring."""
-        fl = sorted(self)
-        _checkforbidden(fl)
-
-        hex, flags = revlog.hex, self.flags
-        # if this is changed to support newlines in filenames,
-        # be sure to check the templates/ dir again (especially *-raw.tmpl)
-        return ''.join("%s\0%s%s\n" % (f, hex(self[f]), flags(f)) for f in fl)
+        return self._lm.text()
 
     def fastdelta(self, base, changes):
         """Given a base manifest text as an array.array and a list of changes
@@ -143,7 +208,8 @@  class manifestdict(dict):
             # bs will either be the index of the item or the insert point
             start, end = _msearch(addbuf, f, start)
             if not todelete:
-                l = "%s\0%s%s\n" % (f, revlog.hex(self[f]), self.flags(f))
+                h, fl = self._lm[f]
+                l = "%s\0%s%s\n" % (f, revlog.hex(h), fl)
             else:
                 if start == end:
                     # item we want to delete was not found, error out
@@ -280,12 +346,10 @@  class manifest(revlog.revlog):
             m = self._mancache[node][0]
             return m.get(f), m.flags(f)
         text = self.revision(node)
-        start, end = _msearch(text, f)
-        if start == end:
+        try:
+            return manifestdict(text)._lm[f]
+        except KeyError:
             return None, None
-        l = text[start:end]
-        f, n = l.split('\0')
-        return revlog.bin(n[:40]), n[40:-1]
 
     def add(self, m, transaction, link, p1, p2, added, removed):
         if p1 in self._mancache:
diff --git a/tests/test-manifest.py b/tests/test-manifest.py
--- a/tests/test-manifest.py
+++ b/tests/test-manifest.py
@@ -4,7 +4,7 @@  import itertools
 
 import silenttestrunner
 
-from mercurial import parsers
+from mercurial import manifest as manifestmod
 
 HASH_1 = '1' * 40
 HASH_2 = 'f' * 40
@@ -38,12 +38,12 @@  class testmanifest(unittest.TestCase):
         self.assert_(thing in container, msg)
 
     def testEmptyManifest(self):
-        m = parsers.lazymanifest('')
+        m = manifestmod._lazymanifest('')
         self.assertEqual(0, len(m))
         self.assertEqual([], list(m))
 
     def testManifest(self):
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         want = [
             ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
             ('foo', binascii.unhexlify(HASH_1), ''),
@@ -58,13 +58,13 @@  class testmanifest(unittest.TestCase):
     def testSetItem(self):
         want = binascii.unhexlify(HASH_1), ''
 
-        m = parsers.lazymanifest('')
+        m = manifestmod._lazymanifest('')
         m['a'] = want
         self.assertIn('a', m)
         self.assertEqual(want, m['a'])
         self.assertEqual('a\0' + HASH_1 + '\n', m.text())
 
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         m['a'] = want
         self.assertEqual(want, m['a'])
         self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST,
@@ -76,7 +76,7 @@  class testmanifest(unittest.TestCase):
     def testCompaction(self):
         unhex = binascii.unhexlify
         h1, h2 = unhex(HASH_1), unhex(HASH_2)
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         m['alpha'] = h1, ''
         m['beta'] = h2, ''
         del m['foo']
@@ -91,8 +91,8 @@  class testmanifest(unittest.TestCase):
         self.assertEqual(w, list(m))
 
     def testSetGetNodeSuffix(self):
-        clean = parsers.lazymanifest(A_SHORT_MANIFEST)
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        clean = manifestmod._lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         h, f = m['foo']
         want = h + 'a', f
         # Merge code wants to set 21-byte fake hashes at times
@@ -120,7 +120,7 @@  class testmanifest(unittest.TestCase):
         self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m))
 
     def testFilterCopyException(self):
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         def filt(path):
             if path == 'foo':
                 assert False
@@ -128,7 +128,7 @@  class testmanifest(unittest.TestCase):
         self.assertRaises(AssertionError, m.filtercopy, filt)
 
     def testRemoveItem(self):
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         del m['foo']
         self.assertRaises(KeyError, lambda : m['foo'])
         self.assertEqual(1, len(m))
@@ -138,9 +138,9 @@  class testmanifest(unittest.TestCase):
         MISSING = (None, '')
         addl = 'z-only-in-left\0' + HASH_1 + '\n'
         addr = 'z-only-in-right\0' + HASH_2 + 'x\n'
-        left = parsers.lazymanifest(
+        left = manifestmod._lazymanifest(
             A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl)
-        right = parsers.lazymanifest(A_SHORT_MANIFEST + addr)
+        right = manifestmod._lazymanifest(A_SHORT_MANIFEST + addr)
         want = {
             'foo': ((binascii.unhexlify(HASH_3), 'x'),
                     (binascii.unhexlify(HASH_1), '')),
@@ -154,14 +154,14 @@  class testmanifest(unittest.TestCase):
             'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')),
             'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')),
             }
-        self.assertEqual(want, parsers.lazymanifest('').diff(left))
+        self.assertEqual(want, manifestmod._lazymanifest('').diff(left))
 
         want = {
             'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'), MISSING),
             'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING),
             'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
             }
-        self.assertEqual(want, left.diff(parsers.lazymanifest('')))
+        self.assertEqual(want, left.diff(manifestmod._lazymanifest('')))
         copy = right.copy()
         del copy['z-only-in-right']
         del right['foo']
@@ -171,7 +171,7 @@  class testmanifest(unittest.TestCase):
             }
         self.assertEqual(want, right.diff(copy))
 
-        short = parsers.lazymanifest(A_SHORT_MANIFEST)
+        short = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         pruned = short.copy()
         del pruned['foo']
         want = {
@@ -192,30 +192,37 @@  class testmanifest(unittest.TestCase):
         backwards = ''.join(
             l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if l)
         try:
-            parsers.lazymanifest(backwards)
+            manifestmod._lazymanifest(backwards)
             self.fail('Should have raised ValueError')
         except ValueError, v:
             self.assertIn('Manifest lines not in sorted order.', str(v))
 
     def testNoTerminalNewline(self):
         try:
-            parsers.lazymanifest(A_SHORT_MANIFEST + 'wat')
+            manifestmod._lazymanifest(A_SHORT_MANIFEST + 'wat')
             self.fail('Should have raised ValueError')
         except ValueError, v:
             self.assertIn('Manifest did not end in a newline.', str(v))
 
     def testNoNewLineAtAll(self):
         try:
-            parsers.lazymanifest('wat')
+            manifestmod._lazymanifest('wat')
             self.fail('Should have raised ValueError')
         except ValueError, v:
             self.assertIn('Manifest did not end in a newline.', str(v))
 
     def testHugeManifest(self):
-        m = parsers.lazymanifest(A_HUGE_MANIFEST)
+        m = manifestmod._lazymanifest(A_HUGE_MANIFEST)
         self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m))
         self.assertEqual(len(m), len(list(m)))
 
+    def testIntersectFiles(self):
+        m = manifestmod.manifestdict(A_HUGE_MANIFEST)
+        m2 = m.intersectfiles(['file1', 'file200', 'file300'])
+        w = ('file1\0%sx\n'
+             'file200\0%sl\n'
+             'file300\0%s\n') % (HASH_2, HASH_1, HASH_1)
+        self.assertEqual(w, m2.text())
 
 if __name__ == '__main__':
     silenttestrunner.main(__name__)