Patchwork [2,of,4,lazymanifest,v5] manifest: use lazymanifest instead of manifestdict

login
register
mail settings
Submitter Augie Fackler
Date March 6, 2015, 9:42 p.m.
Message ID <fce65813a0d09a4c4964.1425678160@arthedain.pit.corp.google.com>
Download mbox | patch
Permalink /patch/7909/
State Changes Requested
Headers show

Comments

Augie Fackler - March 6, 2015, 9:42 p.m.
# HG changeset patch
# User Augie Fackler <augie@google.com>
# Date 1420746668 18000
#      Thu Jan 08 14:51:08 2015 -0500
# Node ID fce65813a0d09a4c4964b987231d7c2de8d9754f
# Parent  2650f8aeeaa3d811f4725be300eb7ffa2f649a45
manifest: use lazymanifest instead of manifestdict

Before any of my related changes on mozilla-central:

perfmanifest tip
! wall 0.268805 comb 0.260000 user 0.260000 sys 0.000000 (best of 37)
perftags
! result: 162
! wall 0.007099 comb 0.000000 user 0.000000 sys 0.000000 (best of 401)
perfstatus
! wall 0.415680 comb 0.420000 user 0.260000 sys 0.160000 (best of 24)
hgperf export tip
! wall 0.142118 comb 0.140000 user 0.140000 sys 0.000000 (best of 67)

after all of my changes on mozilla-central:

./hg:
perfmanifest tip
! wall 0.232640 comb 0.230000 user 0.220000 sys 0.010000 (best of 43)
perftags
! result: 162
! wall 0.007057 comb 0.010000 user 0.000000 sys 0.010000 (best of 395)
perfstatus
! wall 0.415503 comb 0.420000 user 0.280000 sys 0.140000 (best of 24)
hgperf export tip
! wall 0.025096 comb 0.030000 user 0.030000 sys 0.000000 (best of 102)

so it's no real change in performance on perf{manifest,tags,status},
but is a huge win on 'hgperf export tip'.

This should open the door to doing fast hashing on a per-directory
basis for the manifest type, which is the current plan for supporting
narrow-clone-friendly manifests.

There's a little performance work that could still be done here:
fastdelta() could be done significantly more intelligently by using
the internal state of the lazymanifest type in C, but that seems like
good future work.

Users of pure-Python Mercurial will likely want to spend time
optimizing the version of lazymanifest in pure/parsers.py. Right now
it's the smallest amount of work I could do to make Mercurial still
work in pure-Python, and isn't at all lazy.

Patch

diff --git a/hgext/largefiles/reposetup.py b/hgext/largefiles/reposetup.py
--- a/hgext/largefiles/reposetup.py
+++ b/hgext/largefiles/reposetup.py
@@ -38,17 +38,18 @@  def reposetup(ui, repo):
         def __getitem__(self, changeid):
             ctx = super(lfilesrepo, self).__getitem__(changeid)
             if self.lfstatus:
-                class lfilesmanifestdict(manifest.manifestdict):
+                class lfilesmanifest(manifest.lazymanifest):
                     def __contains__(self, filename):
-                        orig = super(lfilesmanifestdict, self).__contains__
+                        orig = super(lfilesmanifest, self).__contains__
                         return orig(filename) or orig(lfutil.standin(filename))
+
                 class lfilesctx(ctx.__class__):
                     def files(self):
                         filenames = super(lfilesctx, self).files()
                         return [lfutil.splitstandin(f) or f for f in filenames]
                     def manifest(self):
                         man1 = super(lfilesctx, self).manifest()
-                        man1.__class__ = lfilesmanifestdict
+                        man1.__class__ = lfilesmanifest
                         return man1
                     def filectx(self, path, fileid=None, filelog=None):
                         orig = super(lfilesctx, self).filectx
diff --git a/mercurial/manifest.c b/mercurial/manifest.c
--- a/mercurial/manifest.c
+++ b/mercurial/manifest.c
@@ -163,7 +163,7 @@  static int lazymanifest_init(lazymanifes
 	self->maxlines = DEFAULT_LINES;
 	self->numlines = 0;
 	if (!self->lines)
-		ret = -2;
+		ret = MANIFEST_OOM;
 	else
 		ret = find_lines(self, data, len);
 	Py_END_ALLOW_THREADS
diff --git a/mercurial/manifest.py b/mercurial/manifest.py
--- a/mercurial/manifest.py
+++ b/mercurial/manifest.py
@@ -6,36 +6,127 @@ 
 # GNU General Public License version 2 or any later version.
 
 from i18n import _
-import mdiff, parsers, error, revlog, util
+import mdiff, parsers, error, revlog, util, node
 import array, struct
 
-class manifestdict(dict):
-    def __init__(self):
-        self._flags = {}
+
+class _lazymanifest(dict):
+    """This is the pure implementation of lazymanifest.
+
+    It has not been optimized *at all* and is not lazy.
+    """
+
+    def __init__(self, data):
+        # This init method does a little bit of excessive-looking
+        # precondition checking. This is so that the behavior of this
+        # class exactly matches its C counterpart to try and help
+        # prevent surprise breakage for anyone that develops against
+        # the pure version.
+        if data and data[-1] != '\n':
+            raise ValueError('Manifest did not end in a newline.')
+        dict.__init__(self)
+        prev = None
+        for l in data.splitlines():
+            if prev is not None and prev > l:
+                raise ValueError('Manifest lines not in sorted order.')
+            prev = l
+            f, n = l.split('\0')
+            if len(n) > 40:
+                self[f] = node.bin(n[:40]), n[40:]
+            else:
+                self[f] = node.bin(n), ''
+
     def __setitem__(self, k, v):
-        assert v is not None
-        dict.__setitem__(self, k, v)
-    def flags(self, f):
-        return self._flags.get(f, "")
-    def setflag(self, f, flags):
-        """Set the flags (symlink, executable) for path f."""
-        self._flags[f] = flags
+        node, flag = v
+        assert node is not None
+        if len(node) > 21:
+            node = node[:21] # match c implementation behavior
+        dict.__setitem__(self, k, (node, flag))
+
+    def __iter__(self):
+        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
+
     def copy(self):
-        copy = manifestdict()
-        dict.__init__(copy, self)
-        copy._flags = dict.copy(self._flags)
-        return copy
+        c = _lazymanifest('')
+        c.update(self)
+        return c
+
+    def diff(self, m2, clean=False):
+        '''Finds changes between the current manifest and m2.'''
+        diff = {}
+
+        for fn, e1 in self.iteritems():
+            if fn not in m2:
+                diff[fn] = e1, (None, '')
+            else:
+                e2 = m2[fn]
+                if e1 != e2:
+                    diff[fn] = e1, e2
+                elif clean:
+                    diff[fn] = None
+
+        for fn, e2 in m2.iteritems():
+            if fn not in self:
+                diff[fn] = (None, ''), e2
+
+        return diff
+
+    def filtercopy(self, filterfn):
+        c = _lazymanifest('')
+        for f, n, fl in self:
+            if filterfn(f):
+                c[f] = n, fl
+        return c
+
+    def text(self):
+        """Get the full data of this manifest as a bytestring."""
+        fl = sorted(self)
+
+        _hex = node.hex
+        # if this is changed to support newlines in filenames,
+        # be sure to check the templates/ dir again (especially *-raw.tmpl)
+        return ''.join("%s\0%s%s\n" % (
+            f, _hex(n[:20]), flag) for f, n, flag in fl)
+
+try:
+    _lazymanifest = parsers.lazymanifest
+except AttributeError:
+    pass
+
+class lazymanifest(object):
+    def __init__(self, data):
+        self._lm = _lazymanifest(data)
+
+    def __getitem__(self, key):
+        return self._lm[key][0]
+
+    def __len__(self):
+        return len(self._lm)
+
+    def __setitem__(self, key, node):
+        self._lm[key] = node, self.flags(key, '')
+
+    def __contains__(self, key):
+        return key in self._lm
+
+    def __delitem__(self, key):
+        del self._lm[key]
+
+    def keys(self):
+        return [x[0] for x in self._lm]
+
+    def iterkeys(self):
+        return iter(self.keys())
+
     def intersectfiles(self, files):
-        '''make a new manifestdict with the intersection of self with files
+        '''make a new lazymanifest with the intersection of self with files
 
         The algorithm assumes that files is much smaller than self.'''
-        ret = manifestdict()
+        ret = lazymanifest('')
+        lm = self._lm
         for fn in files:
-            if fn in self:
-                ret[fn] = self[fn]
-                flags = self._flags.get(fn, None)
-                if flags:
-                    ret._flags[fn] = flags
+            if fn in lm:
+                ret._lm[fn] = self._lm[fn]
         return ret
 
     def filesnotin(self, m2):
@@ -54,11 +145,9 @@  class manifestdict(dict):
             (not match.anypats() and util.all(fn in self for fn in files))):
             return self.intersectfiles(files)
 
-        m = self.copy()
-        for fn in m.keys():
-            if not match(fn):
-                del m[fn]
-        return m
+        lm = lazymanifest('')
+        lm._lm = self._lm.filtercopy(match)
+        return lm
 
     def diff(self, m2, clean=False):
         '''Finds changes between the current manifest and m2.
@@ -75,35 +164,36 @@  class manifestdict(dict):
         the nodeid will be None and the flags will be the empty
         string.
         '''
-        diff = {}
+        return self._lm.diff(m2._lm, clean)
 
-        for fn, n1 in self.iteritems():
-            fl1 = self._flags.get(fn, '')
-            n2 = m2.get(fn, None)
-            fl2 = m2._flags.get(fn, '')
-            if n2 is None:
-                fl2 = ''
-            if n1 != n2 or fl1 != fl2:
-                diff[fn] = ((n1, fl1), (n2, fl2))
-            elif clean:
-                diff[fn] = None
+    def setflag(self, key, flag):
+        self._lm[key] = self[key], flag
 
-        for fn, n2 in m2.iteritems():
-            if fn not in self:
-                fl2 = m2._flags.get(fn, '')
-                diff[fn] = ((None, ''), (n2, fl2))
+    def get(self, key, default=None):
+        try:
+            return self._lm[key][0]
+        except KeyError:
+            return default
 
-        return diff
+    def flags(self, key, default=''):
+        try:
+            return self._lm[key][1]
+        except KeyError:
+            return default
+
+    def __iter__(self):
+        return (x[0] for x in self._lm)
+
+    def copy(self):
+        c = lazymanifest('')
+        c._lm = self._lm.copy()
+        return c
+
+    def iteritems(self):
+        return (x[:2] for x in self._lm)
 
     def text(self):
-        """Get the full data of this manifest as a bytestring."""
-        fl = sorted(self)
-        _checkforbidden(fl)
-
-        hex, flags = revlog.hex, self.flags
-        # if this is changed to support newlines in filenames,
-        # be sure to check the templates/ dir again (especially *-raw.tmpl)
-        return ''.join("%s\0%s%s\n" % (f, hex(self[f]), flags(f)) for f in fl)
+        return self._lm.text()
 
     def fastdelta(self, base, changes):
         """Given a base manifest text as an array.array and a list of changes
@@ -123,7 +213,8 @@  class manifestdict(dict):
             # bs will either be the index of the item or the insert point
             start, end = _msearch(addbuf, f, start)
             if not todelete:
-                l = "%s\0%s%s\n" % (f, revlog.hex(self[f]), self.flags(f))
+                h, fl = self._lm[f]
+                l = "%s\0%s%s\n" % (f, revlog.hex(h), fl)
             else:
                 if start == end:
                     # item we want to delete was not found, error out
@@ -217,11 +308,6 @@  def _addlistdelta(addlist, x):
                    + content for start, end, content in x)
     return deltatext, newaddlist
 
-def _parse(lines):
-    mfdict = manifestdict()
-    parsers.parse_manifest(mfdict, mfdict._flags, lines)
-    return mfdict
-
 class manifest(revlog.revlog):
     def __init__(self, opener):
         # During normal operations, we expect to deal with not more than four
@@ -236,7 +322,8 @@  class manifest(revlog.revlog):
 
     def readdelta(self, node):
         r = self.rev(node)
-        return _parse(mdiff.patchtext(self.revdiff(self.deltaparent(r), r)))
+        data = mdiff.patchtext(self.revdiff(self.deltaparent(r), r))
+        return lazymanifest(data)
 
     def readfast(self, node):
         '''use the faster of readdelta or read'''
@@ -248,30 +335,28 @@  class manifest(revlog.revlog):
 
     def read(self, node):
         if node == revlog.nullid:
-            return manifestdict() # don't upset local cache
+            return lazymanifest('') # don't upset local cache
         if node in self._mancache:
             return self._mancache[node][0]
         text = self.revision(node)
         arraytext = array.array('c', text)
-        m = _parse(text)
-        self._mancache[node] = (m, arraytext)
-        return m
+        mapping = lazymanifest(text)
+        self._mancache[node] = (mapping, arraytext)
+        return mapping
 
     def find(self, node, f):
         '''look up entry for a single file efficiently.
         return (node, flags) pair if found, (None, None) if not.'''
         if node in self._mancache:
-            m = self._mancache[node][0]
-            return m.get(f), m.flags(f)
+            mapping = self._mancache[node][0]
+            return mapping.get(f), mapping.flags(f)
         text = self.revision(node)
-        start, end = _msearch(text, f)
-        if start == end:
+        try:
+            return lazymanifest(text)._lm[f]
+        except KeyError:
             return None, None
-        l = text[start:end]
-        f, n = l.split('\0')
-        return revlog.bin(n[:40]), n[40:-1]
 
-    def add(self, m, transaction, link, p1, p2, added, removed):
+    def add(self, map, transaction, link, p1, p2, added, removed):
         if p1 in self._mancache:
             # If our first parent is in the manifest cache, we can
             # compute a delta here using properties we know about the
@@ -286,7 +371,7 @@  class manifest(revlog.revlog):
             # since the lists are already sorted
             work.sort()
 
-            arraytext, deltatext = m.fastdelta(self._mancache[p1][1], work)
+            arraytext, deltatext = map.fastdelta(self._mancache[p1][1], work)
             cachedelta = self.rev(p1), deltatext
             text = util.buffer(arraytext)
         else:
@@ -294,11 +379,11 @@  class manifest(revlog.revlog):
             # just encode a fulltext of the manifest and pass that
             # through to the revlog layer, and let it handle the delta
             # process.
-            text = m.text()
+            text = map.text()
             arraytext = array.array('c', text)
             cachedelta = None
 
         n = self.addrevision(text, transaction, link, p1, p2, cachedelta)
-        self._mancache[n] = (m, arraytext)
+        self._mancache[n] = (map, arraytext)
 
         return n
diff --git a/mercurial/pure/parsers.py b/mercurial/pure/parsers.py
--- a/mercurial/pure/parsers.py
+++ b/mercurial/pure/parsers.py
@@ -5,7 +5,7 @@ 
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
-from mercurial.node import bin, nullid
+from mercurial.node import nullid
 from mercurial import util
 import struct, zlib, cStringIO
 
@@ -21,15 +21,6 @@  def dirstatetuple(*x):
     # x is a tuple
     return x
 
-def parse_manifest(mfdict, fdict, lines):
-    for l in lines.splitlines():
-        f, n = l.split('\0')
-        if len(n) > 40:
-            fdict[f] = n[40:]
-            mfdict[f] = bin(n[:40])
-        else:
-            mfdict[f] = bin(n)
-
 def parse_index2(data, inline):
     def gettype(q):
         return int(q & 0xFFFF)
diff --git a/tests/test-manifest.py b/tests/test-manifest.py
--- a/tests/test-manifest.py
+++ b/tests/test-manifest.py
@@ -4,7 +4,7 @@  import itertools
 
 import silenttestrunner
 
-from mercurial import parsers
+from mercurial import manifest as manifestmod
 
 HASH_1 = '1' * 40
 HASH_2 = 'f' * 40
@@ -38,12 +38,12 @@  class testmanifest(unittest.TestCase):
         self.assert_(thing in container, msg)
 
     def testEmptyManifest(self):
-        m = parsers.lazymanifest('')
+        m = manifestmod._lazymanifest('')
         self.assertEqual(0, len(m))
         self.assertEqual([], list(m))
 
     def testManifest(self):
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         want = [
             ('bar/baz/qux.py', binascii.unhexlify(HASH_2), 'l'),
             ('foo', binascii.unhexlify(HASH_1), ''),
@@ -58,13 +58,13 @@  class testmanifest(unittest.TestCase):
     def testSetItem(self):
         want = binascii.unhexlify(HASH_1), ''
 
-        m = parsers.lazymanifest('')
+        m = manifestmod._lazymanifest('')
         m['a'] = want
         self.assertIn('a', m)
         self.assertEqual(want, m['a'])
         self.assertEqual('a\0' + HASH_1 + '\n', m.text())
 
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         m['a'] = want
         self.assertEqual(want, m['a'])
         self.assertEqual('a\0' + HASH_1 + '\n' + A_SHORT_MANIFEST,
@@ -76,7 +76,7 @@  class testmanifest(unittest.TestCase):
     def testCompaction(self):
         unhex = binascii.unhexlify
         h1, h2 = unhex(HASH_1), unhex(HASH_2)
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         m['alpha'] = h1, ''
         m['beta'] = h2, ''
         del m['foo']
@@ -91,8 +91,8 @@  class testmanifest(unittest.TestCase):
         self.assertEqual(w, list(m))
 
     def testSetGetNodeSuffix(self):
-        clean = parsers.lazymanifest(A_SHORT_MANIFEST)
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        clean = manifestmod._lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         h, f = m['foo']
         want = h + 'a', f
         # Merge code wants to set 21-byte fake hashes at times
@@ -120,7 +120,7 @@  class testmanifest(unittest.TestCase):
         self.assertEqual({'foo': ((h, ''), want)}, clean.diff(m))
 
     def testFilterCopyException(self):
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         def filt(path):
             if path == 'foo':
                 assert False
@@ -128,7 +128,7 @@  class testmanifest(unittest.TestCase):
         self.assertRaises(AssertionError, m.filtercopy, filt)
 
     def testRemoveItem(self):
-        m = parsers.lazymanifest(A_SHORT_MANIFEST)
+        m = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         del m['foo']
         self.assertRaises(KeyError, lambda : m['foo'])
         self.assertEqual(1, len(m))
@@ -138,9 +138,9 @@  class testmanifest(unittest.TestCase):
         MISSING = (None, '')
         addl = 'z-only-in-left\0' + HASH_1 + '\n'
         addr = 'z-only-in-right\0' + HASH_2 + 'x\n'
-        left = parsers.lazymanifest(
+        left = manifestmod._lazymanifest(
             A_SHORT_MANIFEST.replace(HASH_1, HASH_3 + 'x') + addl)
-        right = parsers.lazymanifest(A_SHORT_MANIFEST + addr)
+        right = manifestmod._lazymanifest(A_SHORT_MANIFEST + addr)
         want = {
             'foo': ((binascii.unhexlify(HASH_3), 'x'),
                     (binascii.unhexlify(HASH_1), '')),
@@ -154,14 +154,14 @@  class testmanifest(unittest.TestCase):
             'foo': (MISSING, (binascii.unhexlify(HASH_3), 'x')),
             'z-only-in-left': (MISSING, (binascii.unhexlify(HASH_1), '')),
             }
-        self.assertEqual(want, parsers.lazymanifest('').diff(left))
+        self.assertEqual(want, manifestmod._lazymanifest('').diff(left))
 
         want = {
             'bar/baz/qux.py': ((binascii.unhexlify(HASH_2), 'l'), MISSING),
             'foo': ((binascii.unhexlify(HASH_3), 'x'), MISSING),
             'z-only-in-left': ((binascii.unhexlify(HASH_1), ''), MISSING),
             }
-        self.assertEqual(want, left.diff(parsers.lazymanifest('')))
+        self.assertEqual(want, left.diff(manifestmod._lazymanifest('')))
         copy = right.copy()
         del copy['z-only-in-right']
         del right['foo']
@@ -171,7 +171,7 @@  class testmanifest(unittest.TestCase):
             }
         self.assertEqual(want, right.diff(copy))
 
-        short = parsers.lazymanifest(A_SHORT_MANIFEST)
+        short = manifestmod._lazymanifest(A_SHORT_MANIFEST)
         pruned = short.copy()
         del pruned['foo']
         want = {
@@ -192,30 +192,37 @@  class testmanifest(unittest.TestCase):
         backwards = ''.join(
             l + '\n' for l in reversed(A_SHORT_MANIFEST.split('\n')) if l)
         try:
-            parsers.lazymanifest(backwards)
+            manifestmod._lazymanifest(backwards)
             self.fail('Should have raised ValueError')
         except ValueError, v:
             self.assertIn('Manifest lines not in sorted order.', str(v))
 
     def testNoTerminalNewline(self):
         try:
-            parsers.lazymanifest(A_SHORT_MANIFEST + 'wat')
+            manifestmod._lazymanifest(A_SHORT_MANIFEST + 'wat')
             self.fail('Should have raised ValueError')
         except ValueError, v:
             self.assertIn('Manifest did not end in a newline.', str(v))
 
     def testNoNewLineAtAll(self):
         try:
-            parsers.lazymanifest('wat')
+            manifestmod._lazymanifest('wat')
             self.fail('Should have raised ValueError')
         except ValueError, v:
             self.assertIn('Manifest did not end in a newline.', str(v))
 
     def testHugeManifest(self):
-        m = parsers.lazymanifest(A_HUGE_MANIFEST)
+        m = manifestmod._lazymanifest(A_HUGE_MANIFEST)
         self.assertEqual(HUGE_MANIFEST_ENTRIES, len(m))
         self.assertEqual(len(m), len(list(m)))
 
+    def testIntersectFiles(self):
+        m = manifestmod.lazymanifest(A_HUGE_MANIFEST)
+        m2 = m.intersectfiles(['file1', 'file200', 'file300'])
+        w = ('file1\0%sx\n'
+             'file200\0%sl\n'
+             'file300\0%s\n') % (HASH_2, HASH_1, HASH_1)
+        self.assertEqual(w, m2.text())
 
 if __name__ == '__main__':
     silenttestrunner.main(__name__)