Patchwork lazymanifest: write a more efficient, pypy friendly version of lazymanifest

login
register
mail settings
Submitter Maciej Fijalkowski
Date Sept. 12, 2016, 12:36 p.m.
Message ID <a43b7edeb6d66afc5c4e.1473683779@brick.arcode.com>
Download mbox | patch
Permalink /patch/16590/
State Accepted
Headers show

Comments

Maciej Fijalkowski - Sept. 12, 2016, 12:36 p.m.
# HG changeset patch
# User Maciej Fijalkowski <fijall@gmail.com>
# Date 1473680234 -7200
#      Mon Sep 12 13:37:14 2016 +0200
# Node ID a43b7edeb6d66afc5c4eab9e4c8bb35b94cbfa97
# Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
lazymanifest: write a more efficient, pypy friendly version of lazymanifest
Augie Fackler - Sept. 12, 2016, 1:13 p.m.
On Mon, Sep 12, 2016 at 02:36:19PM +0200, Maciej Fijalkowski wrote:
> # HG changeset patch
> # User Maciej Fijalkowski <fijall@gmail.com>
> # Date 1473680234 -7200
> #      Mon Sep 12 13:37:14 2016 +0200
> # Node ID a43b7edeb6d66afc5c4eab9e4c8bb35b94cbfa97
> # Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
> lazymanifest: write a more efficient, pypy friendly version of lazymanifest

Queued this. Looks like a clear improvement over what was there
before, even with the comments that have questions.

>
> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> --- a/mercurial/manifest.py
> +++ b/mercurial/manifest.py
> @@ -104,69 +104,298 @@
>      _checkforbidden(files)
>      return ''.join(lines)
>
> -class _lazymanifest(dict):
> -    """This is the pure implementation of lazymanifest.
> -
> -    It has not been optimized *at all* and is not lazy.
> -    """
> -
> -    def __init__(self, data):
> -        dict.__init__(self)
> -        for f, n, fl in _parse(data):
> -            self[f] = n, fl
> -
> -    def __setitem__(self, k, v):
> -        node, flag = v
> -        assert node is not None
> -        if len(node) > 21:
> -            node = node[:21] # match c implementation behavior
> -        dict.__setitem__(self, k, (node, flag))
> +class LazyManifestIter(object):
> +    def __init__(self, lm):
> +        self.pos = 0
> +        self.lm = lm
>
>      def __iter__(self):
> -        return iter(sorted(dict.keys(self)))
> +        return self
>
> -    def iterkeys(self):
> -        return iter(sorted(dict.keys(self)))
> +    def next(self):
> +        try:
> +            data, pos = self.lm._get(self.pos)
> +        except IndexError:
> +            raise StopIteration
> +        if pos == -1:
> +            self.pos += 1
> +            return data[0]
> +        self.pos += 1
> +        zeropos = data.find('\x00', pos)
> +        return data[pos:zeropos]
>
> -    def iterentries(self):
> -        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
> +class LazyManifestIterEntries(object):
> +    def __init__(self, lm):
> +        self.lm = lm
> +        self.pos = 0
> +
> +    def __iter__(self):
> +        return self
> +
> +    def next(self):
> +        try:
> +            data, pos = self.lm._get(self.pos)
> +        except IndexError:
> +            raise StopIteration
> +        if pos == -1:
> +            self.pos += 1
> +            return data
> +        zeropos = data.find('\x00', pos)
> +        hashval = unhexlify(data, self.lm.extrainfo[self.pos],
> +                            zeropos + 1, 40)
> +        flags = self.lm._getflags(data, self.pos, zeropos)
> +        self.pos += 1
> +        return (data[pos:zeropos], hashval, flags)
> +
> +def unhexlify(data, extra, pos, length):
> +    s = data[pos:pos + length].decode('hex')
> +    if extra:
> +        s += chr(extra & 0xff)
> +    return s
> +
> +def _cmp(a, b):
> +    return (a > b) - (a < b)
> +
> +class _lazymanifest(object):
> +    def __init__(self, data, positions=None, extrainfo=None, extradata=None):
> +        if positions is None:
> +            self.positions = self.findlines(data)
> +            self.extrainfo = [0] * len(self.positions)
> +            self.data = data
> +            self.extradata = []
> +        else:
> +            self.positions = positions[:]
> +            self.extrainfo = extrainfo[:]
> +            self.extradata = extradata[:]
> +            self.data = data
> +
> +    def findlines(self, data):
> +        if not data:
> +            return []
> +        pos = data.find("\n")
> +        positions = [0]
> +        while pos < len(data) - 1 and pos != -1:
> +            positions.append(pos + 1)
> +            pos = data.find("\n", pos + 1)
> +        return positions
> +
> +    def _get(self, index):
> +        # get the position encoded in pos:
> +        #   positive number is an index in 'data'
> +        #   negative number is in extrapieces
> +        pos = self.positions[index]
> +        if pos >= 0:
> +            return self.data, pos
> +        return self.extradata[-pos - 1], -1
> +
> +    def _getkey(self, pos):
> +        if pos >= 0:
> +            return self.data[pos:self.data.find('\x00', pos + 1)]
> +        return self.extradata[-pos - 1][0]
> +
> +    def bsearch(self, key):
> +        first = 0
> +        last = len(self.positions) - 1
> +
> +        while first <= last:
> +            midpoint = (first + last)//2
> +            nextpos = self.positions[midpoint]
> +            candidate = self._getkey(nextpos)
> +            r = _cmp(key, candidate)
> +            if r == 0:
> +                return midpoint
> +            else:
> +                if r < 0:
> +                    last = midpoint - 1
> +                else:
> +                    first = midpoint + 1
> +        return -1
> +
> +    def bsearch2(self, key):
> +        # same as the above, but will always return the position
> +        # done for performance reasons
> +        first = 0
> +        last = len(self.positions) - 1
> +
> +        while first <= last:
> +            midpoint = (first + last)//2
> +            nextpos = self.positions[midpoint]
> +            candidate = self._getkey(nextpos)
> +            r = _cmp(key, candidate)
> +            if r == 0:
> +                return (midpoint, True)
> +            else:
> +                if r < 0:
> +                    last = midpoint - 1
> +                else:
> +                    first = midpoint + 1
> +        return (first, False)
> +
> +    def __contains__(self, key):
> +        return self.bsearch(key) != -1
> +
> +    def _getflags(self, data, needle, pos):
> +        start = pos + 41
> +        if needle < 0 or needle == len(self.positions) - 1:
> +            end = len(data) - 1
> +        else:
> +            end = data.find("\n", start)
> +        if start == end:
> +            return ''
> +        return self.data[start:end]
> +
> +    def __getitem__(self, key):
> +        if not isinstance(key, str):
> +            raise TypeError("setitem: manifest keys must be a string.")
> +        needle = self.bsearch(key)
> +        if needle == -1:
> +            raise KeyError
> +        data, pos = self._get(needle)
> +        if pos == -1:
> +            return (data[1], data[2])
> +        zeropos = data.find('\x00', pos)
> +        assert 0 <= needle <= len(self.positions)
> +        assert len(self.extrainfo) == len(self.positions)
> +        hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
> +        flags = self._getflags(data, needle, zeropos)
> +        return (hashval, flags)
> +
> +    def __delitem__(self, key):
> +        needle, found = self.bsearch2(key)
> +        if not found:
> +            raise KeyError
> +        cur = self.positions[needle]
> +        self.positions = self.positions[:needle] + self.positions[needle + 1:]
> +        self.extrainfo = self.extrainfo[:needle] + self.extrainfo[needle + 1:]
> +        if cur >= 0:
> +            self.data = self.data[:cur] + '\x00' + self.data[cur + 1:]
> +
> +    def __setitem__(self, key, value):
> +        if not isinstance(key, str):
> +            raise TypeError("setitem: manifest keys must be a string.")
> +        if not isinstance(value, tuple) or len(value) != 2:
> +            raise TypeError("Manifest values must be a tuple of (node, flags).")
> +        hashval = value[0]
> +        if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
> +            raise TypeError("node must be a 20-byte string")
> +        flags = value[1]
> +        if not isinstance(flags, str) or len(flags) > 1:
> +            raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
> +        needle, found = self.bsearch2(key)
> +        if found:
> +            # put the item
> +            pos = self.positions[needle]
> +            if pos < 0:
> +                self.extradata[-pos - 1] = (key, value[0], value[1])
> +            else:
> +                # just don't bother
> +                self.extradata.append((key, value[0], value[1]))
> +                self.positions[needle] = -len(self.extradata)
> +        else:
> +            # not found, put it in with extra positions
> +            self.extradata.append((key, value[0], value[1]))
> +            self.positions = (self.positions[:needle] + [-len(self.extradata)]
> +                              + self.positions[needle:])
> +            self.extrainfo = (self.extrainfo[:needle] + [0] +
> +                              self.extrainfo[needle:])
>
>      def copy(self):
> -        c = _lazymanifest('')
> -        c.update(self)
> -        return c
> +        # XXX call _compact like in C?
> +        return _lazymanifest(self.data, self.positions, self.extrainfo,
> +            self.extradata)
> +
> +    def _compact(self):
> +        # hopefully not called TOO often
> +        def findnextposition(positions, start, lgt):
> +            i = start
> +            while i < len(positions):
> +                if positions[i] >= 0:
> +                    return positions[i]
> +                i += 1
> +            return lgt
> +
> +        if len(self.extradata) == 0:
> +            return
> +        l = []
> +        last_cut = 0
> +        i = 0
> +        offset = 0
> +        self.extrainfo = [0] * len(self.positions)
> +        while i < len(self.positions):
> +            if self.positions[i] >= 0:
> +                cur = self.positions[i]
> +                last_cut = cur
> +                while True:
> +                    self.positions[i] = offset
> +                    i += 1
> +                    if i == len(self.positions) or self.positions[i] < 0:
> +                        break
> +                    offset += self.positions[i] - cur
> +                    cur = self.positions[i]
> +                end_cut = findnextposition(self.positions, i, len(self.data))
> +                offset += end_cut - cur
> +                l.append(self.data[last_cut:end_cut])
> +            else:
> +                while i < len(self.positions) and self.positions[i] < 0:
> +                    cur = self.positions[i]
> +                    t = self.extradata[-cur - 1]
> +                    l.append(self._pack(t))
> +                    self.positions[i] = offset
> +                    if len(t[1]) > 20:
> +                        self.extrainfo[i] = ord(t[1][21])
> +                    offset += len(l[-1])
> +                    i += 1
> +        self.data = ''.join(l)
> +        self.extradata = []
> +
> +    def _pack(self, d):
> +        return d[0] + '\x00' + d[1][:20].encode('hex') + d[2] + '\n'
> +
> +    def text(self):
> +        self._compact()
> +        return self.data
>
>      def diff(self, m2, clean=False):
>          '''Finds changes between the current manifest and m2.'''
> +        # XXX think whether efficiency matters here
>          diff = {}
>
> -        for fn, e1 in self.iteritems():
> +        for fn, e1, flags in self.iterentries():
>              if fn not in m2:
> -                diff[fn] = e1, (None, '')
> +                diff[fn] = (e1, flags), (None, '')
>              else:
>                  e2 = m2[fn]
> -                if e1 != e2:
> -                    diff[fn] = e1, e2
> +                if (e1, flags) != e2:
> +                    diff[fn] = (e1, flags), e2
>                  elif clean:
>                      diff[fn] = None
>
> -        for fn, e2 in m2.iteritems():
> +        for fn, e2, flags in m2.iterentries():
>              if fn not in self:
> -                diff[fn] = (None, ''), e2
> +                diff[fn] = (None, ''), (e2, flags)
>
>          return diff
>
> +    def iterentries(self):
> +        return LazyManifestIterEntries(self)
> +
> +    def iterkeys(self):
> +        return LazyManifestIter(self)
> +
> +    def __iter__(self):
> +        return LazyManifestIter(self)
> +
> +    def __len__(self):
> +        return len(self.positions)
> +
>      def filtercopy(self, filterfn):
> +        # XXX can be optimized
>          c = _lazymanifest('')
>          for f, n, fl in self.iterentries():
>              if filterfn(f):
>                  c[f] = n, fl
>          return c
>
> -    def text(self):
> -        """Get the full data of this manifest as a bytestring."""
> -        return _textv1(self.iterentries())
> -
>  try:
>      _lazymanifest = parsers.lazymanifest
>  except AttributeError:
> diff --git a/mercurial/pure/bdiff.py b/mercurial/pure/bdiff.py
> --- a/mercurial/pure/bdiff.py
> +++ b/mercurial/pure/bdiff.py
> @@ -111,8 +111,8 @@
>          def blocks(sa, sb):
>              a = ffi.new("struct bdiff_line**")
>              b = ffi.new("struct bdiff_line**")
> -            ac = ffi.new("char[]", sa)
> -            bc = ffi.new("char[]", sb)
> +            ac = ffi.new("char[]", str(sa))
> +            bc = ffi.new("char[]", str(sb))
>              l = ffi.new("struct bdiff_hunk*")
>              try:
>                  an = lib.bdiff_splitlines(ac, len(sa), a)
> @@ -138,8 +138,8 @@
>          def bdiff(sa, sb):
>              a = ffi.new("struct bdiff_line**")
>              b = ffi.new("struct bdiff_line**")
> -            ac = ffi.new("char[]", sa)
> -            bc = ffi.new("char[]", sb)
> +            ac = ffi.new("char[]", str(sa))
> +            bc = ffi.new("char[]", str(sb))
>              l = ffi.new("struct bdiff_hunk*")
>              try:
>                  an = lib.bdiff_splitlines(ac, len(sa), a)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Pierre-Yves David - Sept. 13, 2016, 10:47 a.m.
On 09/12/2016 03:13 PM, Augie Fackler wrote:
> On Mon, Sep 12, 2016 at 02:36:19PM +0200, Maciej Fijalkowski wrote:
>> # HG changeset patch
>> # User Maciej Fijalkowski <fijall@gmail.com>
>> # Date 1473680234 -7200
>> #      Mon Sep 12 13:37:14 2016 +0200
>> # Node ID a43b7edeb6d66afc5c4eab9e4c8bb35b94cbfa97
>> # Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
>> lazymanifest: write a more efficient, pypy friendly version of lazymanifest
>
> Queued this. Looks like a clear improvement over what was there
> before, even with the comments that have questions.

The lack of documentation makes it a bit hard to review but it seems 
overall correct, The LazyManifestIter and LazyManifestIterEntries class 
name are wrong as we don't use CamelCase (so lazymanifestiter and 
lazymanifestiterentries). There is also a small error in the __getitem__ 
error message (says setitem)

However, tests do not pass with this patch so I've dropped it.

Failed test-remove.t: output changed
Failed test-subrepo-deep-nested-change.t: output changed
Failed test-backout.t: output changed
Failed test-rebase-detach.t: output changed
Failed test-transplant.t: output changed
Failed test-manifest.py: output changed and returned error code 1
Failed test-update-issue1456.t: output changed
Pierre-Yves David - Sept. 13, 2016, 10:52 a.m.
On 09/13/2016 12:47 PM, Pierre-Yves David wrote:
>
>
> On 09/12/2016 03:13 PM, Augie Fackler wrote:
>> On Mon, Sep 12, 2016 at 02:36:19PM +0200, Maciej Fijalkowski wrote:
>>> # HG changeset patch
>>> # User Maciej Fijalkowski <fijall@gmail.com>
>>> # Date 1473680234 -7200
>>> #      Mon Sep 12 13:37:14 2016 +0200
>>> # Node ID a43b7edeb6d66afc5c4eab9e4c8bb35b94cbfa97
>>> # Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
>>> lazymanifest: write a more efficient, pypy friendly version of
>>> lazymanifest
>>
>> Queued this. Looks like a clear improvement over what was there
>> before, even with the comments that have questions.
>
> The lack of documentation makes it a bit hard to review but it seems
> overall correct, The LazyManifestIter and LazyManifestIterEntries class
> name are wrong as we don't use CamelCase (so lazymanifestiter and
> lazymanifestiterentries). There is also a small error in the __getitem__
> error message (says setitem)
>
> However, tests do not pass with this patch so I've dropped it.

fijal, you can `hg pull 
https://www.mercurial-scm.org/repo/users/marmoute/mercurial-dropped/ -r 
bbc5c39158c8` get the version with the style and message version 
highlighted above.
Pierre-Yves David - Sept. 13, 2016, 10:59 a.m.
On 09/13/2016 12:52 PM, Pierre-Yves David wrote:
>
>
> On 09/13/2016 12:47 PM, Pierre-Yves David wrote:
>>
>>
>> On 09/12/2016 03:13 PM, Augie Fackler wrote:
>>> On Mon, Sep 12, 2016 at 02:36:19PM +0200, Maciej Fijalkowski wrote:
>>>> # HG changeset patch
>>>> # User Maciej Fijalkowski <fijall@gmail.com>
>>>> # Date 1473680234 -7200
>>>> #      Mon Sep 12 13:37:14 2016 +0200
>>>> # Node ID a43b7edeb6d66afc5c4eab9e4c8bb35b94cbfa97
>>>> # Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
>>>> lazymanifest: write a more efficient, pypy friendly version of
>>>> lazymanifest
>>>
>>> Queued this. Looks like a clear improvement over what was there
>>> before, even with the comments that have questions.
>>
>> The lack of documentation makes it a bit hard to review but it seems
>> overall correct, The LazyManifestIter and LazyManifestIterEntries class
>> name are wrong as we don't use CamelCase (so lazymanifestiter and
>> lazymanifestiterentries). There is also a small error in the __getitem__
>> error message (says setitem)
>>
>> However, tests do not pass with this patch so I've dropped it.
>
> fijal, you can `hg pull
> https://www.mercurial-scm.org/repo/users/marmoute/mercurial-dropped/ -r
> bbc5c39158c8` get the version with the style and message version
> highlighted above.

Gah, make this -r bbc5c39158c8 because I cannot copy paste properly :-/
Maciej Fijalkowski - Sept. 17, 2016, 9:17 a.m.
Hi

When I'm trying to do that, I get that:

https://paste.pound-python.org/show/HZsFNvYPt8EAH2ugETpx/

and then it hangs (for both system hg and local hg)

On Tue, Sep 13, 2016 at 12:59 PM, Pierre-Yves David
<pierre-yves.david@ens-lyon.org> wrote:
>
>
> On 09/13/2016 12:52 PM, Pierre-Yves David wrote:
>>
>>
>>
>> On 09/13/2016 12:47 PM, Pierre-Yves David wrote:
>>>
>>>
>>>
>>> On 09/12/2016 03:13 PM, Augie Fackler wrote:
>>>>
>>>> On Mon, Sep 12, 2016 at 02:36:19PM +0200, Maciej Fijalkowski wrote:
>>>>>
>>>>> # HG changeset patch
>>>>> # User Maciej Fijalkowski <fijall@gmail.com>
>>>>> # Date 1473680234 -7200
>>>>> #      Mon Sep 12 13:37:14 2016 +0200
>>>>> # Node ID a43b7edeb6d66afc5c4eab9e4c8bb35b94cbfa97
>>>>> # Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
>>>>> lazymanifest: write a more efficient, pypy friendly version of
>>>>> lazymanifest
>>>>
>>>>
>>>> Queued this. Looks like a clear improvement over what was there
>>>> before, even with the comments that have questions.
>>>
>>>
>>> The lack of documentation makes it a bit hard to review but it seems
>>> overall correct, The LazyManifestIter and LazyManifestIterEntries class
>>> name are wrong as we don't use CamelCase (so lazymanifestiter and
>>> lazymanifestiterentries). There is also a small error in the __getitem__
>>> error message (says setitem)
>>>
>>> However, tests do not pass with this patch so I've dropped it.
>>
>>
>> fijal, you can `hg pull
>> https://www.mercurial-scm.org/repo/users/marmoute/mercurial-dropped/ -r
>> bbc5c39158c8` get the version with the style and message version
>> highlighted above.
>
>
> Gah, make this -r bbc5c39158c8 because I cannot copy paste properly :-/
>
> --
> Pierre-Yves David
Pierre-Yves David - Sept. 17, 2016, 10:25 a.m.
On 09/17/2016 11:17 AM, Maciej Fijalkowski wrote:
> Hi
>
> When I'm trying to do that, I get that:
>
> https://paste.pound-python.org/show/HZsFNvYPt8EAH2ugETpx/
>
> and then it hangs (for both system hg and local hg)

Can you add --debug to get a bit more data?
Maciej Fijalkowski - Sept. 17, 2016, 11:44 a.m.
https://paste.pound-python.org/show/JB0RSPFycVzWS8X6C7hc/

On Sat, Sep 17, 2016 at 12:25 PM, Pierre-Yves David
<pierre-yves.david@ens-lyon.org> wrote:
>
>
> On 09/17/2016 11:17 AM, Maciej Fijalkowski wrote:
>>
>> Hi
>>
>> When I'm trying to do that, I get that:
>>
>> https://paste.pound-python.org/show/HZsFNvYPt8EAH2ugETpx/
>>
>> and then it hangs (for both system hg and local hg)
>
>
> Can you add --debug to get a bit more data?
>
> --
> Pierre-Yves David
Maciej Fijalkowski - Sept. 17, 2016, 11:45 a.m.
ok, I think at the end it is that:

received listkey for "obsolete": 16746270 bytes

and it takes a while, but there is no progress nothing which seems
like it's hanging

On Sat, Sep 17, 2016 at 1:44 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
> https://paste.pound-python.org/show/JB0RSPFycVzWS8X6C7hc/
>
> On Sat, Sep 17, 2016 at 12:25 PM, Pierre-Yves David
> <pierre-yves.david@ens-lyon.org> wrote:
>>
>>
>> On 09/17/2016 11:17 AM, Maciej Fijalkowski wrote:
>>>
>>> Hi
>>>
>>> When I'm trying to do that, I get that:
>>>
>>> https://paste.pound-python.org/show/HZsFNvYPt8EAH2ugETpx/
>>>
>>> and then it hangs (for both system hg and local hg)
>>
>>
>> Can you add --debug to get a bit more data?
>>
>> --
>> Pierre-Yves David
Maciej Fijalkowski - Sept. 17, 2016, 11:47 a.m.
managed to pull that. How do I incorporate my fixes for the problems now?

On Sat, Sep 17, 2016 at 1:45 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
> ok, I think at the end it is that:
>
> received listkey for "obsolete": 16746270 bytes
>
> and it takes a while, but there is no progress nothing which seems
> like it's hanging
>
> On Sat, Sep 17, 2016 at 1:44 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
>> https://paste.pound-python.org/show/JB0RSPFycVzWS8X6C7hc/
>>
>> On Sat, Sep 17, 2016 at 12:25 PM, Pierre-Yves David
>> <pierre-yves.david@ens-lyon.org> wrote:
>>>
>>>
>>> On 09/17/2016 11:17 AM, Maciej Fijalkowski wrote:
>>>>
>>>> Hi
>>>>
>>>> When I'm trying to do that, I get that:
>>>>
>>>> https://paste.pound-python.org/show/HZsFNvYPt8EAH2ugETpx/
>>>>
>>>> and then it hangs (for both system hg and local hg)
>>>
>>>
>>> Can you add --debug to get a bit more data?
>>>
>>> --
>>> Pierre-Yves David
Maciej Fijalkowski - Sept. 17, 2016, 11:52 a.m.
I have fixed the problem with the test that you listed. I'm still
completely unable to finish the test run - with -j it randomly hangs
and does nothing with no extra output

On Sat, Sep 17, 2016 at 1:47 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
> managed to pull that. How do I incorporate my fixes for the problems now?
>
> On Sat, Sep 17, 2016 at 1:45 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
>> ok, I think at the end it is that:
>>
>> received listkey for "obsolete": 16746270 bytes
>>
>> and it takes a while, but there is no progress nothing which seems
>> like it's hanging
>>
>> On Sat, Sep 17, 2016 at 1:44 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
>>> https://paste.pound-python.org/show/JB0RSPFycVzWS8X6C7hc/
>>>
>>> On Sat, Sep 17, 2016 at 12:25 PM, Pierre-Yves David
>>> <pierre-yves.david@ens-lyon.org> wrote:
>>>>
>>>>
>>>> On 09/17/2016 11:17 AM, Maciej Fijalkowski wrote:
>>>>>
>>>>> Hi
>>>>>
>>>>> When I'm trying to do that, I get that:
>>>>>
>>>>> https://paste.pound-python.org/show/HZsFNvYPt8EAH2ugETpx/
>>>>>
>>>>> and then it hangs (for both system hg and local hg)
>>>>
>>>>
>>>> Can you add --debug to get a bit more data?
>>>>
>>>> --
>>>> Pierre-Yves David
Pierre-Yves David - Sept. 28, 2016, 12:09 p.m.
On 09/17/2016 01:45 PM, Maciej Fijalkowski wrote:
> ok, I think at the end it is that:
>
> received listkey for "obsolete": 16746270 bytes
>
> and it takes a while, but there is no progress nothing which seems
> like it's hanging

This is strange, This means you are using the old (Very inefficient) way 
to pull obsolescence marker. What version of Mercurial and evolve are 
you using?

Cheer,
Maciej Fijalkowski - Sept. 28, 2016, 2:47 p.m.
The one that comes with os x or homebrew I'll update to a newer one I
promise

On 28 Sep 2016 2:09 PM, "Pierre-Yves David" <pierre-yves.david@ens-lyon.org>
wrote:

>
>
> On 09/17/2016 01:45 PM, Maciej Fijalkowski wrote:
>
>> ok, I think at the end it is that:
>>
>> received listkey for "obsolete": 16746270 bytes
>>
>> and it takes a while, but there is no progress nothing which seems
>> like it's hanging
>>
>
> This is strange, This means you are using the old (Very inefficient) way
> to pull obsolescence marker. What version of Mercurial and evolve are you
> using?
>
> Cheer,
>
> --
> Pierre-Yves David
>

Patch

diff --git a/mercurial/manifest.py b/mercurial/manifest.py
--- a/mercurial/manifest.py
+++ b/mercurial/manifest.py
@@ -104,69 +104,298 @@ 
     _checkforbidden(files)
     return ''.join(lines)
 
-class _lazymanifest(dict):
-    """This is the pure implementation of lazymanifest.
-
-    It has not been optimized *at all* and is not lazy.
-    """
-
-    def __init__(self, data):
-        dict.__init__(self)
-        for f, n, fl in _parse(data):
-            self[f] = n, fl
-
-    def __setitem__(self, k, v):
-        node, flag = v
-        assert node is not None
-        if len(node) > 21:
-            node = node[:21] # match c implementation behavior
-        dict.__setitem__(self, k, (node, flag))
+class LazyManifestIter(object):
+    def __init__(self, lm):
+        self.pos = 0
+        self.lm = lm
 
     def __iter__(self):
-        return iter(sorted(dict.keys(self)))
+        return self
 
-    def iterkeys(self):
-        return iter(sorted(dict.keys(self)))
+    def next(self):
+        try:
+            data, pos = self.lm._get(self.pos)
+        except IndexError:
+            raise StopIteration
+        if pos == -1:
+            self.pos += 1
+            return data[0]
+        self.pos += 1
+        zeropos = data.find('\x00', pos)
+        return data[pos:zeropos]
 
-    def iterentries(self):
-        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
+class LazyManifestIterEntries(object):
+    def __init__(self, lm):
+        self.lm = lm
+        self.pos = 0
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        try:
+            data, pos = self.lm._get(self.pos)
+        except IndexError:
+            raise StopIteration
+        if pos == -1:
+            self.pos += 1
+            return data
+        zeropos = data.find('\x00', pos)
+        hashval = unhexlify(data, self.lm.extrainfo[self.pos],
+                            zeropos + 1, 40)
+        flags = self.lm._getflags(data, self.pos, zeropos)
+        self.pos += 1
+        return (data[pos:zeropos], hashval, flags)
+
+def unhexlify(data, extra, pos, length):
+    s = data[pos:pos + length].decode('hex')
+    if extra:
+        s += chr(extra & 0xff)
+    return s
+
+def _cmp(a, b):
+    return (a > b) - (a < b)
+
+class _lazymanifest(object):
+    def __init__(self, data, positions=None, extrainfo=None, extradata=None):
+        if positions is None:
+            self.positions = self.findlines(data)
+            self.extrainfo = [0] * len(self.positions)
+            self.data = data
+            self.extradata = []
+        else:
+            self.positions = positions[:]
+            self.extrainfo = extrainfo[:]
+            self.extradata = extradata[:]
+            self.data = data
+
+    def findlines(self, data):
+        if not data:
+            return []
+        pos = data.find("\n")
+        positions = [0]
+        while pos < len(data) - 1 and pos != -1:
+            positions.append(pos + 1)
+            pos = data.find("\n", pos + 1)
+        return positions
+
+    def _get(self, index):
+        # get the position encoded in pos:
+        #   positive number is an index in 'data'
+        #   negative number is in extrapieces
+        pos = self.positions[index]
+        if pos >= 0:
+            return self.data, pos
+        return self.extradata[-pos - 1], -1
+
+    def _getkey(self, pos):
+        if pos >= 0:
+            return self.data[pos:self.data.find('\x00', pos + 1)]
+        return self.extradata[-pos - 1][0]
+
+    def bsearch(self, key):
+        first = 0
+        last = len(self.positions) - 1
+
+        while first <= last:
+            midpoint = (first + last)//2
+            nextpos = self.positions[midpoint]
+            candidate = self._getkey(nextpos)
+            r = _cmp(key, candidate)
+            if r == 0:
+                return midpoint
+            else:
+                if r < 0:
+                    last = midpoint - 1
+                else:
+                    first = midpoint + 1
+        return -1
+
+    def bsearch2(self, key):
+        # same as the above, but will always return the position
+        # done for performance reasons
+        first = 0
+        last = len(self.positions) - 1
+
+        while first <= last:
+            midpoint = (first + last)//2
+            nextpos = self.positions[midpoint]
+            candidate = self._getkey(nextpos)
+            r = _cmp(key, candidate)
+            if r == 0:
+                return (midpoint, True)
+            else:
+                if r < 0:
+                    last = midpoint - 1
+                else:
+                    first = midpoint + 1
+        return (first, False)
+
+    def __contains__(self, key):
+        return self.bsearch(key) != -1
+
+    def _getflags(self, data, needle, pos):
+        start = pos + 41
+        if needle < 0 or needle == len(self.positions) - 1:
+            end = len(data) - 1
+        else:
+            end = data.find("\n", start)
+        if start == end:
+            return ''
+        return self.data[start:end]
+
+    def __getitem__(self, key):
+        if not isinstance(key, str):
+            raise TypeError("setitem: manifest keys must be a string.")
+        needle = self.bsearch(key)
+        if needle == -1:
+            raise KeyError
+        data, pos = self._get(needle)
+        if pos == -1:
+            return (data[1], data[2])
+        zeropos = data.find('\x00', pos)
+        assert 0 <= needle <= len(self.positions)
+        assert len(self.extrainfo) == len(self.positions)
+        hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
+        flags = self._getflags(data, needle, zeropos)
+        return (hashval, flags)
+
+    def __delitem__(self, key):
+        needle, found = self.bsearch2(key)
+        if not found:
+            raise KeyError
+        cur = self.positions[needle]
+        self.positions = self.positions[:needle] + self.positions[needle + 1:]
+        self.extrainfo = self.extrainfo[:needle] + self.extrainfo[needle + 1:]
+        if cur >= 0:
+            self.data = self.data[:cur] + '\x00' + self.data[cur + 1:]
+
+    def __setitem__(self, key, value):
+        if not isinstance(key, str):
+            raise TypeError("setitem: manifest keys must be a string.")
+        if not isinstance(value, tuple) or len(value) != 2:
+            raise TypeError("Manifest values must be a tuple of (node, flags).")
+        hashval = value[0]
+        if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
+            raise TypeError("node must be a 20-byte string")
+        flags = value[1]
+        if not isinstance(flags, str) or len(flags) > 1:
+            raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
+        needle, found = self.bsearch2(key)
+        if found:
+            # put the item
+            pos = self.positions[needle]
+            if pos < 0:
+                self.extradata[-pos - 1] = (key, value[0], value[1])
+            else:
+                # just don't bother
+                self.extradata.append((key, value[0], value[1]))
+                self.positions[needle] = -len(self.extradata)
+        else:
+            # not found, put it in with extra positions
+            self.extradata.append((key, value[0], value[1]))
+            self.positions = (self.positions[:needle] + [-len(self.extradata)]
+                              + self.positions[needle:])
+            self.extrainfo = (self.extrainfo[:needle] + [0] +
+                              self.extrainfo[needle:])
 
     def copy(self):
-        c = _lazymanifest('')
-        c.update(self)
-        return c
+        # XXX call _compact like in C?
+        return _lazymanifest(self.data, self.positions, self.extrainfo,
+            self.extradata)
+
+    def _compact(self):
+        # hopefully not called TOO often
+        def findnextposition(positions, start, lgt):
+            i = start
+            while i < len(positions):
+                if positions[i] >= 0:
+                    return positions[i]
+                i += 1
+            return lgt
+
+        if len(self.extradata) == 0:
+            return
+        l = []
+        last_cut = 0
+        i = 0
+        offset = 0
+        self.extrainfo = [0] * len(self.positions)
+        while i < len(self.positions):
+            if self.positions[i] >= 0:
+                cur = self.positions[i]
+                last_cut = cur
+                while True:
+                    self.positions[i] = offset
+                    i += 1
+                    if i == len(self.positions) or self.positions[i] < 0:
+                        break
+                    offset += self.positions[i] - cur
+                    cur = self.positions[i]
+                end_cut = findnextposition(self.positions, i, len(self.data))
+                offset += end_cut - cur
+                l.append(self.data[last_cut:end_cut])
+            else:
+                while i < len(self.positions) and self.positions[i] < 0:
+                    cur = self.positions[i]
+                    t = self.extradata[-cur - 1]
+                    l.append(self._pack(t))
+                    self.positions[i] = offset
+                    if len(t[1]) > 20:
+                        self.extrainfo[i] = ord(t[1][21])
+                    offset += len(l[-1])
+                    i += 1
+        self.data = ''.join(l)
+        self.extradata = []
+
+    def _pack(self, d):
+        return d[0] + '\x00' + d[1][:20].encode('hex') + d[2] + '\n'
+
+    def text(self):
+        self._compact()
+        return self.data
 
     def diff(self, m2, clean=False):
         '''Finds changes between the current manifest and m2.'''
+        # XXX think whether efficiency matters here
         diff = {}
 
-        for fn, e1 in self.iteritems():
+        for fn, e1, flags in self.iterentries():
             if fn not in m2:
-                diff[fn] = e1, (None, '')
+                diff[fn] = (e1, flags), (None, '')
             else:
                 e2 = m2[fn]
-                if e1 != e2:
-                    diff[fn] = e1, e2
+                if (e1, flags) != e2:
+                    diff[fn] = (e1, flags), e2
                 elif clean:
                     diff[fn] = None
 
-        for fn, e2 in m2.iteritems():
+        for fn, e2, flags in m2.iterentries():
             if fn not in self:
-                diff[fn] = (None, ''), e2
+                diff[fn] = (None, ''), (e2, flags)
 
         return diff
 
+    def iterentries(self):
+        return LazyManifestIterEntries(self)
+
+    def iterkeys(self):
+        return LazyManifestIter(self)
+
+    def __iter__(self):
+        return LazyManifestIter(self)
+
+    def __len__(self):
+        return len(self.positions)
+
     def filtercopy(self, filterfn):
+        # XXX can be optimized
         c = _lazymanifest('')
         for f, n, fl in self.iterentries():
             if filterfn(f):
                 c[f] = n, fl
         return c
 
-    def text(self):
-        """Get the full data of this manifest as a bytestring."""
-        return _textv1(self.iterentries())
-
 try:
     _lazymanifest = parsers.lazymanifest
 except AttributeError:
diff --git a/mercurial/pure/bdiff.py b/mercurial/pure/bdiff.py
--- a/mercurial/pure/bdiff.py
+++ b/mercurial/pure/bdiff.py
@@ -111,8 +111,8 @@ 
         def blocks(sa, sb):
             a = ffi.new("struct bdiff_line**")
             b = ffi.new("struct bdiff_line**")
-            ac = ffi.new("char[]", sa)
-            bc = ffi.new("char[]", sb)
+            ac = ffi.new("char[]", str(sa))
+            bc = ffi.new("char[]", str(sb))
             l = ffi.new("struct bdiff_hunk*")
             try:
                 an = lib.bdiff_splitlines(ac, len(sa), a)
@@ -138,8 +138,8 @@ 
         def bdiff(sa, sb):
             a = ffi.new("struct bdiff_line**")
             b = ffi.new("struct bdiff_line**")
-            ac = ffi.new("char[]", sa)
-            bc = ffi.new("char[]", sb)
+            ac = ffi.new("char[]", str(sa))
+            bc = ffi.new("char[]", str(sb))
             l = ffi.new("struct bdiff_hunk*")
             try:
                 an = lib.bdiff_splitlines(ac, len(sa), a)