Patchwork [v2] lazymanifest: write a more efficient, pypy friendly version of lazymanifest

login
register
mail settings
Submitter Maciej Fijalkowski
Date Sept. 17, 2016, 6:22 p.m.
Message ID <7551f1e60b2155462d89.1474136540@brick.arcode.com>
Download mbox | patch
Permalink /patch/16664/
State Changes Requested
Headers show

Comments

Maciej Fijalkowski - Sept. 17, 2016, 6:22 p.m.
# HG changeset patch
# User Maciej Fijalkowski <fijall@gmail.com>
# Date 1473680234 -7200
#      Mon Sep 12 13:37:14 2016 +0200
# Node ID 7551f1e60b2155462d89a9571eec065e9f67debc
# Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
lazymanifest: write a more efficient, pypy friendly version of lazymanifest
Maciej Fijalkowski - Sept. 17, 2016, 6:23 p.m.
This should fix the issues presented.

There is one problem which is that the hash in test-rebase-detach
changes. The way I see it is just an RNG phase-shift, but it might be
a real bug. Some actual input will be appreciated.



On Sat, Sep 17, 2016 at 8:22 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
> # HG changeset patch
> # User Maciej Fijalkowski <fijall@gmail.com>
> # Date 1473680234 -7200
> #      Mon Sep 12 13:37:14 2016 +0200
> # Node ID 7551f1e60b2155462d89a9571eec065e9f67debc
> # Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
> lazymanifest: write a more efficient, pypy friendly version of lazymanifest
>
> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> --- a/mercurial/manifest.py
> +++ b/mercurial/manifest.py
> @@ -104,69 +104,297 @@
>      _checkforbidden(files)
>      return ''.join(lines)
>
> -class _lazymanifest(dict):
> -    """This is the pure implementation of lazymanifest.
> -
> -    It has not been optimized *at all* and is not lazy.
> -    """
> -
> -    def __init__(self, data):
> -        dict.__init__(self)
> -        for f, n, fl in _parse(data):
> -            self[f] = n, fl
> -
> -    def __setitem__(self, k, v):
> -        node, flag = v
> -        assert node is not None
> -        if len(node) > 21:
> -            node = node[:21] # match c implementation behavior
> -        dict.__setitem__(self, k, (node, flag))
> +class lazymanifestiter(object):
> +    def __init__(self, lm):
> +        self.pos = 0
> +        self.lm = lm
>
>      def __iter__(self):
> -        return iter(sorted(dict.keys(self)))
> +        return self
>
> -    def iterkeys(self):
> -        return iter(sorted(dict.keys(self)))
> +    def next(self):
> +        try:
> +            data, pos = self.lm._get(self.pos)
> +        except IndexError:
> +            raise StopIteration
> +        if pos == -1:
> +            self.pos += 1
> +            return data[0]
> +        self.pos += 1
> +        zeropos = data.find('\x00', pos)
> +        return data[pos:zeropos]
>
> -    def iterentries(self):
> -        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
> +class lazymanifestiterentries(object):
> +    def __init__(self, lm):
> +        self.lm = lm
> +        self.pos = 0
> +
> +    def __iter__(self):
> +        return self
> +
> +    def next(self):
> +        try:
> +            data, pos = self.lm._get(self.pos)
> +        except IndexError:
> +            raise StopIteration
> +        if pos == -1:
> +            self.pos += 1
> +            return data
> +        zeropos = data.find('\x00', pos)
> +        hashval = unhexlify(data, self.lm.extrainfo[self.pos],
> +                            zeropos + 1, 40)
> +        flags = self.lm._getflags(data, self.pos, zeropos)
> +        self.pos += 1
> +        return (data[pos:zeropos], hashval, flags)
> +
> +def unhexlify(data, extra, pos, length):
> +    s = data[pos:pos + length].decode('hex')
> +    if extra:
> +        s += chr(extra & 0xff)
> +    return s
> +
> +def _cmp(a, b):
> +    return (a > b) - (a < b)
> +
> +class _lazymanifest(object):
> +    def __init__(self, data, positions=None, extrainfo=None, extradata=None):
> +        if positions is None:
> +            self.positions = self.findlines(data)
> +            self.extrainfo = [0] * len(self.positions)
> +            self.data = data
> +            self.extradata = []
> +        else:
> +            self.positions = positions[:]
> +            self.extrainfo = extrainfo[:]
> +            self.extradata = extradata[:]
> +            self.data = data
> +
> +    def findlines(self, data):
> +        if not data:
> +            return []
> +        pos = data.find("\n")
> +        positions = [0]
> +        while pos < len(data) - 1 and pos != -1:
> +            positions.append(pos + 1)
> +            pos = data.find("\n", pos + 1)
> +        return positions
> +
> +    def _get(self, index):
> +        # get the position encoded in pos:
> +        #   positive number is an index in 'data'
> +        #   negative number is in extrapieces
> +        pos = self.positions[index]
> +        if pos >= 0:
> +            return self.data, pos
> +        return self.extradata[-pos - 1], -1
> +
> +    def _getkey(self, pos):
> +        if pos >= 0:
> +            return self.data[pos:self.data.find('\x00', pos + 1)]
> +        return self.extradata[-pos - 1][0]
> +
> +    def bsearch(self, key):
> +        first = 0
> +        last = len(self.positions) - 1
> +
> +        while first <= last:
> +            midpoint = (first + last)//2
> +            nextpos = self.positions[midpoint]
> +            candidate = self._getkey(nextpos)
> +            r = _cmp(key, candidate)
> +            if r == 0:
> +                return midpoint
> +            else:
> +                if r < 0:
> +                    last = midpoint - 1
> +                else:
> +                    first = midpoint + 1
> +        return -1
> +
> +    def bsearch2(self, key):
> +        # same as the above, but will always return the position
> +        # done for performance reasons
> +        first = 0
> +        last = len(self.positions) - 1
> +
> +        while first <= last:
> +            midpoint = (first + last)//2
> +            nextpos = self.positions[midpoint]
> +            candidate = self._getkey(nextpos)
> +            r = _cmp(key, candidate)
> +            if r == 0:
> +                return (midpoint, True)
> +            else:
> +                if r < 0:
> +                    last = midpoint - 1
> +                else:
> +                    first = midpoint + 1
> +        return (first, False)
> +
> +    def __contains__(self, key):
> +        return self.bsearch(key) != -1
> +
> +    def _getflags(self, data, needle, pos):
> +        start = pos + 41
> +        end = data.find("\n", start)
> +        if end == -1:
> +            end = len(data) - 1
> +        if start == end:
> +            return ''
> +        return self.data[start:end]
> +
> +    def __getitem__(self, key):
> +        if not isinstance(key, str):
> +            raise TypeError("getitem: manifest keys must be a string.")
> +        needle = self.bsearch(key)
> +        if needle == -1:
> +            raise KeyError
> +        data, pos = self._get(needle)
> +        if pos == -1:
> +            return (data[1], data[2])
> +        zeropos = data.find('\x00', pos)
> +        assert 0 <= needle <= len(self.positions)
> +        assert len(self.extrainfo) == len(self.positions)
> +        hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
> +        flags = self._getflags(data, needle, zeropos)
> +        return (hashval, flags)
> +
> +    def __delitem__(self, key):
> +        needle, found = self.bsearch2(key)
> +        if not found:
> +            raise KeyError
> +        cur = self.positions[needle]
> +        self.positions = self.positions[:needle] + self.positions[needle + 1:]
> +        self.extrainfo = self.extrainfo[:needle] + self.extrainfo[needle + 1:]
> +        if cur >= 0:
> +            self.data = self.data[:cur] + '\x00' + self.data[cur + 1:]
> +
> +    def __setitem__(self, key, value):
> +        if not isinstance(key, str):
> +            raise TypeError("setitem: manifest keys must be a string.")
> +        if not isinstance(value, tuple) or len(value) != 2:
> +            raise TypeError("Manifest values must be a tuple of (node, flags).")
> +        hashval = value[0]
> +        if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
> +            raise TypeError("node must be a 20-byte string")
> +        flags = value[1]
> +        if not isinstance(flags, str) or len(flags) > 1:
> +            raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
> +        needle, found = self.bsearch2(key)
> +        if found:
> +            # put the item
> +            pos = self.positions[needle]
> +            if pos < 0:
> +                self.extradata[-pos - 1] = (key, value[0], value[1])
> +            else:
> +                # just don't bother
> +                self.extradata.append((key, value[0], value[1]))
> +                self.positions[needle] = -len(self.extradata)
> +        else:
> +            # not found, put it in with extra positions
> +            self.extradata.append((key, value[0], value[1]))
> +            self.positions = (self.positions[:needle] + [-len(self.extradata)]
> +                              + self.positions[needle:])
> +            self.extrainfo = (self.extrainfo[:needle] + [0] +
> +                              self.extrainfo[needle:])
>
>      def copy(self):
> -        c = _lazymanifest('')
> -        c.update(self)
> -        return c
> +        # XXX call _compact like in C?
> +        return _lazymanifest(self.data, self.positions, self.extrainfo,
> +            self.extradata)
> +
> +    def _compact(self):
> +        # hopefully not called TOO often
> +        def findnextposition(positions, start, lgt):
> +            i = start
> +            while i < len(positions):
> +                if positions[i] >= 0:
> +                    return positions[i]
> +                i += 1
> +            return lgt
> +
> +        if len(self.extradata) == 0:
> +            return
> +        l = []
> +        last_cut = 0
> +        i = 0
> +        offset = 0
> +        self.extrainfo = [0] * len(self.positions)
> +        while i < len(self.positions):
> +            if self.positions[i] >= 0:
> +                cur = self.positions[i]
> +                last_cut = cur
> +                while True:
> +                    self.positions[i] = offset
> +                    i += 1
> +                    if i == len(self.positions) or self.positions[i] < 0:
> +                        break
> +                    offset += self.positions[i] - cur
> +                    cur = self.positions[i]
> +                end_cut = findnextposition(self.positions, i, len(self.data))
> +                offset += end_cut - cur
> +                l.append(self.data[last_cut:end_cut])
> +            else:
> +                while i < len(self.positions) and self.positions[i] < 0:
> +                    cur = self.positions[i]
> +                    t = self.extradata[-cur - 1]
> +                    l.append(self._pack(t))
> +                    self.positions[i] = offset
> +                    if len(t[1]) > 20:
> +                        self.extrainfo[i] = ord(t[1][21])
> +                    offset += len(l[-1])
> +                    i += 1
> +        self.data = ''.join(l)
> +        self.extradata = []
> +
> +    def _pack(self, d):
> +        return d[0] + '\x00' + d[1][:20].encode('hex') + d[2] + '\n'
> +
> +    def text(self):
> +        self._compact()
> +        return self.data
>
>      def diff(self, m2, clean=False):
>          '''Finds changes between the current manifest and m2.'''
> +        # XXX think whether efficiency matters here
>          diff = {}
>
> -        for fn, e1 in self.iteritems():
> +        for fn, e1, flags in self.iterentries():
>              if fn not in m2:
> -                diff[fn] = e1, (None, '')
> +                diff[fn] = (e1, flags), (None, '')
>              else:
>                  e2 = m2[fn]
> -                if e1 != e2:
> -                    diff[fn] = e1, e2
> +                if (e1, flags) != e2:
> +                    diff[fn] = (e1, flags), e2
>                  elif clean:
>                      diff[fn] = None
>
> -        for fn, e2 in m2.iteritems():
> +        for fn, e2, flags in m2.iterentries():
>              if fn not in self:
> -                diff[fn] = (None, ''), e2
> +                diff[fn] = (None, ''), (e2, flags)
>
>          return diff
>
> +    def iterentries(self):
> +        return lazymanifestiterentries(self)
> +
> +    def iterkeys(self):
> +        return lazymanifestiter(self)
> +
> +    def __iter__(self):
> +        return lazymanifestiter(self)
> +
> +    def __len__(self):
> +        return len(self.positions)
> +
>      def filtercopy(self, filterfn):
> +        # XXX should be optimized
>          c = _lazymanifest('')
>          for f, n, fl in self.iterentries():
>              if filterfn(f):
>                  c[f] = n, fl
>          return c
>
> -    def text(self):
> -        """Get the full data of this manifest as a bytestring."""
> -        return _textv1(self.iterentries())
> -
>  try:
>      _lazymanifest = parsers.lazymanifest
>  except AttributeError:
> diff --git a/mercurial/pure/bdiff.py b/mercurial/pure/bdiff.py
> --- a/mercurial/pure/bdiff.py
> +++ b/mercurial/pure/bdiff.py
> @@ -111,8 +111,8 @@
>          def blocks(sa, sb):
>              a = ffi.new("struct bdiff_line**")
>              b = ffi.new("struct bdiff_line**")
> -            ac = ffi.new("char[]", sa)
> -            bc = ffi.new("char[]", sb)
> +            ac = ffi.new("char[]", str(sa))
> +            bc = ffi.new("char[]", str(sb))
>              l = ffi.new("struct bdiff_hunk*")
>              try:
>                  an = lib.bdiff_splitlines(ac, len(sa), a)
> @@ -138,8 +138,8 @@
>          def bdiff(sa, sb):
>              a = ffi.new("struct bdiff_line**")
>              b = ffi.new("struct bdiff_line**")
> -            ac = ffi.new("char[]", sa)
> -            bc = ffi.new("char[]", sb)
> +            ac = ffi.new("char[]", str(sa))
> +            bc = ffi.new("char[]", str(sb))
>              l = ffi.new("struct bdiff_hunk*")
>              try:
>                  an = lib.bdiff_splitlines(ac, len(sa), a)
Augie Fackler - Sept. 19, 2016, 11:43 p.m.
> On Sep 17, 2016, at 14:23, Maciej Fijalkowski <fijall@gmail.com> wrote:
> 
> This should fix the issues presented.
> 
> There is one problem which is that the hash in test-rebase-detach
> changes. The way I see it is just an RNG phase-shift, but it might be
> a real bug. Some actual input will be appreciated.

I'll get a look at this in the morning America/New_York - my other miscellany occupied me for far too long today and I just don't have brainpower for this tonight. Sorry to keep you waiting.

AF

> 
> 
> 
> On Sat, Sep 17, 2016 at 8:22 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
>> # HG changeset patch
>> # User Maciej Fijalkowski <fijall@gmail.com>
>> # Date 1473680234 -7200
>> #      Mon Sep 12 13:37:14 2016 +0200
>> # Node ID 7551f1e60b2155462d89a9571eec065e9f67debc
>> # Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
>> lazymanifest: write a more efficient, pypy friendly version of lazymanifest
>> 
>> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
>> --- a/mercurial/manifest.py
>> +++ b/mercurial/manifest.py
>> @@ -104,69 +104,297 @@
>>     _checkforbidden(files)
>>     return ''.join(lines)
>> 
>> -class _lazymanifest(dict):
>> -    """This is the pure implementation of lazymanifest.
>> -
>> -    It has not been optimized *at all* and is not lazy.
>> -    """
>> -
>> -    def __init__(self, data):
>> -        dict.__init__(self)
>> -        for f, n, fl in _parse(data):
>> -            self[f] = n, fl
>> -
>> -    def __setitem__(self, k, v):
>> -        node, flag = v
>> -        assert node is not None
>> -        if len(node) > 21:
>> -            node = node[:21] # match c implementation behavior
>> -        dict.__setitem__(self, k, (node, flag))
>> +class lazymanifestiter(object):
>> +    def __init__(self, lm):
>> +        self.pos = 0
>> +        self.lm = lm
>> 
>>     def __iter__(self):
>> -        return iter(sorted(dict.keys(self)))
>> +        return self
>> 
>> -    def iterkeys(self):
>> -        return iter(sorted(dict.keys(self)))
>> +    def next(self):
>> +        try:
>> +            data, pos = self.lm._get(self.pos)
>> +        except IndexError:
>> +            raise StopIteration
>> +        if pos == -1:
>> +            self.pos += 1
>> +            return data[0]
>> +        self.pos += 1
>> +        zeropos = data.find('\x00', pos)
>> +        return data[pos:zeropos]
>> 
>> -    def iterentries(self):
>> -        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
>> +class lazymanifestiterentries(object):
>> +    def __init__(self, lm):
>> +        self.lm = lm
>> +        self.pos = 0
>> +
>> +    def __iter__(self):
>> +        return self
>> +
>> +    def next(self):
>> +        try:
>> +            data, pos = self.lm._get(self.pos)
>> +        except IndexError:
>> +            raise StopIteration
>> +        if pos == -1:
>> +            self.pos += 1
>> +            return data
>> +        zeropos = data.find('\x00', pos)
>> +        hashval = unhexlify(data, self.lm.extrainfo[self.pos],
>> +                            zeropos + 1, 40)
>> +        flags = self.lm._getflags(data, self.pos, zeropos)
>> +        self.pos += 1
>> +        return (data[pos:zeropos], hashval, flags)
>> +
>> +def unhexlify(data, extra, pos, length):
>> +    s = data[pos:pos + length].decode('hex')
>> +    if extra:
>> +        s += chr(extra & 0xff)
>> +    return s
>> +
>> +def _cmp(a, b):
>> +    return (a > b) - (a < b)
>> +
>> +class _lazymanifest(object):
>> +    def __init__(self, data, positions=None, extrainfo=None, extradata=None):
>> +        if positions is None:
>> +            self.positions = self.findlines(data)
>> +            self.extrainfo = [0] * len(self.positions)
>> +            self.data = data
>> +            self.extradata = []
>> +        else:
>> +            self.positions = positions[:]
>> +            self.extrainfo = extrainfo[:]
>> +            self.extradata = extradata[:]
>> +            self.data = data
>> +
>> +    def findlines(self, data):
>> +        if not data:
>> +            return []
>> +        pos = data.find("\n")
>> +        positions = [0]
>> +        while pos < len(data) - 1 and pos != -1:
>> +            positions.append(pos + 1)
>> +            pos = data.find("\n", pos + 1)
>> +        return positions
>> +
>> +    def _get(self, index):
>> +        # get the position encoded in pos:
>> +        #   positive number is an index in 'data'
>> +        #   negative number is in extrapieces
>> +        pos = self.positions[index]
>> +        if pos >= 0:
>> +            return self.data, pos
>> +        return self.extradata[-pos - 1], -1
>> +
>> +    def _getkey(self, pos):
>> +        if pos >= 0:
>> +            return self.data[pos:self.data.find('\x00', pos + 1)]
>> +        return self.extradata[-pos - 1][0]
>> +
>> +    def bsearch(self, key):
>> +        first = 0
>> +        last = len(self.positions) - 1
>> +
>> +        while first <= last:
>> +            midpoint = (first + last)//2
>> +            nextpos = self.positions[midpoint]
>> +            candidate = self._getkey(nextpos)
>> +            r = _cmp(key, candidate)
>> +            if r == 0:
>> +                return midpoint
>> +            else:
>> +                if r < 0:
>> +                    last = midpoint - 1
>> +                else:
>> +                    first = midpoint + 1
>> +        return -1
>> +
>> +    def bsearch2(self, key):
>> +        # same as the above, but will always return the position
>> +        # done for performance reasons
>> +        first = 0
>> +        last = len(self.positions) - 1
>> +
>> +        while first <= last:
>> +            midpoint = (first + last)//2
>> +            nextpos = self.positions[midpoint]
>> +            candidate = self._getkey(nextpos)
>> +            r = _cmp(key, candidate)
>> +            if r == 0:
>> +                return (midpoint, True)
>> +            else:
>> +                if r < 0:
>> +                    last = midpoint - 1
>> +                else:
>> +                    first = midpoint + 1
>> +        return (first, False)
>> +
>> +    def __contains__(self, key):
>> +        return self.bsearch(key) != -1
>> +
>> +    def _getflags(self, data, needle, pos):
>> +        start = pos + 41
>> +        end = data.find("\n", start)
>> +        if end == -1:
>> +            end = len(data) - 1
>> +        if start == end:
>> +            return ''
>> +        return self.data[start:end]
>> +
>> +    def __getitem__(self, key):
>> +        if not isinstance(key, str):
>> +            raise TypeError("getitem: manifest keys must be a string.")
>> +        needle = self.bsearch(key)
>> +        if needle == -1:
>> +            raise KeyError
>> +        data, pos = self._get(needle)
>> +        if pos == -1:
>> +            return (data[1], data[2])
>> +        zeropos = data.find('\x00', pos)
>> +        assert 0 <= needle <= len(self.positions)
>> +        assert len(self.extrainfo) == len(self.positions)
>> +        hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
>> +        flags = self._getflags(data, needle, zeropos)
>> +        return (hashval, flags)
>> +
>> +    def __delitem__(self, key):
>> +        needle, found = self.bsearch2(key)
>> +        if not found:
>> +            raise KeyError
>> +        cur = self.positions[needle]
>> +        self.positions = self.positions[:needle] + self.positions[needle + 1:]
>> +        self.extrainfo = self.extrainfo[:needle] + self.extrainfo[needle + 1:]
>> +        if cur >= 0:
>> +            self.data = self.data[:cur] + '\x00' + self.data[cur + 1:]
>> +
>> +    def __setitem__(self, key, value):
>> +        if not isinstance(key, str):
>> +            raise TypeError("setitem: manifest keys must be a string.")
>> +        if not isinstance(value, tuple) or len(value) != 2:
>> +            raise TypeError("Manifest values must be a tuple of (node, flags).")
>> +        hashval = value[0]
>> +        if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
>> +            raise TypeError("node must be a 20-byte string")
>> +        flags = value[1]
>> +        if not isinstance(flags, str) or len(flags) > 1:
>> +            raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
>> +        needle, found = self.bsearch2(key)
>> +        if found:
>> +            # put the item
>> +            pos = self.positions[needle]
>> +            if pos < 0:
>> +                self.extradata[-pos - 1] = (key, value[0], value[1])
>> +            else:
>> +                # just don't bother
>> +                self.extradata.append((key, value[0], value[1]))
>> +                self.positions[needle] = -len(self.extradata)
>> +        else:
>> +            # not found, put it in with extra positions
>> +            self.extradata.append((key, value[0], value[1]))
>> +            self.positions = (self.positions[:needle] + [-len(self.extradata)]
>> +                              + self.positions[needle:])
>> +            self.extrainfo = (self.extrainfo[:needle] + [0] +
>> +                              self.extrainfo[needle:])
>> 
>>     def copy(self):
>> -        c = _lazymanifest('')
>> -        c.update(self)
>> -        return c
>> +        # XXX call _compact like in C?
>> +        return _lazymanifest(self.data, self.positions, self.extrainfo,
>> +            self.extradata)
>> +
>> +    def _compact(self):
>> +        # hopefully not called TOO often
>> +        def findnextposition(positions, start, lgt):
>> +            i = start
>> +            while i < len(positions):
>> +                if positions[i] >= 0:
>> +                    return positions[i]
>> +                i += 1
>> +            return lgt
>> +
>> +        if len(self.extradata) == 0:
>> +            return
>> +        l = []
>> +        last_cut = 0
>> +        i = 0
>> +        offset = 0
>> +        self.extrainfo = [0] * len(self.positions)
>> +        while i < len(self.positions):
>> +            if self.positions[i] >= 0:
>> +                cur = self.positions[i]
>> +                last_cut = cur
>> +                while True:
>> +                    self.positions[i] = offset
>> +                    i += 1
>> +                    if i == len(self.positions) or self.positions[i] < 0:
>> +                        break
>> +                    offset += self.positions[i] - cur
>> +                    cur = self.positions[i]
>> +                end_cut = findnextposition(self.positions, i, len(self.data))
>> +                offset += end_cut - cur
>> +                l.append(self.data[last_cut:end_cut])
>> +            else:
>> +                while i < len(self.positions) and self.positions[i] < 0:
>> +                    cur = self.positions[i]
>> +                    t = self.extradata[-cur - 1]
>> +                    l.append(self._pack(t))
>> +                    self.positions[i] = offset
>> +                    if len(t[1]) > 20:
>> +                        self.extrainfo[i] = ord(t[1][21])
>> +                    offset += len(l[-1])
>> +                    i += 1
>> +        self.data = ''.join(l)
>> +        self.extradata = []
>> +
>> +    def _pack(self, d):
>> +        return d[0] + '\x00' + d[1][:20].encode('hex') + d[2] + '\n'
>> +
>> +    def text(self):
>> +        self._compact()
>> +        return self.data
>> 
>>     def diff(self, m2, clean=False):
>>         '''Finds changes between the current manifest and m2.'''
>> +        # XXX think whether efficiency matters here
>>         diff = {}
>> 
>> -        for fn, e1 in self.iteritems():
>> +        for fn, e1, flags in self.iterentries():
>>             if fn not in m2:
>> -                diff[fn] = e1, (None, '')
>> +                diff[fn] = (e1, flags), (None, '')
>>             else:
>>                 e2 = m2[fn]
>> -                if e1 != e2:
>> -                    diff[fn] = e1, e2
>> +                if (e1, flags) != e2:
>> +                    diff[fn] = (e1, flags), e2
>>                 elif clean:
>>                     diff[fn] = None
>> 
>> -        for fn, e2 in m2.iteritems():
>> +        for fn, e2, flags in m2.iterentries():
>>             if fn not in self:
>> -                diff[fn] = (None, ''), e2
>> +                diff[fn] = (None, ''), (e2, flags)
>> 
>>         return diff
>> 
>> +    def iterentries(self):
>> +        return lazymanifestiterentries(self)
>> +
>> +    def iterkeys(self):
>> +        return lazymanifestiter(self)
>> +
>> +    def __iter__(self):
>> +        return lazymanifestiter(self)
>> +
>> +    def __len__(self):
>> +        return len(self.positions)
>> +
>>     def filtercopy(self, filterfn):
>> +        # XXX should be optimized
>>         c = _lazymanifest('')
>>         for f, n, fl in self.iterentries():
>>             if filterfn(f):
>>                 c[f] = n, fl
>>         return c
>> 
>> -    def text(self):
>> -        """Get the full data of this manifest as a bytestring."""
>> -        return _textv1(self.iterentries())
>> -
>> try:
>>     _lazymanifest = parsers.lazymanifest
>> except AttributeError:
>> diff --git a/mercurial/pure/bdiff.py b/mercurial/pure/bdiff.py
>> --- a/mercurial/pure/bdiff.py
>> +++ b/mercurial/pure/bdiff.py
>> @@ -111,8 +111,8 @@
>>         def blocks(sa, sb):
>>             a = ffi.new("struct bdiff_line**")
>>             b = ffi.new("struct bdiff_line**")
>> -            ac = ffi.new("char[]", sa)
>> -            bc = ffi.new("char[]", sb)
>> +            ac = ffi.new("char[]", str(sa))
>> +            bc = ffi.new("char[]", str(sb))
>>             l = ffi.new("struct bdiff_hunk*")
>>             try:
>>                 an = lib.bdiff_splitlines(ac, len(sa), a)
>> @@ -138,8 +138,8 @@
>>         def bdiff(sa, sb):
>>             a = ffi.new("struct bdiff_line**")
>>             b = ffi.new("struct bdiff_line**")
>> -            ac = ffi.new("char[]", sa)
>> -            bc = ffi.new("char[]", sb)
>> +            ac = ffi.new("char[]", str(sa))
>> +            bc = ffi.new("char[]", str(sb))
>>             l = ffi.new("struct bdiff_hunk*")
>>             try:
>>                 an = lib.bdiff_splitlines(ac, len(sa), a)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Augie Fackler - Sept. 20, 2016, 2:39 p.m.
On Sat, Sep 17, 2016 at 08:23:36PM +0200, Maciej Fijalkowski wrote:
> This should fix the issues presented.
>
> There is one problem which is that the hash in test-rebase-detach
> changes. The way I see it is just an RNG phase-shift, but it might be
> a real bug. Some actual input will be appreciated.

It's not possibly RNG related - there's no RNG that goes into the sha1
of a node. I've added --debug to the failure in test-rebase-detach and
compared them between cpython and pypy:

cpython:
@@ -366,11 +366,24 @@


   $ hg log --rev tip --debug
-  changeset:   8:9472f4b1d736
+  changeset:   8:9472f4b1d7366902d0098aa1401e3f170000cc56
   tag:         tip
+  phase:       draft
+  parent:      7:02de42196ebee42ef284b6780a87cdc96e8eaab6
+  parent:      -1:0000000000000000000000000000000000000000
+  manifest:    8:8dbdcf066a97239fde2d0dba19fcb1e699fa66d2
   user:        test
   date:        Thu Jan 01 00:00:00 1970 +0000
-  summary:     Collapsed revision
+  files:       F
+  files+:      E
+  extra:       branch=default
+  extra:       rebase_source=9427d4d5af81c393e6b630129712cde8cdad5605
+  description:
+  Collapsed revision
+  * I
+  * Merge
+  * J
+


pypy:
@@ -366,11 +366,24 @@


   $ hg log --rev tip --debug
-  changeset:   8:9472f4b1d736
+  changeset:   8:122ceff3b303b13e5ca323742a8ebe589b9f8066
   tag:         tip
+  phase:       draft
+  parent:      7:02de42196ebee42ef284b6780a87cdc96e8eaab6
+  parent:      -1:0000000000000000000000000000000000000000
+  manifest:    8:09f1dc369d2667dcaeb9cf828b0fcb76bda29f33
   user:        test
   date:        Thu Jan 01 00:00:00 1970 +0000
-  summary:     Collapsed revision
+  files:       F
+  files+:      E
+  extra:       branch=default
+  extra:       rebase_source=9427d4d5af81c393e6b630129712cde8cdad5605
+  description:
+  Collapsed revision
+  * I
+  * Merge
+  * J
+

Which means the manifest is hashing out differently. That suggests
there's a subtle bug in how the manifest is hashing out in the pypy
version. Adding a `hg debugdata -m 8` (which prints raw manifest
data), and this is what I get on cpython:

   $ hg debugdata -m 8
+  A\x0045f17b21388f07b8939b22052e5f3776e5246388 (esc)
+  E\x00c3b9643004a8d1c7f39b0025cefab53dc8f7dc12 (esc)
+  F\x00293a00dc38ac20f042c67ba8534eedc1b6a7ae15 (esc)
+  H\x008500189e74a9e0475e822093bc7db0d631aeb0b4 (esc)

on pypy:
   $ hg debugdata -m 8
+  A\x0045f17b21388f07b8939b22052e5f3776e5246388 (esc)
+  F\x0022bfcfd62a21a3287edbd4d656218d0f525ed76a (esc)
+  E\x00c3b9643004a8d1c7f39b0025cefab53dc8f7dc12 (esc)
+  F\x00293a00dc38ac20f042c67ba8534eedc1b6a7ae15 (esc)
+  H\x008500189e74a9e0475e822093bc7db0d631aeb0b4 (esc)


So you've got a subtle bug here where the compaction of the
lazymanifest isn't working right - probably around merging manifests
somehow. Does this give you enough to get started?

(I'd probably start debugging this by littering _lazymanifest._compact
and _lazymanifest.text with asserts until I managed to trip over the
un-sorted lines, and then backtrack from there to figure out how they
managed to survive.)

>
>
>
> On Sat, Sep 17, 2016 at 8:22 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
> > # HG changeset patch
> > # User Maciej Fijalkowski <fijall@gmail.com>
> > # Date 1473680234 -7200
> > #      Mon Sep 12 13:37:14 2016 +0200
> > # Node ID 7551f1e60b2155462d89a9571eec065e9f67debc
> > # Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
> > lazymanifest: write a more efficient, pypy friendly version of lazymanifest
> >
> > diff --git a/mercurial/manifest.py b/mercurial/manifest.py
> > --- a/mercurial/manifest.py
> > +++ b/mercurial/manifest.py
> > @@ -104,69 +104,297 @@
> >      _checkforbidden(files)
> >      return ''.join(lines)
> >
> > -class _lazymanifest(dict):
> > -    """This is the pure implementation of lazymanifest.
> > -
> > -    It has not been optimized *at all* and is not lazy.
> > -    """
> > -
> > -    def __init__(self, data):
> > -        dict.__init__(self)
> > -        for f, n, fl in _parse(data):
> > -            self[f] = n, fl
> > -
> > -    def __setitem__(self, k, v):
> > -        node, flag = v
> > -        assert node is not None
> > -        if len(node) > 21:
> > -            node = node[:21] # match c implementation behavior
> > -        dict.__setitem__(self, k, (node, flag))
> > +class lazymanifestiter(object):
> > +    def __init__(self, lm):
> > +        self.pos = 0
> > +        self.lm = lm
> >
> >      def __iter__(self):
> > -        return iter(sorted(dict.keys(self)))
> > +        return self
> >
> > -    def iterkeys(self):
> > -        return iter(sorted(dict.keys(self)))
> > +    def next(self):
> > +        try:
> > +            data, pos = self.lm._get(self.pos)
> > +        except IndexError:
> > +            raise StopIteration
> > +        if pos == -1:
> > +            self.pos += 1
> > +            return data[0]
> > +        self.pos += 1
> > +        zeropos = data.find('\x00', pos)
> > +        return data[pos:zeropos]
> >
> > -    def iterentries(self):
> > -        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
> > +class lazymanifestiterentries(object):
> > +    def __init__(self, lm):
> > +        self.lm = lm
> > +        self.pos = 0
> > +
> > +    def __iter__(self):
> > +        return self
> > +
> > +    def next(self):
> > +        try:
> > +            data, pos = self.lm._get(self.pos)
> > +        except IndexError:
> > +            raise StopIteration
> > +        if pos == -1:
> > +            self.pos += 1
> > +            return data
> > +        zeropos = data.find('\x00', pos)
> > +        hashval = unhexlify(data, self.lm.extrainfo[self.pos],
> > +                            zeropos + 1, 40)
> > +        flags = self.lm._getflags(data, self.pos, zeropos)
> > +        self.pos += 1
> > +        return (data[pos:zeropos], hashval, flags)
> > +
> > +def unhexlify(data, extra, pos, length):
> > +    s = data[pos:pos + length].decode('hex')
> > +    if extra:
> > +        s += chr(extra & 0xff)
> > +    return s
> > +
> > +def _cmp(a, b):
> > +    return (a > b) - (a < b)
> > +
> > +class _lazymanifest(object):
> > +    def __init__(self, data, positions=None, extrainfo=None, extradata=None):
> > +        if positions is None:
> > +            self.positions = self.findlines(data)
> > +            self.extrainfo = [0] * len(self.positions)
> > +            self.data = data
> > +            self.extradata = []
> > +        else:
> > +            self.positions = positions[:]
> > +            self.extrainfo = extrainfo[:]
> > +            self.extradata = extradata[:]
> > +            self.data = data
> > +
> > +    def findlines(self, data):
> > +        if not data:
> > +            return []
> > +        pos = data.find("\n")
> > +        positions = [0]
> > +        while pos < len(data) - 1 and pos != -1:
> > +            positions.append(pos + 1)
> > +            pos = data.find("\n", pos + 1)
> > +        return positions
> > +
> > +    def _get(self, index):
> > +        # get the position encoded in pos:
> > +        #   positive number is an index in 'data'
> > +        #   negative number is in extrapieces
> > +        pos = self.positions[index]
> > +        if pos >= 0:
> > +            return self.data, pos
> > +        return self.extradata[-pos - 1], -1
> > +
> > +    def _getkey(self, pos):
> > +        if pos >= 0:
> > +            return self.data[pos:self.data.find('\x00', pos + 1)]
> > +        return self.extradata[-pos - 1][0]
> > +
> > +    def bsearch(self, key):
> > +        first = 0
> > +        last = len(self.positions) - 1
> > +
> > +        while first <= last:
> > +            midpoint = (first + last)//2
> > +            nextpos = self.positions[midpoint]
> > +            candidate = self._getkey(nextpos)
> > +            r = _cmp(key, candidate)
> > +            if r == 0:
> > +                return midpoint
> > +            else:
> > +                if r < 0:
> > +                    last = midpoint - 1
> > +                else:
> > +                    first = midpoint + 1
> > +        return -1
> > +
> > +    def bsearch2(self, key):
> > +        # same as the above, but will always return the position
> > +        # done for performance reasons
> > +        first = 0
> > +        last = len(self.positions) - 1
> > +
> > +        while first <= last:
> > +            midpoint = (first + last)//2
> > +            nextpos = self.positions[midpoint]
> > +            candidate = self._getkey(nextpos)
> > +            r = _cmp(key, candidate)
> > +            if r == 0:
> > +                return (midpoint, True)
> > +            else:
> > +                if r < 0:
> > +                    last = midpoint - 1
> > +                else:
> > +                    first = midpoint + 1
> > +        return (first, False)
> > +
> > +    def __contains__(self, key):
> > +        return self.bsearch(key) != -1
> > +
> > +    def _getflags(self, data, needle, pos):
> > +        start = pos + 41
> > +        end = data.find("\n", start)
> > +        if end == -1:
> > +            end = len(data) - 1
> > +        if start == end:
> > +            return ''
> > +        return self.data[start:end]
> > +
> > +    def __getitem__(self, key):
> > +        if not isinstance(key, str):
> > +            raise TypeError("getitem: manifest keys must be a string.")
> > +        needle = self.bsearch(key)
> > +        if needle == -1:
> > +            raise KeyError
> > +        data, pos = self._get(needle)
> > +        if pos == -1:
> > +            return (data[1], data[2])
> > +        zeropos = data.find('\x00', pos)
> > +        assert 0 <= needle <= len(self.positions)
> > +        assert len(self.extrainfo) == len(self.positions)
> > +        hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
> > +        flags = self._getflags(data, needle, zeropos)
> > +        return (hashval, flags)
> > +
> > +    def __delitem__(self, key):
> > +        needle, found = self.bsearch2(key)
> > +        if not found:
> > +            raise KeyError
> > +        cur = self.positions[needle]
> > +        self.positions = self.positions[:needle] + self.positions[needle + 1:]
> > +        self.extrainfo = self.extrainfo[:needle] + self.extrainfo[needle + 1:]
> > +        if cur >= 0:
> > +            self.data = self.data[:cur] + '\x00' + self.data[cur + 1:]
> > +
> > +    def __setitem__(self, key, value):
> > +        if not isinstance(key, str):
> > +            raise TypeError("setitem: manifest keys must be a string.")
> > +        if not isinstance(value, tuple) or len(value) != 2:
> > +            raise TypeError("Manifest values must be a tuple of (node, flags).")
> > +        hashval = value[0]
> > +        if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
> > +            raise TypeError("node must be a 20-byte string")
> > +        flags = value[1]
> > +        if not isinstance(flags, str) or len(flags) > 1:
> > +            raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
> > +        needle, found = self.bsearch2(key)
> > +        if found:
> > +            # put the item
> > +            pos = self.positions[needle]
> > +            if pos < 0:
> > +                self.extradata[-pos - 1] = (key, value[0], value[1])
> > +            else:
> > +                # just don't bother
> > +                self.extradata.append((key, value[0], value[1]))
> > +                self.positions[needle] = -len(self.extradata)
> > +        else:
> > +            # not found, put it in with extra positions
> > +            self.extradata.append((key, value[0], value[1]))
> > +            self.positions = (self.positions[:needle] + [-len(self.extradata)]
> > +                              + self.positions[needle:])
> > +            self.extrainfo = (self.extrainfo[:needle] + [0] +
> > +                              self.extrainfo[needle:])
> >
> >      def copy(self):
> > -        c = _lazymanifest('')
> > -        c.update(self)
> > -        return c
> > +        # XXX call _compact like in C?
> > +        return _lazymanifest(self.data, self.positions, self.extrainfo,
> > +            self.extradata)
> > +
> > +    def _compact(self):
> > +        # hopefully not called TOO often
> > +        def findnextposition(positions, start, lgt):
> > +            i = start
> > +            while i < len(positions):
> > +                if positions[i] >= 0:
> > +                    return positions[i]
> > +                i += 1
> > +            return lgt
> > +
> > +        if len(self.extradata) == 0:
> > +            return
> > +        l = []
> > +        last_cut = 0
> > +        i = 0
> > +        offset = 0
> > +        self.extrainfo = [0] * len(self.positions)
> > +        while i < len(self.positions):
> > +            if self.positions[i] >= 0:
> > +                cur = self.positions[i]
> > +                last_cut = cur
> > +                while True:
> > +                    self.positions[i] = offset
> > +                    i += 1
> > +                    if i == len(self.positions) or self.positions[i] < 0:
> > +                        break
> > +                    offset += self.positions[i] - cur
> > +                    cur = self.positions[i]
> > +                end_cut = findnextposition(self.positions, i, len(self.data))
> > +                offset += end_cut - cur
> > +                l.append(self.data[last_cut:end_cut])
> > +            else:
> > +                while i < len(self.positions) and self.positions[i] < 0:
> > +                    cur = self.positions[i]
> > +                    t = self.extradata[-cur - 1]
> > +                    l.append(self._pack(t))
> > +                    self.positions[i] = offset
> > +                    if len(t[1]) > 20:
> > +                        self.extrainfo[i] = ord(t[1][21])
> > +                    offset += len(l[-1])
> > +                    i += 1
> > +        self.data = ''.join(l)
> > +        self.extradata = []
> > +
> > +    def _pack(self, d):
> > +        return d[0] + '\x00' + d[1][:20].encode('hex') + d[2] + '\n'
> > +
> > +    def text(self):
> > +        self._compact()
> > +        return self.data
> >
> >      def diff(self, m2, clean=False):
> >          '''Finds changes between the current manifest and m2.'''
> > +        # XXX think whether efficiency matters here
> >          diff = {}
> >
> > -        for fn, e1 in self.iteritems():
> > +        for fn, e1, flags in self.iterentries():
> >              if fn not in m2:
> > -                diff[fn] = e1, (None, '')
> > +                diff[fn] = (e1, flags), (None, '')
> >              else:
> >                  e2 = m2[fn]
> > -                if e1 != e2:
> > -                    diff[fn] = e1, e2
> > +                if (e1, flags) != e2:
> > +                    diff[fn] = (e1, flags), e2
> >                  elif clean:
> >                      diff[fn] = None
> >
> > -        for fn, e2 in m2.iteritems():
> > +        for fn, e2, flags in m2.iterentries():
> >              if fn not in self:
> > -                diff[fn] = (None, ''), e2
> > +                diff[fn] = (None, ''), (e2, flags)
> >
> >          return diff
> >
> > +    def iterentries(self):
> > +        return lazymanifestiterentries(self)
> > +
> > +    def iterkeys(self):
> > +        return lazymanifestiter(self)
> > +
> > +    def __iter__(self):
> > +        return lazymanifestiter(self)
> > +
> > +    def __len__(self):
> > +        return len(self.positions)
> > +
> >      def filtercopy(self, filterfn):
> > +        # XXX should be optimized
> >          c = _lazymanifest('')
> >          for f, n, fl in self.iterentries():
> >              if filterfn(f):
> >                  c[f] = n, fl
> >          return c
> >
> > -    def text(self):
> > -        """Get the full data of this manifest as a bytestring."""
> > -        return _textv1(self.iterentries())
> > -
> >  try:
> >      _lazymanifest = parsers.lazymanifest
> >  except AttributeError:
> > diff --git a/mercurial/pure/bdiff.py b/mercurial/pure/bdiff.py
> > --- a/mercurial/pure/bdiff.py
> > +++ b/mercurial/pure/bdiff.py
> > @@ -111,8 +111,8 @@
> >          def blocks(sa, sb):
> >              a = ffi.new("struct bdiff_line**")
> >              b = ffi.new("struct bdiff_line**")
> > -            ac = ffi.new("char[]", sa)
> > -            bc = ffi.new("char[]", sb)
> > +            ac = ffi.new("char[]", str(sa))
> > +            bc = ffi.new("char[]", str(sb))
> >              l = ffi.new("struct bdiff_hunk*")
> >              try:
> >                  an = lib.bdiff_splitlines(ac, len(sa), a)
> > @@ -138,8 +138,8 @@
> >          def bdiff(sa, sb):
> >              a = ffi.new("struct bdiff_line**")
> >              b = ffi.new("struct bdiff_line**")
> > -            ac = ffi.new("char[]", sa)
> > -            bc = ffi.new("char[]", sb)
> > +            ac = ffi.new("char[]", str(sa))
> > +            bc = ffi.new("char[]", str(sb))
> >              l = ffi.new("struct bdiff_hunk*")
> >              try:
> >                  an = lib.bdiff_splitlines(ac, len(sa), a)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Maciej Fijalkowski - Sept. 25, 2016, 7:13 a.m.
I must say (and you are going to say that those are idle complaints)
that asserts and prints are far inferior than a proper debugger which
is surprisingly hard to use in the case of mercurial tests

On Tue, Sep 20, 2016 at 4:39 PM, Augie Fackler <raf@durin42.com> wrote:
> On Sat, Sep 17, 2016 at 08:23:36PM +0200, Maciej Fijalkowski wrote:
>> This should fix the issues presented.
>>
>> There is one problem which is that the hash in test-rebase-detach
>> changes. The way I see it is just an RNG phase-shift, but it might be
>> a real bug. Some actual input will be appreciated.
>
> It's not possibly RNG related - there's no RNG that goes into the sha1
> of a node. I've added --debug to the failure in test-rebase-detach and
> compared them between cpython and pypy:
>
> cpython:
> @@ -366,11 +366,24 @@
>
>
>    $ hg log --rev tip --debug
> -  changeset:   8:9472f4b1d736
> +  changeset:   8:9472f4b1d7366902d0098aa1401e3f170000cc56
>    tag:         tip
> +  phase:       draft
> +  parent:      7:02de42196ebee42ef284b6780a87cdc96e8eaab6
> +  parent:      -1:0000000000000000000000000000000000000000
> +  manifest:    8:8dbdcf066a97239fde2d0dba19fcb1e699fa66d2
>    user:        test
>    date:        Thu Jan 01 00:00:00 1970 +0000
> -  summary:     Collapsed revision
> +  files:       F
> +  files+:      E
> +  extra:       branch=default
> +  extra:       rebase_source=9427d4d5af81c393e6b630129712cde8cdad5605
> +  description:
> +  Collapsed revision
> +  * I
> +  * Merge
> +  * J
> +
>
>
> pypy:
> @@ -366,11 +366,24 @@
>
>
>    $ hg log --rev tip --debug
> -  changeset:   8:9472f4b1d736
> +  changeset:   8:122ceff3b303b13e5ca323742a8ebe589b9f8066
>    tag:         tip
> +  phase:       draft
> +  parent:      7:02de42196ebee42ef284b6780a87cdc96e8eaab6
> +  parent:      -1:0000000000000000000000000000000000000000
> +  manifest:    8:09f1dc369d2667dcaeb9cf828b0fcb76bda29f33
>    user:        test
>    date:        Thu Jan 01 00:00:00 1970 +0000
> -  summary:     Collapsed revision
> +  files:       F
> +  files+:      E
> +  extra:       branch=default
> +  extra:       rebase_source=9427d4d5af81c393e6b630129712cde8cdad5605
> +  description:
> +  Collapsed revision
> +  * I
> +  * Merge
> +  * J
> +
>
> Which means the manifest is hashing out differently. That suggests
> there's a subtle bug in how the manifest is hashing out in the pypy
> version. Adding a `hg debugdata -m 8` (which prints raw manifest
> data), and this is what I get on cpython:
>
>    $ hg debugdata -m 8
> +  A\x0045f17b21388f07b8939b22052e5f3776e5246388 (esc)
> +  E\x00c3b9643004a8d1c7f39b0025cefab53dc8f7dc12 (esc)
> +  F\x00293a00dc38ac20f042c67ba8534eedc1b6a7ae15 (esc)
> +  H\x008500189e74a9e0475e822093bc7db0d631aeb0b4 (esc)
>
> on pypy:
>    $ hg debugdata -m 8
> +  A\x0045f17b21388f07b8939b22052e5f3776e5246388 (esc)
> +  F\x0022bfcfd62a21a3287edbd4d656218d0f525ed76a (esc)
> +  E\x00c3b9643004a8d1c7f39b0025cefab53dc8f7dc12 (esc)
> +  F\x00293a00dc38ac20f042c67ba8534eedc1b6a7ae15 (esc)
> +  H\x008500189e74a9e0475e822093bc7db0d631aeb0b4 (esc)
>
>
> So you've got a subtle bug here where the compaction of the
> lazymanifest isn't working right - probably around merging manifests
> somehow. Does this give you enough to get started?
>
> (I'd probably start debugging this by littering _lazymanifest._compact
> and _lazymanifest.text with asserts until I managed to trip over the
> un-sorted lines, and then backtrack from there to figure out how they
> managed to survive.)
>
>>
>>
>>
>> On Sat, Sep 17, 2016 at 8:22 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
>> > # HG changeset patch
>> > # User Maciej Fijalkowski <fijall@gmail.com>
>> > # Date 1473680234 -7200
>> > #      Mon Sep 12 13:37:14 2016 +0200
>> > # Node ID 7551f1e60b2155462d89a9571eec065e9f67debc
>> > # Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
>> > lazymanifest: write a more efficient, pypy friendly version of lazymanifest
>> >
>> > diff --git a/mercurial/manifest.py b/mercurial/manifest.py
>> > --- a/mercurial/manifest.py
>> > +++ b/mercurial/manifest.py
>> > @@ -104,69 +104,297 @@
>> >      _checkforbidden(files)
>> >      return ''.join(lines)
>> >
>> > -class _lazymanifest(dict):
>> > -    """This is the pure implementation of lazymanifest.
>> > -
>> > -    It has not been optimized *at all* and is not lazy.
>> > -    """
>> > -
>> > -    def __init__(self, data):
>> > -        dict.__init__(self)
>> > -        for f, n, fl in _parse(data):
>> > -            self[f] = n, fl
>> > -
>> > -    def __setitem__(self, k, v):
>> > -        node, flag = v
>> > -        assert node is not None
>> > -        if len(node) > 21:
>> > -            node = node[:21] # match c implementation behavior
>> > -        dict.__setitem__(self, k, (node, flag))
>> > +class lazymanifestiter(object):
>> > +    def __init__(self, lm):
>> > +        self.pos = 0
>> > +        self.lm = lm
>> >
>> >      def __iter__(self):
>> > -        return iter(sorted(dict.keys(self)))
>> > +        return self
>> >
>> > -    def iterkeys(self):
>> > -        return iter(sorted(dict.keys(self)))
>> > +    def next(self):
>> > +        try:
>> > +            data, pos = self.lm._get(self.pos)
>> > +        except IndexError:
>> > +            raise StopIteration
>> > +        if pos == -1:
>> > +            self.pos += 1
>> > +            return data[0]
>> > +        self.pos += 1
>> > +        zeropos = data.find('\x00', pos)
>> > +        return data[pos:zeropos]
>> >
>> > -    def iterentries(self):
>> > -        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
>> > +class lazymanifestiterentries(object):
>> > +    def __init__(self, lm):
>> > +        self.lm = lm
>> > +        self.pos = 0
>> > +
>> > +    def __iter__(self):
>> > +        return self
>> > +
>> > +    def next(self):
>> > +        try:
>> > +            data, pos = self.lm._get(self.pos)
>> > +        except IndexError:
>> > +            raise StopIteration
>> > +        if pos == -1:
>> > +            self.pos += 1
>> > +            return data
>> > +        zeropos = data.find('\x00', pos)
>> > +        hashval = unhexlify(data, self.lm.extrainfo[self.pos],
>> > +                            zeropos + 1, 40)
>> > +        flags = self.lm._getflags(data, self.pos, zeropos)
>> > +        self.pos += 1
>> > +        return (data[pos:zeropos], hashval, flags)
>> > +
>> > +def unhexlify(data, extra, pos, length):
>> > +    s = data[pos:pos + length].decode('hex')
>> > +    if extra:
>> > +        s += chr(extra & 0xff)
>> > +    return s
>> > +
>> > +def _cmp(a, b):
>> > +    return (a > b) - (a < b)
>> > +
>> > +class _lazymanifest(object):
>> > +    def __init__(self, data, positions=None, extrainfo=None, extradata=None):
>> > +        if positions is None:
>> > +            self.positions = self.findlines(data)
>> > +            self.extrainfo = [0] * len(self.positions)
>> > +            self.data = data
>> > +            self.extradata = []
>> > +        else:
>> > +            self.positions = positions[:]
>> > +            self.extrainfo = extrainfo[:]
>> > +            self.extradata = extradata[:]
>> > +            self.data = data
>> > +
>> > +    def findlines(self, data):
>> > +        if not data:
>> > +            return []
>> > +        pos = data.find("\n")
>> > +        positions = [0]
>> > +        while pos < len(data) - 1 and pos != -1:
>> > +            positions.append(pos + 1)
>> > +            pos = data.find("\n", pos + 1)
>> > +        return positions
>> > +
>> > +    def _get(self, index):
>> > +        # get the position encoded in pos:
>> > +        #   positive number is an index in 'data'
>> > +        #   negative number is in extrapieces
>> > +        pos = self.positions[index]
>> > +        if pos >= 0:
>> > +            return self.data, pos
>> > +        return self.extradata[-pos - 1], -1
>> > +
>> > +    def _getkey(self, pos):
>> > +        if pos >= 0:
>> > +            return self.data[pos:self.data.find('\x00', pos + 1)]
>> > +        return self.extradata[-pos - 1][0]
>> > +
>> > +    def bsearch(self, key):
>> > +        first = 0
>> > +        last = len(self.positions) - 1
>> > +
>> > +        while first <= last:
>> > +            midpoint = (first + last)//2
>> > +            nextpos = self.positions[midpoint]
>> > +            candidate = self._getkey(nextpos)
>> > +            r = _cmp(key, candidate)
>> > +            if r == 0:
>> > +                return midpoint
>> > +            else:
>> > +                if r < 0:
>> > +                    last = midpoint - 1
>> > +                else:
>> > +                    first = midpoint + 1
>> > +        return -1
>> > +
>> > +    def bsearch2(self, key):
>> > +        # same as the above, but will always return the position
>> > +        # done for performance reasons
>> > +        first = 0
>> > +        last = len(self.positions) - 1
>> > +
>> > +        while first <= last:
>> > +            midpoint = (first + last)//2
>> > +            nextpos = self.positions[midpoint]
>> > +            candidate = self._getkey(nextpos)
>> > +            r = _cmp(key, candidate)
>> > +            if r == 0:
>> > +                return (midpoint, True)
>> > +            else:
>> > +                if r < 0:
>> > +                    last = midpoint - 1
>> > +                else:
>> > +                    first = midpoint + 1
>> > +        return (first, False)
>> > +
>> > +    def __contains__(self, key):
>> > +        return self.bsearch(key) != -1
>> > +
>> > +    def _getflags(self, data, needle, pos):
>> > +        start = pos + 41
>> > +        end = data.find("\n", start)
>> > +        if end == -1:
>> > +            end = len(data) - 1
>> > +        if start == end:
>> > +            return ''
>> > +        return self.data[start:end]
>> > +
>> > +    def __getitem__(self, key):
>> > +        if not isinstance(key, str):
>> > +            raise TypeError("getitem: manifest keys must be a string.")
>> > +        needle = self.bsearch(key)
>> > +        if needle == -1:
>> > +            raise KeyError
>> > +        data, pos = self._get(needle)
>> > +        if pos == -1:
>> > +            return (data[1], data[2])
>> > +        zeropos = data.find('\x00', pos)
>> > +        assert 0 <= needle <= len(self.positions)
>> > +        assert len(self.extrainfo) == len(self.positions)
>> > +        hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
>> > +        flags = self._getflags(data, needle, zeropos)
>> > +        return (hashval, flags)
>> > +
>> > +    def __delitem__(self, key):
>> > +        needle, found = self.bsearch2(key)
>> > +        if not found:
>> > +            raise KeyError
>> > +        cur = self.positions[needle]
>> > +        self.positions = self.positions[:needle] + self.positions[needle + 1:]
>> > +        self.extrainfo = self.extrainfo[:needle] + self.extrainfo[needle + 1:]
>> > +        if cur >= 0:
>> > +            self.data = self.data[:cur] + '\x00' + self.data[cur + 1:]
>> > +
>> > +    def __setitem__(self, key, value):
>> > +        if not isinstance(key, str):
>> > +            raise TypeError("setitem: manifest keys must be a string.")
>> > +        if not isinstance(value, tuple) or len(value) != 2:
>> > +            raise TypeError("Manifest values must be a tuple of (node, flags).")
>> > +        hashval = value[0]
>> > +        if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
>> > +            raise TypeError("node must be a 20-byte string")
>> > +        flags = value[1]
>> > +        if not isinstance(flags, str) or len(flags) > 1:
>> > +            raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
>> > +        needle, found = self.bsearch2(key)
>> > +        if found:
>> > +            # put the item
>> > +            pos = self.positions[needle]
>> > +            if pos < 0:
>> > +                self.extradata[-pos - 1] = (key, value[0], value[1])
>> > +            else:
>> > +                # just don't bother
>> > +                self.extradata.append((key, value[0], value[1]))
>> > +                self.positions[needle] = -len(self.extradata)
>> > +        else:
>> > +            # not found, put it in with extra positions
>> > +            self.extradata.append((key, value[0], value[1]))
>> > +            self.positions = (self.positions[:needle] + [-len(self.extradata)]
>> > +                              + self.positions[needle:])
>> > +            self.extrainfo = (self.extrainfo[:needle] + [0] +
>> > +                              self.extrainfo[needle:])
>> >
>> >      def copy(self):
>> > -        c = _lazymanifest('')
>> > -        c.update(self)
>> > -        return c
>> > +        # XXX call _compact like in C?
>> > +        return _lazymanifest(self.data, self.positions, self.extrainfo,
>> > +            self.extradata)
>> > +
>> > +    def _compact(self):
>> > +        # hopefully not called TOO often
>> > +        def findnextposition(positions, start, lgt):
>> > +            i = start
>> > +            while i < len(positions):
>> > +                if positions[i] >= 0:
>> > +                    return positions[i]
>> > +                i += 1
>> > +            return lgt
>> > +
>> > +        if len(self.extradata) == 0:
>> > +            return
>> > +        l = []
>> > +        last_cut = 0
>> > +        i = 0
>> > +        offset = 0
>> > +        self.extrainfo = [0] * len(self.positions)
>> > +        while i < len(self.positions):
>> > +            if self.positions[i] >= 0:
>> > +                cur = self.positions[i]
>> > +                last_cut = cur
>> > +                while True:
>> > +                    self.positions[i] = offset
>> > +                    i += 1
>> > +                    if i == len(self.positions) or self.positions[i] < 0:
>> > +                        break
>> > +                    offset += self.positions[i] - cur
>> > +                    cur = self.positions[i]
>> > +                end_cut = findnextposition(self.positions, i, len(self.data))
>> > +                offset += end_cut - cur
>> > +                l.append(self.data[last_cut:end_cut])
>> > +            else:
>> > +                while i < len(self.positions) and self.positions[i] < 0:
>> > +                    cur = self.positions[i]
>> > +                    t = self.extradata[-cur - 1]
>> > +                    l.append(self._pack(t))
>> > +                    self.positions[i] = offset
>> > +                    if len(t[1]) > 20:
>> > +                        self.extrainfo[i] = ord(t[1][21])
>> > +                    offset += len(l[-1])
>> > +                    i += 1
>> > +        self.data = ''.join(l)
>> > +        self.extradata = []
>> > +
>> > +    def _pack(self, d):
>> > +        return d[0] + '\x00' + d[1][:20].encode('hex') + d[2] + '\n'
>> > +
>> > +    def text(self):
>> > +        self._compact()
>> > +        return self.data
>> >
>> >      def diff(self, m2, clean=False):
>> >          '''Finds changes between the current manifest and m2.'''
>> > +        # XXX think whether efficiency matters here
>> >          diff = {}
>> >
>> > -        for fn, e1 in self.iteritems():
>> > +        for fn, e1, flags in self.iterentries():
>> >              if fn not in m2:
>> > -                diff[fn] = e1, (None, '')
>> > +                diff[fn] = (e1, flags), (None, '')
>> >              else:
>> >                  e2 = m2[fn]
>> > -                if e1 != e2:
>> > -                    diff[fn] = e1, e2
>> > +                if (e1, flags) != e2:
>> > +                    diff[fn] = (e1, flags), e2
>> >                  elif clean:
>> >                      diff[fn] = None
>> >
>> > -        for fn, e2 in m2.iteritems():
>> > +        for fn, e2, flags in m2.iterentries():
>> >              if fn not in self:
>> > -                diff[fn] = (None, ''), e2
>> > +                diff[fn] = (None, ''), (e2, flags)
>> >
>> >          return diff
>> >
>> > +    def iterentries(self):
>> > +        return lazymanifestiterentries(self)
>> > +
>> > +    def iterkeys(self):
>> > +        return lazymanifestiter(self)
>> > +
>> > +    def __iter__(self):
>> > +        return lazymanifestiter(self)
>> > +
>> > +    def __len__(self):
>> > +        return len(self.positions)
>> > +
>> >      def filtercopy(self, filterfn):
>> > +        # XXX should be optimized
>> >          c = _lazymanifest('')
>> >          for f, n, fl in self.iterentries():
>> >              if filterfn(f):
>> >                  c[f] = n, fl
>> >          return c
>> >
>> > -    def text(self):
>> > -        """Get the full data of this manifest as a bytestring."""
>> > -        return _textv1(self.iterentries())
>> > -
>> >  try:
>> >      _lazymanifest = parsers.lazymanifest
>> >  except AttributeError:
>> > diff --git a/mercurial/pure/bdiff.py b/mercurial/pure/bdiff.py
>> > --- a/mercurial/pure/bdiff.py
>> > +++ b/mercurial/pure/bdiff.py
>> > @@ -111,8 +111,8 @@
>> >          def blocks(sa, sb):
>> >              a = ffi.new("struct bdiff_line**")
>> >              b = ffi.new("struct bdiff_line**")
>> > -            ac = ffi.new("char[]", sa)
>> > -            bc = ffi.new("char[]", sb)
>> > +            ac = ffi.new("char[]", str(sa))
>> > +            bc = ffi.new("char[]", str(sb))
>> >              l = ffi.new("struct bdiff_hunk*")
>> >              try:
>> >                  an = lib.bdiff_splitlines(ac, len(sa), a)
>> > @@ -138,8 +138,8 @@
>> >          def bdiff(sa, sb):
>> >              a = ffi.new("struct bdiff_line**")
>> >              b = ffi.new("struct bdiff_line**")
>> > -            ac = ffi.new("char[]", sa)
>> > -            bc = ffi.new("char[]", sb)
>> > +            ac = ffi.new("char[]", str(sa))
>> > +            bc = ffi.new("char[]", str(sb))
>> >              l = ffi.new("struct bdiff_hunk*")
>> >              try:
>> >                  an = lib.bdiff_splitlines(ac, len(sa), a)
>> _______________________________________________
>> Mercurial-devel mailing list
>> Mercurial-devel@mercurial-scm.org
>> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Gregory Szorc - Sept. 25, 2016, 8:28 a.m.
./run-tests.py -l -d test.t

If you have a breakpoint set, it should just work (unless server processes detached from stdin are involved).

> On Sep 25, 2016, at 00:13, Maciej Fijalkowski <fijall@gmail.com> wrote:
> 
> I must say (and you are going to say that those are idle complaints)
> that asserts and prints are far inferior than a proper debugger which
> is surprisingly hard to use in the case of mercurial tests
> 
>> On Tue, Sep 20, 2016 at 4:39 PM, Augie Fackler <raf@durin42.com> wrote:
>>> On Sat, Sep 17, 2016 at 08:23:36PM +0200, Maciej Fijalkowski wrote:
>>> This should fix the issues presented.
>>> 
>>> There is one problem which is that the hash in test-rebase-detach
>>> changes. The way I see it is just an RNG phase-shift, but it might be
>>> a real bug. Some actual input will be appreciated.
>> 
>> It's not possibly RNG related - there's no RNG that goes into the sha1
>> of a node. I've added --debug to the failure in test-rebase-detach and
>> compared them between cpython and pypy:
>> 
>> cpython:
>> @@ -366,11 +366,24 @@
>> 
>> 
>>   $ hg log --rev tip --debug
>> -  changeset:   8:9472f4b1d736
>> +  changeset:   8:9472f4b1d7366902d0098aa1401e3f170000cc56
>>   tag:         tip
>> +  phase:       draft
>> +  parent:      7:02de42196ebee42ef284b6780a87cdc96e8eaab6
>> +  parent:      -1:0000000000000000000000000000000000000000
>> +  manifest:    8:8dbdcf066a97239fde2d0dba19fcb1e699fa66d2
>>   user:        test
>>   date:        Thu Jan 01 00:00:00 1970 +0000
>> -  summary:     Collapsed revision
>> +  files:       F
>> +  files+:      E
>> +  extra:       branch=default
>> +  extra:       rebase_source=9427d4d5af81c393e6b630129712cde8cdad5605
>> +  description:
>> +  Collapsed revision
>> +  * I
>> +  * Merge
>> +  * J
>> +
>> 
>> 
>> pypy:
>> @@ -366,11 +366,24 @@
>> 
>> 
>>   $ hg log --rev tip --debug
>> -  changeset:   8:9472f4b1d736
>> +  changeset:   8:122ceff3b303b13e5ca323742a8ebe589b9f8066
>>   tag:         tip
>> +  phase:       draft
>> +  parent:      7:02de42196ebee42ef284b6780a87cdc96e8eaab6
>> +  parent:      -1:0000000000000000000000000000000000000000
>> +  manifest:    8:09f1dc369d2667dcaeb9cf828b0fcb76bda29f33
>>   user:        test
>>   date:        Thu Jan 01 00:00:00 1970 +0000
>> -  summary:     Collapsed revision
>> +  files:       F
>> +  files+:      E
>> +  extra:       branch=default
>> +  extra:       rebase_source=9427d4d5af81c393e6b630129712cde8cdad5605
>> +  description:
>> +  Collapsed revision
>> +  * I
>> +  * Merge
>> +  * J
>> +
>> 
>> Which means the manifest is hashing out differently. That suggests
>> there's a subtle bug in how the manifest is hashing out in the pypy
>> version. Adding a `hg debugdata -m 8` (which prints raw manifest
>> data), and this is what I get on cpython:
>> 
>>   $ hg debugdata -m 8
>> +  A\x0045f17b21388f07b8939b22052e5f3776e5246388 (esc)
>> +  E\x00c3b9643004a8d1c7f39b0025cefab53dc8f7dc12 (esc)
>> +  F\x00293a00dc38ac20f042c67ba8534eedc1b6a7ae15 (esc)
>> +  H\x008500189e74a9e0475e822093bc7db0d631aeb0b4 (esc)
>> 
>> on pypy:
>>   $ hg debugdata -m 8
>> +  A\x0045f17b21388f07b8939b22052e5f3776e5246388 (esc)
>> +  F\x0022bfcfd62a21a3287edbd4d656218d0f525ed76a (esc)
>> +  E\x00c3b9643004a8d1c7f39b0025cefab53dc8f7dc12 (esc)
>> +  F\x00293a00dc38ac20f042c67ba8534eedc1b6a7ae15 (esc)
>> +  H\x008500189e74a9e0475e822093bc7db0d631aeb0b4 (esc)
>> 
>> 
>> So you've got a subtle bug here where the compaction of the
>> lazymanifest isn't working right - probably around merging manifests
>> somehow. Does this give you enough to get started?
>> 
>> (I'd probably start debugging this by littering _lazymanifest._compact
>> and _lazymanifest.text with asserts until I managed to trip over the
>> un-sorted lines, and then backtrack from there to figure out how they
>> managed to survive.)
>> 
>>> 
>>> 
>>> 
>>>> On Sat, Sep 17, 2016 at 8:22 PM, Maciej Fijalkowski <fijall@gmail.com> wrote:
>>>> # HG changeset patch
>>>> # User Maciej Fijalkowski <fijall@gmail.com>
>>>> # Date 1473680234 -7200
>>>> #      Mon Sep 12 13:37:14 2016 +0200
>>>> # Node ID 7551f1e60b2155462d89a9571eec065e9f67debc
>>>> # Parent  df05c43bd1e64f1620d0b2e502f4603c1e5a8341
>>>> lazymanifest: write a more efficient, pypy friendly version of lazymanifest
>>>> 
>>>> diff --git a/mercurial/manifest.py b/mercurial/manifest.py
>>>> --- a/mercurial/manifest.py
>>>> +++ b/mercurial/manifest.py
>>>> @@ -104,69 +104,297 @@
>>>>     _checkforbidden(files)
>>>>     return ''.join(lines)
>>>> 
>>>> -class _lazymanifest(dict):
>>>> -    """This is the pure implementation of lazymanifest.
>>>> -
>>>> -    It has not been optimized *at all* and is not lazy.
>>>> -    """
>>>> -
>>>> -    def __init__(self, data):
>>>> -        dict.__init__(self)
>>>> -        for f, n, fl in _parse(data):
>>>> -            self[f] = n, fl
>>>> -
>>>> -    def __setitem__(self, k, v):
>>>> -        node, flag = v
>>>> -        assert node is not None
>>>> -        if len(node) > 21:
>>>> -            node = node[:21] # match c implementation behavior
>>>> -        dict.__setitem__(self, k, (node, flag))
>>>> +class lazymanifestiter(object):
>>>> +    def __init__(self, lm):
>>>> +        self.pos = 0
>>>> +        self.lm = lm
>>>> 
>>>>     def __iter__(self):
>>>> -        return iter(sorted(dict.keys(self)))
>>>> +        return self
>>>> 
>>>> -    def iterkeys(self):
>>>> -        return iter(sorted(dict.keys(self)))
>>>> +    def next(self):
>>>> +        try:
>>>> +            data, pos = self.lm._get(self.pos)
>>>> +        except IndexError:
>>>> +            raise StopIteration
>>>> +        if pos == -1:
>>>> +            self.pos += 1
>>>> +            return data[0]
>>>> +        self.pos += 1
>>>> +        zeropos = data.find('\x00', pos)
>>>> +        return data[pos:zeropos]
>>>> 
>>>> -    def iterentries(self):
>>>> -        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
>>>> +class lazymanifestiterentries(object):
>>>> +    def __init__(self, lm):
>>>> +        self.lm = lm
>>>> +        self.pos = 0
>>>> +
>>>> +    def __iter__(self):
>>>> +        return self
>>>> +
>>>> +    def next(self):
>>>> +        try:
>>>> +            data, pos = self.lm._get(self.pos)
>>>> +        except IndexError:
>>>> +            raise StopIteration
>>>> +        if pos == -1:
>>>> +            self.pos += 1
>>>> +            return data
>>>> +        zeropos = data.find('\x00', pos)
>>>> +        hashval = unhexlify(data, self.lm.extrainfo[self.pos],
>>>> +                            zeropos + 1, 40)
>>>> +        flags = self.lm._getflags(data, self.pos, zeropos)
>>>> +        self.pos += 1
>>>> +        return (data[pos:zeropos], hashval, flags)
>>>> +
>>>> +def unhexlify(data, extra, pos, length):
>>>> +    s = data[pos:pos + length].decode('hex')
>>>> +    if extra:
>>>> +        s += chr(extra & 0xff)
>>>> +    return s
>>>> +
>>>> +def _cmp(a, b):
>>>> +    return (a > b) - (a < b)
>>>> +
>>>> +class _lazymanifest(object):
>>>> +    def __init__(self, data, positions=None, extrainfo=None, extradata=None):
>>>> +        if positions is None:
>>>> +            self.positions = self.findlines(data)
>>>> +            self.extrainfo = [0] * len(self.positions)
>>>> +            self.data = data
>>>> +            self.extradata = []
>>>> +        else:
>>>> +            self.positions = positions[:]
>>>> +            self.extrainfo = extrainfo[:]
>>>> +            self.extradata = extradata[:]
>>>> +            self.data = data
>>>> +
>>>> +    def findlines(self, data):
>>>> +        if not data:
>>>> +            return []
>>>> +        pos = data.find("\n")
>>>> +        positions = [0]
>>>> +        while pos < len(data) - 1 and pos != -1:
>>>> +            positions.append(pos + 1)
>>>> +            pos = data.find("\n", pos + 1)
>>>> +        return positions
>>>> +
>>>> +    def _get(self, index):
>>>> +        # get the position encoded in pos:
>>>> +        #   positive number is an index in 'data'
>>>> +        #   negative number is in extrapieces
>>>> +        pos = self.positions[index]
>>>> +        if pos >= 0:
>>>> +            return self.data, pos
>>>> +        return self.extradata[-pos - 1], -1
>>>> +
>>>> +    def _getkey(self, pos):
>>>> +        if pos >= 0:
>>>> +            return self.data[pos:self.data.find('\x00', pos + 1)]
>>>> +        return self.extradata[-pos - 1][0]
>>>> +
>>>> +    def bsearch(self, key):
>>>> +        first = 0
>>>> +        last = len(self.positions) - 1
>>>> +
>>>> +        while first <= last:
>>>> +            midpoint = (first + last)//2
>>>> +            nextpos = self.positions[midpoint]
>>>> +            candidate = self._getkey(nextpos)
>>>> +            r = _cmp(key, candidate)
>>>> +            if r == 0:
>>>> +                return midpoint
>>>> +            else:
>>>> +                if r < 0:
>>>> +                    last = midpoint - 1
>>>> +                else:
>>>> +                    first = midpoint + 1
>>>> +        return -1
>>>> +
>>>> +    def bsearch2(self, key):
>>>> +        # same as the above, but will always return the position
>>>> +        # done for performance reasons
>>>> +        first = 0
>>>> +        last = len(self.positions) - 1
>>>> +
>>>> +        while first <= last:
>>>> +            midpoint = (first + last)//2
>>>> +            nextpos = self.positions[midpoint]
>>>> +            candidate = self._getkey(nextpos)
>>>> +            r = _cmp(key, candidate)
>>>> +            if r == 0:
>>>> +                return (midpoint, True)
>>>> +            else:
>>>> +                if r < 0:
>>>> +                    last = midpoint - 1
>>>> +                else:
>>>> +                    first = midpoint + 1
>>>> +        return (first, False)
>>>> +
>>>> +    def __contains__(self, key):
>>>> +        return self.bsearch(key) != -1
>>>> +
>>>> +    def _getflags(self, data, needle, pos):
>>>> +        start = pos + 41
>>>> +        end = data.find("\n", start)
>>>> +        if end == -1:
>>>> +            end = len(data) - 1
>>>> +        if start == end:
>>>> +            return ''
>>>> +        return self.data[start:end]
>>>> +
>>>> +    def __getitem__(self, key):
>>>> +        if not isinstance(key, str):
>>>> +            raise TypeError("getitem: manifest keys must be a string.")
>>>> +        needle = self.bsearch(key)
>>>> +        if needle == -1:
>>>> +            raise KeyError
>>>> +        data, pos = self._get(needle)
>>>> +        if pos == -1:
>>>> +            return (data[1], data[2])
>>>> +        zeropos = data.find('\x00', pos)
>>>> +        assert 0 <= needle <= len(self.positions)
>>>> +        assert len(self.extrainfo) == len(self.positions)
>>>> +        hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
>>>> +        flags = self._getflags(data, needle, zeropos)
>>>> +        return (hashval, flags)
>>>> +
>>>> +    def __delitem__(self, key):
>>>> +        needle, found = self.bsearch2(key)
>>>> +        if not found:
>>>> +            raise KeyError
>>>> +        cur = self.positions[needle]
>>>> +        self.positions = self.positions[:needle] + self.positions[needle + 1:]
>>>> +        self.extrainfo = self.extrainfo[:needle] + self.extrainfo[needle + 1:]
>>>> +        if cur >= 0:
>>>> +            self.data = self.data[:cur] + '\x00' + self.data[cur + 1:]
>>>> +
>>>> +    def __setitem__(self, key, value):
>>>> +        if not isinstance(key, str):
>>>> +            raise TypeError("setitem: manifest keys must be a string.")
>>>> +        if not isinstance(value, tuple) or len(value) != 2:
>>>> +            raise TypeError("Manifest values must be a tuple of (node, flags).")
>>>> +        hashval = value[0]
>>>> +        if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
>>>> +            raise TypeError("node must be a 20-byte string")
>>>> +        flags = value[1]
>>>> +        if not isinstance(flags, str) or len(flags) > 1:
>>>> +            raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
>>>> +        needle, found = self.bsearch2(key)
>>>> +        if found:
>>>> +            # put the item
>>>> +            pos = self.positions[needle]
>>>> +            if pos < 0:
>>>> +                self.extradata[-pos - 1] = (key, value[0], value[1])
>>>> +            else:
>>>> +                # just don't bother
>>>> +                self.extradata.append((key, value[0], value[1]))
>>>> +                self.positions[needle] = -len(self.extradata)
>>>> +        else:
>>>> +            # not found, put it in with extra positions
>>>> +            self.extradata.append((key, value[0], value[1]))
>>>> +            self.positions = (self.positions[:needle] + [-len(self.extradata)]
>>>> +                              + self.positions[needle:])
>>>> +            self.extrainfo = (self.extrainfo[:needle] + [0] +
>>>> +                              self.extrainfo[needle:])
>>>> 
>>>>     def copy(self):
>>>> -        c = _lazymanifest('')
>>>> -        c.update(self)
>>>> -        return c
>>>> +        # XXX call _compact like in C?
>>>> +        return _lazymanifest(self.data, self.positions, self.extrainfo,
>>>> +            self.extradata)
>>>> +
>>>> +    def _compact(self):
>>>> +        # hopefully not called TOO often
>>>> +        def findnextposition(positions, start, lgt):
>>>> +            i = start
>>>> +            while i < len(positions):
>>>> +                if positions[i] >= 0:
>>>> +                    return positions[i]
>>>> +                i += 1
>>>> +            return lgt
>>>> +
>>>> +        if len(self.extradata) == 0:
>>>> +            return
>>>> +        l = []
>>>> +        last_cut = 0
>>>> +        i = 0
>>>> +        offset = 0
>>>> +        self.extrainfo = [0] * len(self.positions)
>>>> +        while i < len(self.positions):
>>>> +            if self.positions[i] >= 0:
>>>> +                cur = self.positions[i]
>>>> +                last_cut = cur
>>>> +                while True:
>>>> +                    self.positions[i] = offset
>>>> +                    i += 1
>>>> +                    if i == len(self.positions) or self.positions[i] < 0:
>>>> +                        break
>>>> +                    offset += self.positions[i] - cur
>>>> +                    cur = self.positions[i]
>>>> +                end_cut = findnextposition(self.positions, i, len(self.data))
>>>> +                offset += end_cut - cur
>>>> +                l.append(self.data[last_cut:end_cut])
>>>> +            else:
>>>> +                while i < len(self.positions) and self.positions[i] < 0:
>>>> +                    cur = self.positions[i]
>>>> +                    t = self.extradata[-cur - 1]
>>>> +                    l.append(self._pack(t))
>>>> +                    self.positions[i] = offset
>>>> +                    if len(t[1]) > 20:
>>>> +                        self.extrainfo[i] = ord(t[1][21])
>>>> +                    offset += len(l[-1])
>>>> +                    i += 1
>>>> +        self.data = ''.join(l)
>>>> +        self.extradata = []
>>>> +
>>>> +    def _pack(self, d):
>>>> +        return d[0] + '\x00' + d[1][:20].encode('hex') + d[2] + '\n'
>>>> +
>>>> +    def text(self):
>>>> +        self._compact()
>>>> +        return self.data
>>>> 
>>>>     def diff(self, m2, clean=False):
>>>>         '''Finds changes between the current manifest and m2.'''
>>>> +        # XXX think whether efficiency matters here
>>>>         diff = {}
>>>> 
>>>> -        for fn, e1 in self.iteritems():
>>>> +        for fn, e1, flags in self.iterentries():
>>>>             if fn not in m2:
>>>> -                diff[fn] = e1, (None, '')
>>>> +                diff[fn] = (e1, flags), (None, '')
>>>>             else:
>>>>                 e2 = m2[fn]
>>>> -                if e1 != e2:
>>>> -                    diff[fn] = e1, e2
>>>> +                if (e1, flags) != e2:
>>>> +                    diff[fn] = (e1, flags), e2
>>>>                 elif clean:
>>>>                     diff[fn] = None
>>>> 
>>>> -        for fn, e2 in m2.iteritems():
>>>> +        for fn, e2, flags in m2.iterentries():
>>>>             if fn not in self:
>>>> -                diff[fn] = (None, ''), e2
>>>> +                diff[fn] = (None, ''), (e2, flags)
>>>> 
>>>>         return diff
>>>> 
>>>> +    def iterentries(self):
>>>> +        return lazymanifestiterentries(self)
>>>> +
>>>> +    def iterkeys(self):
>>>> +        return lazymanifestiter(self)
>>>> +
>>>> +    def __iter__(self):
>>>> +        return lazymanifestiter(self)
>>>> +
>>>> +    def __len__(self):
>>>> +        return len(self.positions)
>>>> +
>>>>     def filtercopy(self, filterfn):
>>>> +        # XXX should be optimized
>>>>         c = _lazymanifest('')
>>>>         for f, n, fl in self.iterentries():
>>>>             if filterfn(f):
>>>>                 c[f] = n, fl
>>>>         return c
>>>> 
>>>> -    def text(self):
>>>> -        """Get the full data of this manifest as a bytestring."""
>>>> -        return _textv1(self.iterentries())
>>>> -
>>>> try:
>>>>     _lazymanifest = parsers.lazymanifest
>>>> except AttributeError:
>>>> diff --git a/mercurial/pure/bdiff.py b/mercurial/pure/bdiff.py
>>>> --- a/mercurial/pure/bdiff.py
>>>> +++ b/mercurial/pure/bdiff.py
>>>> @@ -111,8 +111,8 @@
>>>>         def blocks(sa, sb):
>>>>             a = ffi.new("struct bdiff_line**")
>>>>             b = ffi.new("struct bdiff_line**")
>>>> -            ac = ffi.new("char[]", sa)
>>>> -            bc = ffi.new("char[]", sb)
>>>> +            ac = ffi.new("char[]", str(sa))
>>>> +            bc = ffi.new("char[]", str(sb))
>>>>             l = ffi.new("struct bdiff_hunk*")
>>>>             try:
>>>>                 an = lib.bdiff_splitlines(ac, len(sa), a)
>>>> @@ -138,8 +138,8 @@
>>>>         def bdiff(sa, sb):
>>>>             a = ffi.new("struct bdiff_line**")
>>>>             b = ffi.new("struct bdiff_line**")
>>>> -            ac = ffi.new("char[]", sa)
>>>> -            bc = ffi.new("char[]", sb)
>>>> +            ac = ffi.new("char[]", str(sa))
>>>> +            bc = ffi.new("char[]", str(sb))
>>>>             l = ffi.new("struct bdiff_hunk*")
>>>>             try:
>>>>                 an = lib.bdiff_splitlines(ac, len(sa), a)
>>> _______________________________________________
>>> Mercurial-devel mailing list
>>> Mercurial-devel@mercurial-scm.org
>>> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Jun Wu - Sept. 25, 2016, 11:03 a.m.
Excerpts from Maciej Fijalkowski's message of 2016-09-25 09:13:29 +0200:
> a proper debugger which is surprisingly hard to use in the case of
> mercurial tests

If you mean ipdb cannot be used together with run-tests.py, it could be
solved by changing its I/O to /dev/tty, like:

    from IPython.core.debugger import Pdb
    originit = Pdb.__init__
    def pdbinit(*args, **kwargs):
        fin = open('/dev/tty', 'r')
        fout = open('/dev/tty', 'w')
        originit(*args, stdin=fin, stdout=fout, **kwargs)
    Pdb.__init__ = pdbinit
    import ipdb; ipdb.set_trace()
Maciej Fijalkowski - Sept. 28, 2016, 11:48 a.m.
cool, thanks guys!

I've sent a new version that should pass all the tests.

On Sun, Sep 25, 2016 at 1:03 PM, Jun Wu <quark@fb.com> wrote:
> Excerpts from Maciej Fijalkowski's message of 2016-09-25 09:13:29 +0200:
>> a proper debugger which is surprisingly hard to use in the case of
>> mercurial tests
>
> If you mean ipdb cannot be used together with run-tests.py, it could be
> solved by changing its I/O to /dev/tty, like:
>
>     from IPython.core.debugger import Pdb
>     originit = Pdb.__init__
>     def pdbinit(*args, **kwargs):
>         fin = open('/dev/tty', 'r')
>         fout = open('/dev/tty', 'w')
>         originit(*args, stdin=fin, stdout=fout, **kwargs)
>     Pdb.__init__ = pdbinit
>     import ipdb; ipdb.set_trace()

Patch

diff --git a/mercurial/manifest.py b/mercurial/manifest.py
--- a/mercurial/manifest.py
+++ b/mercurial/manifest.py
@@ -104,69 +104,297 @@ 
     _checkforbidden(files)
     return ''.join(lines)
 
-class _lazymanifest(dict):
-    """This is the pure implementation of lazymanifest.
-
-    It has not been optimized *at all* and is not lazy.
-    """
-
-    def __init__(self, data):
-        dict.__init__(self)
-        for f, n, fl in _parse(data):
-            self[f] = n, fl
-
-    def __setitem__(self, k, v):
-        node, flag = v
-        assert node is not None
-        if len(node) > 21:
-            node = node[:21] # match c implementation behavior
-        dict.__setitem__(self, k, (node, flag))
+class lazymanifestiter(object):
+    def __init__(self, lm):
+        self.pos = 0
+        self.lm = lm
 
     def __iter__(self):
-        return iter(sorted(dict.keys(self)))
+        return self
 
-    def iterkeys(self):
-        return iter(sorted(dict.keys(self)))
+    def next(self):
+        try:
+            data, pos = self.lm._get(self.pos)
+        except IndexError:
+            raise StopIteration
+        if pos == -1:
+            self.pos += 1
+            return data[0]
+        self.pos += 1
+        zeropos = data.find('\x00', pos)
+        return data[pos:zeropos]
 
-    def iterentries(self):
-        return ((f, e[0], e[1]) for f, e in sorted(self.iteritems()))
+class lazymanifestiterentries(object):
+    def __init__(self, lm):
+        self.lm = lm
+        self.pos = 0
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        try:
+            data, pos = self.lm._get(self.pos)
+        except IndexError:
+            raise StopIteration
+        if pos == -1:
+            self.pos += 1
+            return data
+        zeropos = data.find('\x00', pos)
+        hashval = unhexlify(data, self.lm.extrainfo[self.pos],
+                            zeropos + 1, 40)
+        flags = self.lm._getflags(data, self.pos, zeropos)
+        self.pos += 1
+        return (data[pos:zeropos], hashval, flags)
+
+def unhexlify(data, extra, pos, length):
+    s = data[pos:pos + length].decode('hex')
+    if extra:
+        s += chr(extra & 0xff)
+    return s
+
+def _cmp(a, b):
+    return (a > b) - (a < b)
+
+class _lazymanifest(object):
+    def __init__(self, data, positions=None, extrainfo=None, extradata=None):
+        if positions is None:
+            self.positions = self.findlines(data)
+            self.extrainfo = [0] * len(self.positions)
+            self.data = data
+            self.extradata = []
+        else:
+            self.positions = positions[:]
+            self.extrainfo = extrainfo[:]
+            self.extradata = extradata[:]
+            self.data = data
+
+    def findlines(self, data):
+        if not data:
+            return []
+        pos = data.find("\n")
+        positions = [0]
+        while pos < len(data) - 1 and pos != -1:
+            positions.append(pos + 1)
+            pos = data.find("\n", pos + 1)
+        return positions
+
+    def _get(self, index):
+        # get the position encoded in pos:
+        #   positive number is an index in 'data'
+        #   negative number is in extrapieces
+        pos = self.positions[index]
+        if pos >= 0:
+            return self.data, pos
+        return self.extradata[-pos - 1], -1
+
+    def _getkey(self, pos):
+        if pos >= 0:
+            return self.data[pos:self.data.find('\x00', pos + 1)]
+        return self.extradata[-pos - 1][0]
+
+    def bsearch(self, key):
+        first = 0
+        last = len(self.positions) - 1
+
+        while first <= last:
+            midpoint = (first + last)//2
+            nextpos = self.positions[midpoint]
+            candidate = self._getkey(nextpos)
+            r = _cmp(key, candidate)
+            if r == 0:
+                return midpoint
+            else:
+                if r < 0:
+                    last = midpoint - 1
+                else:
+                    first = midpoint + 1
+        return -1
+
+    def bsearch2(self, key):
+        # same as the above, but will always return the position
+        # done for performance reasons
+        first = 0
+        last = len(self.positions) - 1
+
+        while first <= last:
+            midpoint = (first + last)//2
+            nextpos = self.positions[midpoint]
+            candidate = self._getkey(nextpos)
+            r = _cmp(key, candidate)
+            if r == 0:
+                return (midpoint, True)
+            else:
+                if r < 0:
+                    last = midpoint - 1
+                else:
+                    first = midpoint + 1
+        return (first, False)
+
+    def __contains__(self, key):
+        return self.bsearch(key) != -1
+
+    def _getflags(self, data, needle, pos):
+        start = pos + 41
+        end = data.find("\n", start)
+        if end == -1:
+            end = len(data) - 1
+        if start == end:
+            return ''
+        return self.data[start:end]
+
+    def __getitem__(self, key):
+        if not isinstance(key, str):
+            raise TypeError("getitem: manifest keys must be a string.")
+        needle = self.bsearch(key)
+        if needle == -1:
+            raise KeyError
+        data, pos = self._get(needle)
+        if pos == -1:
+            return (data[1], data[2])
+        zeropos = data.find('\x00', pos)
+        assert 0 <= needle <= len(self.positions)
+        assert len(self.extrainfo) == len(self.positions)
+        hashval = unhexlify(data, self.extrainfo[needle], zeropos + 1, 40)
+        flags = self._getflags(data, needle, zeropos)
+        return (hashval, flags)
+
+    def __delitem__(self, key):
+        needle, found = self.bsearch2(key)
+        if not found:
+            raise KeyError
+        cur = self.positions[needle]
+        self.positions = self.positions[:needle] + self.positions[needle + 1:]
+        self.extrainfo = self.extrainfo[:needle] + self.extrainfo[needle + 1:]
+        if cur >= 0:
+            self.data = self.data[:cur] + '\x00' + self.data[cur + 1:]
+
+    def __setitem__(self, key, value):
+        if not isinstance(key, str):
+            raise TypeError("setitem: manifest keys must be a string.")
+        if not isinstance(value, tuple) or len(value) != 2:
+            raise TypeError("Manifest values must be a tuple of (node, flags).")
+        hashval = value[0]
+        if not isinstance(hashval, str) or not 20 <= len(hashval) <= 22:
+            raise TypeError("node must be a 20-byte string")
+        flags = value[1]
+        if not isinstance(flags, str) or len(flags) > 1:
+            raise TypeError("flags must a 0 or 1 byte string, got %r", flags)
+        needle, found = self.bsearch2(key)
+        if found:
+            # put the item
+            pos = self.positions[needle]
+            if pos < 0:
+                self.extradata[-pos - 1] = (key, value[0], value[1])
+            else:
+                # just don't bother
+                self.extradata.append((key, value[0], value[1]))
+                self.positions[needle] = -len(self.extradata)
+        else:
+            # not found, put it in with extra positions
+            self.extradata.append((key, value[0], value[1]))
+            self.positions = (self.positions[:needle] + [-len(self.extradata)]
+                              + self.positions[needle:])
+            self.extrainfo = (self.extrainfo[:needle] + [0] +
+                              self.extrainfo[needle:])
 
     def copy(self):
-        c = _lazymanifest('')
-        c.update(self)
-        return c
+        # XXX call _compact like in C?
+        return _lazymanifest(self.data, self.positions, self.extrainfo,
+            self.extradata)
+
+    def _compact(self):
+        # hopefully not called TOO often
+        def findnextposition(positions, start, lgt):
+            i = start
+            while i < len(positions):
+                if positions[i] >= 0:
+                    return positions[i]
+                i += 1
+            return lgt
+
+        if len(self.extradata) == 0:
+            return
+        l = []
+        last_cut = 0
+        i = 0
+        offset = 0
+        self.extrainfo = [0] * len(self.positions)
+        while i < len(self.positions):
+            if self.positions[i] >= 0:
+                cur = self.positions[i]
+                last_cut = cur
+                while True:
+                    self.positions[i] = offset
+                    i += 1
+                    if i == len(self.positions) or self.positions[i] < 0:
+                        break
+                    offset += self.positions[i] - cur
+                    cur = self.positions[i]
+                end_cut = findnextposition(self.positions, i, len(self.data))
+                offset += end_cut - cur
+                l.append(self.data[last_cut:end_cut])
+            else:
+                while i < len(self.positions) and self.positions[i] < 0:
+                    cur = self.positions[i]
+                    t = self.extradata[-cur - 1]
+                    l.append(self._pack(t))
+                    self.positions[i] = offset
+                    if len(t[1]) > 20:
+                        self.extrainfo[i] = ord(t[1][21])
+                    offset += len(l[-1])
+                    i += 1
+        self.data = ''.join(l)
+        self.extradata = []
+
+    def _pack(self, d):
+        return d[0] + '\x00' + d[1][:20].encode('hex') + d[2] + '\n'
+
+    def text(self):
+        self._compact()
+        return self.data
 
     def diff(self, m2, clean=False):
         '''Finds changes between the current manifest and m2.'''
+        # XXX think whether efficiency matters here
         diff = {}
 
-        for fn, e1 in self.iteritems():
+        for fn, e1, flags in self.iterentries():
             if fn not in m2:
-                diff[fn] = e1, (None, '')
+                diff[fn] = (e1, flags), (None, '')
             else:
                 e2 = m2[fn]
-                if e1 != e2:
-                    diff[fn] = e1, e2
+                if (e1, flags) != e2:
+                    diff[fn] = (e1, flags), e2
                 elif clean:
                     diff[fn] = None
 
-        for fn, e2 in m2.iteritems():
+        for fn, e2, flags in m2.iterentries():
             if fn not in self:
-                diff[fn] = (None, ''), e2
+                diff[fn] = (None, ''), (e2, flags)
 
         return diff
 
+    def iterentries(self):
+        return lazymanifestiterentries(self)
+
+    def iterkeys(self):
+        return lazymanifestiter(self)
+
+    def __iter__(self):
+        return lazymanifestiter(self)
+
+    def __len__(self):
+        return len(self.positions)
+
     def filtercopy(self, filterfn):
+        # XXX should be optimized
         c = _lazymanifest('')
         for f, n, fl in self.iterentries():
             if filterfn(f):
                 c[f] = n, fl
         return c
 
-    def text(self):
-        """Get the full data of this manifest as a bytestring."""
-        return _textv1(self.iterentries())
-
 try:
     _lazymanifest = parsers.lazymanifest
 except AttributeError:
diff --git a/mercurial/pure/bdiff.py b/mercurial/pure/bdiff.py
--- a/mercurial/pure/bdiff.py
+++ b/mercurial/pure/bdiff.py
@@ -111,8 +111,8 @@ 
         def blocks(sa, sb):
             a = ffi.new("struct bdiff_line**")
             b = ffi.new("struct bdiff_line**")
-            ac = ffi.new("char[]", sa)
-            bc = ffi.new("char[]", sb)
+            ac = ffi.new("char[]", str(sa))
+            bc = ffi.new("char[]", str(sb))
             l = ffi.new("struct bdiff_hunk*")
             try:
                 an = lib.bdiff_splitlines(ac, len(sa), a)
@@ -138,8 +138,8 @@ 
         def bdiff(sa, sb):
             a = ffi.new("struct bdiff_line**")
             b = ffi.new("struct bdiff_line**")
-            ac = ffi.new("char[]", sa)
-            bc = ffi.new("char[]", sb)
+            ac = ffi.new("char[]", str(sa))
+            bc = ffi.new("char[]", str(sb))
             l = ffi.new("struct bdiff_hunk*")
             try:
                 an = lib.bdiff_splitlines(ac, len(sa), a)