Patchwork [2,of,8,V7] bookmarks: introduce binary encoding

login
register
mail settings
Submitter Stanislau Hlebik
Date Nov. 2, 2016, 2:06 p.m.
Message ID <f3da841e3d47ccbb0be3.1478095604@dev1918.lla1.facebook.com>
Download mbox | patch
Permalink /patch/17278/
State Superseded
Headers show

Comments

Stanislau Hlebik - Nov. 2, 2016, 2:06 p.m.
# HG changeset patch
# User Stanislau Hlebik <stash@fb.com>
# Date 1478016027 25200
#      Tue Nov 01 09:00:27 2016 -0700
# Branch stable
# Node ID f3da841e3d47ccbb0be3892b521607c09fdeab13
# Parent  a56a624a8a42b93ec980d2c284756a38719dffe6
bookmarks: introduce binary encoding

Bookmarks binary encoding will be used for `bookmarks` bundle2 part.
The format is: <4 bytes - bookmark size, big-endian><bookmark>
               <1 byte - 0 if node is empty 1 otherwise><20 bytes node>
BookmarksEncodeError and BookmarksDecodeError maybe thrown if input is
incorrect.
Augie Fackler - Nov. 9, 2016, 5:22 p.m.
On Wed, Nov 02, 2016 at 07:06:44AM -0700, Stanislau Hlebik wrote:
> # HG changeset patch
> # User Stanislau Hlebik <stash@fb.com>
> # Date 1478016027 25200
> #      Tue Nov 01 09:00:27 2016 -0700
> # Branch stable
> # Node ID f3da841e3d47ccbb0be3892b521607c09fdeab13
> # Parent  a56a624a8a42b93ec980d2c284756a38719dffe6
> bookmarks: introduce binary encoding
>
> Bookmarks binary encoding will be used for `bookmarks` bundle2 part.
> The format is: <4 bytes - bookmark size, big-endian><bookmark>
>                <1 byte - 0 if node is empty 1 otherwise><20 bytes node>

Enough other formats have 32 bytes of space for nodes (eg revlog) that
I suspect we should do 32 bytes here too. Thoughts?

> BookmarksEncodeError and BookmarksDecodeError maybe thrown if input is
> incorrect.
>
> diff --git a/mercurial/bookmarks.py b/mercurial/bookmarks.py
> --- a/mercurial/bookmarks.py
> +++ b/mercurial/bookmarks.py
> @@ -7,8 +7,10 @@
>
>  from __future__ import absolute_import
>
> +import StringIO
>  import errno
>  import os
> +import struct
>
>  from .i18n import _
>  from .node import (
> @@ -23,6 +25,70 @@
>      util,
>  )
>
> +_NONEMPTYNODE = struct.pack('?', False)
> +_EMPTYNODE = struct.pack('?', True)
> +
> +def _unpackbookmarksize(stream):
> +    """Returns 0 if stream is empty.
> +    """
> +
> +    expectedsize = struct.calcsize('>i')
> +    encodedbookmarksize = stream.read(expectedsize)
> +    if len(encodedbookmarksize) == 0:
> +        return 0
> +    if len(encodedbookmarksize) != expectedsize:
> +        raise ValueError(
> +            _('cannot decode bookmark size: '
> +              'expected size: %d, actual size: %d') %
> +            (expectedsize, len(encodedbookmarksize)))
> +    return struct.unpack('>i', encodedbookmarksize)[0]
> +
> +def encodebookmarks(bookmarks):
> +    """Encodes bookmark to node mapping to the byte string.
> +
> +    Format: <4 bytes - bookmark size><bookmark>
> +            <1 byte - 0 if node is empty 1 otherwise><20 bytes node>
> +    Node may be 20 byte binary string, 40 byte hex string or empty
> +    """
> +
> +    for bookmark, node in bookmarks.iteritems():
> +        yield struct.pack('>i', (len(bookmark)))
> +        yield encoding.fromlocal(bookmark)
> +        if node:
> +            if len(node) != 20 and len(node) != 40:
> +                raise ValueError(_('node must be 20 or bytes long'))
> +            if len(node) == 40:
> +                node = bin(node)
> +            yield _NONEMPTYNODE
> +            yield node
> +        else:
> +            yield _EMPTYNODE
> +
> +def decodebookmarks(buf):
> +    """Decodes buffer into bookmark to node mapping.
> +
> +    Node is either 20 bytes or empty.
> +    """
> +
> +    stream = StringIO.StringIO(buf)
> +    bookmarks = {}
> +    bookmarksize = _unpackbookmarksize(stream)
> +    while bookmarksize:
> +        bookmark = stream.read(bookmarksize)
> +        if len(bookmark) != bookmarksize:
> +            raise ValueError(
> +                _('cannot decode bookmark: expected size: %d, '
> +                'actual size: %d') % (bookmarksize, len(bookmark)))
> +        bookmark = encoding.tolocal(bookmark)
> +        packedemptynodeflag = stream.read(struct.calcsize('?'))
> +        emptynode = struct.unpack('?', packedemptynodeflag)[0]
> +        node = ''
> +        if not emptynode:
> +            node = stream.read(20)
> +        bookmarks[bookmark] = node
> +        bookmarksize = _unpackbookmarksize(stream)
> +    return bookmarks
> +
>  def _getbkfile(repo):
>      """Hook so that extensions that mess with the store can hook bm storage.
>
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Stanislau Hlebik - Nov. 10, 2016, 9:24 a.m.
We can use 32 bytes, but only 20 bytes will be used.
What’s the point then?

On 11/9/16, 5:22 PM, "Augie Fackler" <raf@durin42.com> wrote:

    On Wed, Nov 02, 2016 at 07:06:44AM -0700, Stanislau Hlebik wrote:
    > # HG changeset patch

    > # User Stanislau Hlebik <stash@fb.com>

    > # Date 1478016027 25200

    > #      Tue Nov 01 09:00:27 2016 -0700

    > # Branch stable

    > # Node ID f3da841e3d47ccbb0be3892b521607c09fdeab13

    > # Parent  a56a624a8a42b93ec980d2c284756a38719dffe6

    > bookmarks: introduce binary encoding

    >

    > Bookmarks binary encoding will be used for `bookmarks` bundle2 part.

    > The format is: <4 bytes - bookmark size, big-endian><bookmark>

    >                <1 byte - 0 if node is empty 1 otherwise><20 bytes node>

    
    Enough other formats have 32 bytes of space for nodes (eg revlog) that
    I suspect we should do 32 bytes here too. Thoughts?
    
    > BookmarksEncodeError and BookmarksDecodeError maybe thrown if input is

    > incorrect.

    >

    > diff --git a/mercurial/bookmarks.py b/mercurial/bookmarks.py

    > --- a/mercurial/bookmarks.py

    > +++ b/mercurial/bookmarks.py

    > @@ -7,8 +7,10 @@

    >

    >  from __future__ import absolute_import

    >

    > +import StringIO

    >  import errno

    >  import os

    > +import struct

    >

    >  from .i18n import _

    >  from .node import (

    > @@ -23,6 +25,70 @@

    >      util,

    >  )

    >

    > +_NONEMPTYNODE = struct.pack('?', False)

    > +_EMPTYNODE = struct.pack('?', True)

    > +

    > +def _unpackbookmarksize(stream):

    > +    """Returns 0 if stream is empty.

    > +    """

    > +

    > +    expectedsize = struct.calcsize('>i')

    > +    encodedbookmarksize = stream.read(expectedsize)

    > +    if len(encodedbookmarksize) == 0:

    > +        return 0

    > +    if len(encodedbookmarksize) != expectedsize:

    > +        raise ValueError(

    > +            _('cannot decode bookmark size: '

    > +              'expected size: %d, actual size: %d') %

    > +            (expectedsize, len(encodedbookmarksize)))

    > +    return struct.unpack('>i', encodedbookmarksize)[0]

    > +

    > +def encodebookmarks(bookmarks):

    > +    """Encodes bookmark to node mapping to the byte string.

    > +

    > +    Format: <4 bytes - bookmark size><bookmark>

    > +            <1 byte - 0 if node is empty 1 otherwise><20 bytes node>

    > +    Node may be 20 byte binary string, 40 byte hex string or empty

    > +    """

    > +

    > +    for bookmark, node in bookmarks.iteritems():

    > +        yield struct.pack('>i', (len(bookmark)))

    > +        yield encoding.fromlocal(bookmark)

    > +        if node:

    > +            if len(node) != 20 and len(node) != 40:

    > +                raise ValueError(_('node must be 20 or bytes long'))

    > +            if len(node) == 40:

    > +                node = bin(node)

    > +            yield _NONEMPTYNODE

    > +            yield node

    > +        else:

    > +            yield _EMPTYNODE

    > +

    > +def decodebookmarks(buf):

    > +    """Decodes buffer into bookmark to node mapping.

    > +

    > +    Node is either 20 bytes or empty.

    > +    """

    > +

    > +    stream = StringIO.StringIO(buf)

    > +    bookmarks = {}

    > +    bookmarksize = _unpackbookmarksize(stream)

    > +    while bookmarksize:

    > +        bookmark = stream.read(bookmarksize)

    > +        if len(bookmark) != bookmarksize:

    > +            raise ValueError(

    > +                _('cannot decode bookmark: expected size: %d, '

    > +                'actual size: %d') % (bookmarksize, len(bookmark)))

    > +        bookmark = encoding.tolocal(bookmark)

    > +        packedemptynodeflag = stream.read(struct.calcsize('?'))

    > +        emptynode = struct.unpack('?', packedemptynodeflag)[0]

    > +        node = ''

    > +        if not emptynode:

    > +            node = stream.read(20)

    > +        bookmarks[bookmark] = node

    > +        bookmarksize = _unpackbookmarksize(stream)

    > +    return bookmarks

    > +

    >  def _getbkfile(repo):

    >      """Hook so that extensions that mess with the store can hook bm storage.

    >

    > _______________________________________________

    > Mercurial-devel mailing list

    > Mercurial-devel@mercurial-scm.org

    > https://urldefense.proofpoint.com/v2/url?u=https-3A__www.mercurial-2Dscm.org_mailman_listinfo_mercurial-2Ddevel&d=DQIBAg&c=5VD0RTtNlTh3ycd41b3MUw&r=1EQ58Dmb5uX1qHujcsT1Mg&m=XfOL5bchNzFkHn3jvJndGgHchtnR08c9-WuY56aaUhw&s=xzxL1iYhrBKwmRk52RxWVbj-I3f4dfTiKl4dd9tjkOQ&e=
Augie Fackler - Nov. 10, 2016, 3:54 p.m.
On Thu, Nov 10, 2016 at 09:24:56AM +0000, Stanislau Hlebik wrote:
> We can use 32 bytes, but only 20 bytes will be used.
> What’s the point then?

Because some day sha3 or sha256 will be used.

>
> On 11/9/16, 5:22 PM, "Augie Fackler" <raf@durin42.com> wrote:
>
>     On Wed, Nov 02, 2016 at 07:06:44AM -0700, Stanislau Hlebik wrote:
>     > # HG changeset patch
>     > # User Stanislau Hlebik <stash@fb.com>
>     > # Date 1478016027 25200
>     > #      Tue Nov 01 09:00:27 2016 -0700
>     > # Branch stable
>     > # Node ID f3da841e3d47ccbb0be3892b521607c09fdeab13
>     > # Parent  a56a624a8a42b93ec980d2c284756a38719dffe6
>     > bookmarks: introduce binary encoding
>     >
>     > Bookmarks binary encoding will be used for `bookmarks` bundle2 part.
>     > The format is: <4 bytes - bookmark size, big-endian><bookmark>
>     >                <1 byte - 0 if node is empty 1 otherwise><20 bytes node>
>
>     Enough other formats have 32 bytes of space for nodes (eg revlog) that
>     I suspect we should do 32 bytes here too. Thoughts?
>
>     > BookmarksEncodeError and BookmarksDecodeError maybe thrown if input is
>     > incorrect.
>     >
>     > diff --git a/mercurial/bookmarks.py b/mercurial/bookmarks.py
>     > --- a/mercurial/bookmarks.py
>     > +++ b/mercurial/bookmarks.py
>     > @@ -7,8 +7,10 @@
>     >
>     >  from __future__ import absolute_import
>     >
>     > +import StringIO
>     >  import errno
>     >  import os
>     > +import struct
>     >
>     >  from .i18n import _
>     >  from .node import (
>     > @@ -23,6 +25,70 @@
>     >      util,
>     >  )
>     >
>     > +_NONEMPTYNODE = struct.pack('?', False)
>     > +_EMPTYNODE = struct.pack('?', True)
>     > +
>     > +def _unpackbookmarksize(stream):
>     > +    """Returns 0 if stream is empty.
>     > +    """
>     > +
>     > +    expectedsize = struct.calcsize('>i')
>     > +    encodedbookmarksize = stream.read(expectedsize)
>     > +    if len(encodedbookmarksize) == 0:
>     > +        return 0
>     > +    if len(encodedbookmarksize) != expectedsize:
>     > +        raise ValueError(
>     > +            _('cannot decode bookmark size: '
>     > +              'expected size: %d, actual size: %d') %
>     > +            (expectedsize, len(encodedbookmarksize)))
>     > +    return struct.unpack('>i', encodedbookmarksize)[0]
>     > +
>     > +def encodebookmarks(bookmarks):
>     > +    """Encodes bookmark to node mapping to the byte string.
>     > +
>     > +    Format: <4 bytes - bookmark size><bookmark>
>     > +            <1 byte - 0 if node is empty 1 otherwise><20 bytes node>
>     > +    Node may be 20 byte binary string, 40 byte hex string or empty
>     > +    """
>     > +
>     > +    for bookmark, node in bookmarks.iteritems():
>     > +        yield struct.pack('>i', (len(bookmark)))
>     > +        yield encoding.fromlocal(bookmark)
>     > +        if node:
>     > +            if len(node) != 20 and len(node) != 40:
>     > +                raise ValueError(_('node must be 20 or bytes long'))
>     > +            if len(node) == 40:
>     > +                node = bin(node)
>     > +            yield _NONEMPTYNODE
>     > +            yield node
>     > +        else:
>     > +            yield _EMPTYNODE
>     > +
>     > +def decodebookmarks(buf):
>     > +    """Decodes buffer into bookmark to node mapping.
>     > +
>     > +    Node is either 20 bytes or empty.
>     > +    """
>     > +
>     > +    stream = StringIO.StringIO(buf)
>     > +    bookmarks = {}
>     > +    bookmarksize = _unpackbookmarksize(stream)
>     > +    while bookmarksize:
>     > +        bookmark = stream.read(bookmarksize)
>     > +        if len(bookmark) != bookmarksize:
>     > +            raise ValueError(
>     > +                _('cannot decode bookmark: expected size: %d, '
>     > +                'actual size: %d') % (bookmarksize, len(bookmark)))
>     > +        bookmark = encoding.tolocal(bookmark)
>     > +        packedemptynodeflag = stream.read(struct.calcsize('?'))
>     > +        emptynode = struct.unpack('?', packedemptynodeflag)[0]
>     > +        node = ''
>     > +        if not emptynode:
>     > +            node = stream.read(20)
>     > +        bookmarks[bookmark] = node
>     > +        bookmarksize = _unpackbookmarksize(stream)
>     > +    return bookmarks
>     > +
>     >  def _getbkfile(repo):
>     >      """Hook so that extensions that mess with the store can hook bm storage.
>     >
>     > _______________________________________________
>     > Mercurial-devel mailing list
>     > Mercurial-devel@mercurial-scm.org
>     > https://urldefense.proofpoint.com/v2/url?u=https-3A__www.mercurial-2Dscm.org_mailman_listinfo_mercurial-2Ddevel&d=DQIBAg&c=5VD0RTtNlTh3ycd41b3MUw&r=1EQ58Dmb5uX1qHujcsT1Mg&m=XfOL5bchNzFkHn3jvJndGgHchtnR08c9-WuY56aaUhw&s=xzxL1iYhrBKwmRk52RxWVbj-I3f4dfTiKl4dd9tjkOQ&e=
>
>
Pierre-Yves David - Nov. 10, 2016, 5:24 p.m.
On 11/10/2016 03:54 PM, Augie Fackler wrote:
> On Thu, Nov 10, 2016 at 09:24:56AM +0000, Stanislau Hlebik wrote:
>> We can use 32 bytes, but only 20 bytes will be used.
>> What’s the point then?
>
> Because some day sha3 or sha256 will be used.

bundle2 part have parameter, we can use a one of them to inform the 
handler of the hash type when it change in the future.

[not I have not looked at the code of this series yet]

>> On 11/9/16, 5:22 PM, "Augie Fackler" <raf@durin42.com> wrote:
>>
>>     On Wed, Nov 02, 2016 at 07:06:44AM -0700, Stanislau Hlebik wrote:
>>     > # HG changeset patch
>>     > # User Stanislau Hlebik <stash@fb.com>
>>     > # Date 1478016027 25200
>>     > #      Tue Nov 01 09:00:27 2016 -0700
>>     > # Branch stable
>>     > # Node ID f3da841e3d47ccbb0be3892b521607c09fdeab13
>>     > # Parent  a56a624a8a42b93ec980d2c284756a38719dffe6
>>     > bookmarks: introduce binary encoding
>>     >
>>     > Bookmarks binary encoding will be used for `bookmarks` bundle2 part.
>>     > The format is: <4 bytes - bookmark size, big-endian><bookmark>
>>     >                <1 byte - 0 if node is empty 1 otherwise><20 bytes node>
>>
>>     Enough other formats have 32 bytes of space for nodes (eg revlog) that
>>     I suspect we should do 32 bytes here too. Thoughts?
>>
>>     > BookmarksEncodeError and BookmarksDecodeError maybe thrown if input is
>>     > incorrect.
>>     >
>>     > diff --git a/mercurial/bookmarks.py b/mercurial/bookmarks.py
>>     > --- a/mercurial/bookmarks.py
>>     > +++ b/mercurial/bookmarks.py
>>     > @@ -7,8 +7,10 @@
>>     >
>>     >  from __future__ import absolute_import
>>     >
>>     > +import StringIO
>>     >  import errno
>>     >  import os
>>     > +import struct
>>     >
>>     >  from .i18n import _
>>     >  from .node import (
>>     > @@ -23,6 +25,70 @@
>>     >      util,
>>     >  )
>>     >
>>     > +_NONEMPTYNODE = struct.pack('?', False)
>>     > +_EMPTYNODE = struct.pack('?', True)
>>     > +
>>     > +def _unpackbookmarksize(stream):
>>     > +    """Returns 0 if stream is empty.
>>     > +    """
>>     > +
>>     > +    expectedsize = struct.calcsize('>i')
>>     > +    encodedbookmarksize = stream.read(expectedsize)
>>     > +    if len(encodedbookmarksize) == 0:
>>     > +        return 0
>>     > +    if len(encodedbookmarksize) != expectedsize:
>>     > +        raise ValueError(
>>     > +            _('cannot decode bookmark size: '
>>     > +              'expected size: %d, actual size: %d') %
>>     > +            (expectedsize, len(encodedbookmarksize)))
>>     > +    return struct.unpack('>i', encodedbookmarksize)[0]
>>     > +
>>     > +def encodebookmarks(bookmarks):
>>     > +    """Encodes bookmark to node mapping to the byte string.
>>     > +
>>     > +    Format: <4 bytes - bookmark size><bookmark>
>>     > +            <1 byte - 0 if node is empty 1 otherwise><20 bytes node>
>>     > +    Node may be 20 byte binary string, 40 byte hex string or empty
>>     > +    """
>>     > +
>>     > +    for bookmark, node in bookmarks.iteritems():
>>     > +        yield struct.pack('>i', (len(bookmark)))
>>     > +        yield encoding.fromlocal(bookmark)
>>     > +        if node:
>>     > +            if len(node) != 20 and len(node) != 40:
>>     > +                raise ValueError(_('node must be 20 or bytes long'))
>>     > +            if len(node) == 40:
>>     > +                node = bin(node)
>>     > +            yield _NONEMPTYNODE
>>     > +            yield node
>>     > +        else:
>>     > +            yield _EMPTYNODE
>>     > +
>>     > +def decodebookmarks(buf):
>>     > +    """Decodes buffer into bookmark to node mapping.
>>     > +
>>     > +    Node is either 20 bytes or empty.
>>     > +    """
>>     > +
>>     > +    stream = StringIO.StringIO(buf)
>>     > +    bookmarks = {}
>>     > +    bookmarksize = _unpackbookmarksize(stream)
>>     > +    while bookmarksize:
>>     > +        bookmark = stream.read(bookmarksize)
>>     > +        if len(bookmark) != bookmarksize:
>>     > +            raise ValueError(
>>     > +                _('cannot decode bookmark: expected size: %d, '
>>     > +                'actual size: %d') % (bookmarksize, len(bookmark)))
>>     > +        bookmark = encoding.tolocal(bookmark)
>>     > +        packedemptynodeflag = stream.read(struct.calcsize('?'))
>>     > +        emptynode = struct.unpack('?', packedemptynodeflag)[0]
>>     > +        node = ''
>>     > +        if not emptynode:
>>     > +            node = stream.read(20)
>>     > +        bookmarks[bookmark] = node
>>     > +        bookmarksize = _unpackbookmarksize(stream)
>>     > +    return bookmarks
>>     > +
>>     >  def _getbkfile(repo):
>>     >      """Hook so that extensions that mess with the store can hook bm storage.
Gregory Szorc - Nov. 11, 2016, 4:59 a.m.
On Wed, Nov 9, 2016 at 9:22 AM, Augie Fackler <raf@durin42.com> wrote:

> On Wed, Nov 02, 2016 at 07:06:44AM -0700, Stanislau Hlebik wrote:
> > # HG changeset patch
> > # User Stanislau Hlebik <stash@fb.com>
> > # Date 1478016027 25200
> > #      Tue Nov 01 09:00:27 2016 -0700
> > # Branch stable
> > # Node ID f3da841e3d47ccbb0be3892b521607c09fdeab13
> > # Parent  a56a624a8a42b93ec980d2c284756a38719dffe6
> > bookmarks: introduce binary encoding
> >
> > Bookmarks binary encoding will be used for `bookmarks` bundle2 part.
> > The format is: <4 bytes - bookmark size, big-endian><bookmark>
> >                <1 byte - 0 if node is empty 1 otherwise><20 bytes node>
>
> Enough other formats have 32 bytes of space for nodes (eg revlog) that
> I suspect we should do 32 bytes here too. Thoughts?
>

The wire protocol today nearly universally assumes a binary node is 20
bytes. I want 32 byte hashes as bad as the next person. But until we have
actual patches working towards that goal, I'm inclined to call YAGNI and
say 20 bytes is acceptable. As Pierre-Yves says, we can use a bundle part
parameter (or introduce a new bundle part) to handle >20 bytes nodes when
needed.


>
> > BookmarksEncodeError and BookmarksDecodeError maybe thrown if input is
> > incorrect.
> >
> > diff --git a/mercurial/bookmarks.py b/mercurial/bookmarks.py
> > --- a/mercurial/bookmarks.py
> > +++ b/mercurial/bookmarks.py
> > @@ -7,8 +7,10 @@
> >
> >  from __future__ import absolute_import
> >
> > +import StringIO
> >  import errno
> >  import os
> > +import struct
> >
> >  from .i18n import _
> >  from .node import (
> > @@ -23,6 +25,70 @@
> >      util,
> >  )
> >
> > +_NONEMPTYNODE = struct.pack('?', False)
> > +_EMPTYNODE = struct.pack('?', True)
> > +
> > +def _unpackbookmarksize(stream):
> > +    """Returns 0 if stream is empty.
> > +    """
> > +
> > +    expectedsize = struct.calcsize('>i')
> > +    encodedbookmarksize = stream.read(expectedsize)
> > +    if len(encodedbookmarksize) == 0:
> > +        return 0
> > +    if len(encodedbookmarksize) != expectedsize:
> > +        raise ValueError(
> > +            _('cannot decode bookmark size: '
> > +              'expected size: %d, actual size: %d') %
> > +            (expectedsize, len(encodedbookmarksize)))
> > +    return struct.unpack('>i', encodedbookmarksize)[0]
> > +
> > +def encodebookmarks(bookmarks):
> > +    """Encodes bookmark to node mapping to the byte string.
> > +
> > +    Format: <4 bytes - bookmark size><bookmark>
> > +            <1 byte - 0 if node is empty 1 otherwise><20 bytes node>
> > +    Node may be 20 byte binary string, 40 byte hex string or empty
> > +    """
> > +
> > +    for bookmark, node in bookmarks.iteritems():
> > +        yield struct.pack('>i', (len(bookmark)))
> > +        yield encoding.fromlocal(bookmark)
> > +        if node:
> > +            if len(node) != 20 and len(node) != 40:
> > +                raise ValueError(_('node must be 20 or bytes long'))
> > +            if len(node) == 40:
> > +                node = bin(node)
> > +            yield _NONEMPTYNODE
> > +            yield node
> > +        else:
> > +            yield _EMPTYNODE
> > +
> > +def decodebookmarks(buf):
> > +    """Decodes buffer into bookmark to node mapping.
> > +
> > +    Node is either 20 bytes or empty.
> > +    """
> > +
> > +    stream = StringIO.StringIO(buf)
> > +    bookmarks = {}
> > +    bookmarksize = _unpackbookmarksize(stream)
> > +    while bookmarksize:
> > +        bookmark = stream.read(bookmarksize)
> > +        if len(bookmark) != bookmarksize:
> > +            raise ValueError(
> > +                _('cannot decode bookmark: expected size: %d, '
> > +                'actual size: %d') % (bookmarksize, len(bookmark)))
> > +        bookmark = encoding.tolocal(bookmark)
> > +        packedemptynodeflag = stream.read(struct.calcsize('?'))
> > +        emptynode = struct.unpack('?', packedemptynodeflag)[0]
> > +        node = ''
> > +        if not emptynode:
> > +            node = stream.read(20)
> > +        bookmarks[bookmark] = node
> > +        bookmarksize = _unpackbookmarksize(stream)
> > +    return bookmarks
> > +
> >  def _getbkfile(repo):
> >      """Hook so that extensions that mess with the store can hook bm
> storage.
> >
> > _______________________________________________
> > Mercurial-devel mailing list
> > Mercurial-devel@mercurial-scm.org
> > https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>
Gregory Szorc - Nov. 11, 2016, 5:20 a.m.
On Wed, Nov 2, 2016 at 7:06 AM, Stanislau Hlebik <stash@fb.com> wrote:

> # HG changeset patch
> # User Stanislau Hlebik <stash@fb.com>
> # Date 1478016027 25200
> #      Tue Nov 01 09:00:27 2016 -0700
> # Branch stable
> # Node ID f3da841e3d47ccbb0be3892b521607c09fdeab13
> # Parent  a56a624a8a42b93ec980d2c284756a38719dffe6
> bookmarks: introduce binary encoding
>
> Bookmarks binary encoding will be used for `bookmarks` bundle2 part.
> The format is: <4 bytes - bookmark size, big-endian><bookmark>
>                <1 byte - 0 if node is empty 1 otherwise><20 bytes node>
> BookmarksEncodeError and BookmarksDecodeError maybe thrown if input is
> incorrect.
>
> diff --git a/mercurial/bookmarks.py b/mercurial/bookmarks.py
> --- a/mercurial/bookmarks.py
> +++ b/mercurial/bookmarks.py
> @@ -7,8 +7,10 @@
>
>  from __future__ import absolute_import
>
> +import StringIO
>

This module isn't available on Python 3. Use util.stringio or (preferably)
io.BytesIO (since it assumes it is operating on bytes and raises if it sees
unicode types).


>  import errno
>  import os
> +import struct
>
>  from .i18n import _
>  from .node import (
> @@ -23,6 +25,70 @@
>      util,
>  )
>
> +_NONEMPTYNODE = struct.pack('?', False)
> +_EMPTYNODE = struct.pack('?', True)
>

TIL "?" is a valid format character for struct!


> +
> +def _unpackbookmarksize(stream):
> +    """Returns 0 if stream is empty.
> +    """
> +
> +    expectedsize = struct.calcsize('>i')
> +    encodedbookmarksize = stream.read(expectedsize)
> +    if len(encodedbookmarksize) == 0:
> +        return 0
>

if not encodedbookmarksize:


> +    if len(encodedbookmarksize) != expectedsize:
> +        raise ValueError(
> +            _('cannot decode bookmark size: '
> +              'expected size: %d, actual size: %d') %
> +            (expectedsize, len(encodedbookmarksize)))
> +    return struct.unpack('>i', encodedbookmarksize)[0]
> +
> +def encodebookmarks(bookmarks):
> +    """Encodes bookmark to node mapping to the byte string.
> +
> +    Format: <4 bytes - bookmark size><bookmark>
> +            <1 byte - 0 if node is empty 1 otherwise><20 bytes node>
> +    Node may be 20 byte binary string, 40 byte hex string or empty
> +    """
>

I could make the argument for grouping the data by all sizes+nodes then all
the names. This allows nice byte aligned packing and unpacking and some
opportunities for indexing. It may also yield slightly better compression
as the bookmark names will all be next to each other (although you'd have
to measure that because compression window sizes may save us). But I'm
having difficulty convincing myself it is worth it. Of course, having this
data layout would also prohibit streaming. I think it is worth mentioning
the alternative but probably not worth switching.


> +
> +    for bookmark, node in bookmarks.iteritems():
>

You'll want to use a sorted() here or eventual tests for multiple bookmarks
will be unstable.


> +        yield struct.pack('>i', (len(bookmark)))
> +        yield encoding.fromlocal(bookmark)
> +        if node:
> +            if len(node) != 20 and len(node) != 40:
> +                raise ValueError(_('node must be 20 or bytes long'))
> +            if len(node) == 40:
> +                node = bin(node)
>

I think the API should accept a single node type: the 20 byte binary node.


> +            yield _NONEMPTYNODE
> +            yield node
> +        else:
> +            yield _EMPTYNODE
>

I'm on the fence regarding this single byte flag. On one hand, being
explicit is nice. On the other, a sequence of \x00 identifies the null or
empty node in so many other places. What's your reasoning here?


> +
> +def decodebookmarks(buf):
> +    """Decodes buffer into bookmark to node mapping.
> +
> +    Node is either 20 bytes or empty.
> +    """
> +
> +    stream = StringIO.StringIO(buf)
> +    bookmarks = {}
> +    bookmarksize = _unpackbookmarksize(stream)
> +    while bookmarksize:
> +        bookmark = stream.read(bookmarksize)
> +        if len(bookmark) != bookmarksize:
> +            raise ValueError(
> +                _('cannot decode bookmark: expected size: %d, '
> +                'actual size: %d') % (bookmarksize, len(bookmark)))
> +        bookmark = encoding.tolocal(bookmark)
> +        packedemptynodeflag = stream.read(struct.calcsize('?'))
> +        emptynode = struct.unpack('?', packedemptynodeflag)[0]
>

Yes, `struct` does cache things. But I don't like relying on it inside a
loop. Please create a `struct.Struct` instance outside the loop.


> +        node = ''
> +        if not emptynode:
> +            node = stream.read(20)
> +        bookmarks[bookmark] = node
>

Alternatively, we could yield key, value pairs. That would facilitate
streaming and allow the part to adapt to storing multiple values for the
same bookmark. (YAGNI applies so if multiple nodes per bookmark isn't
something you think we would need in the immediate future, don't bother.)
New bundle parts aren't that hard to invent.


> +        bookmarksize = _unpackbookmarksize(stream)
> +    return bookmarks
> +
>  def _getbkfile(repo):
>      """Hook so that extensions that mess with the store can hook bm
> storage.
>
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>

Patch

diff --git a/mercurial/bookmarks.py b/mercurial/bookmarks.py
--- a/mercurial/bookmarks.py
+++ b/mercurial/bookmarks.py
@@ -7,8 +7,10 @@ 
 
 from __future__ import absolute_import
 
+import StringIO
 import errno
 import os
+import struct
 
 from .i18n import _
 from .node import (
@@ -23,6 +25,70 @@ 
     util,
 )
 
+_NONEMPTYNODE = struct.pack('?', False)
+_EMPTYNODE = struct.pack('?', True)
+
+def _unpackbookmarksize(stream):
+    """Returns 0 if stream is empty.
+    """
+
+    expectedsize = struct.calcsize('>i')
+    encodedbookmarksize = stream.read(expectedsize)
+    if len(encodedbookmarksize) == 0:
+        return 0
+    if len(encodedbookmarksize) != expectedsize:
+        raise ValueError(
+            _('cannot decode bookmark size: '
+              'expected size: %d, actual size: %d') %
+            (expectedsize, len(encodedbookmarksize)))
+    return struct.unpack('>i', encodedbookmarksize)[0]
+
+def encodebookmarks(bookmarks):
+    """Encodes bookmark to node mapping to the byte string.
+
+    Format: <4 bytes - bookmark size><bookmark>
+            <1 byte - 0 if node is empty 1 otherwise><20 bytes node>
+    Node may be 20 byte binary string, 40 byte hex string or empty
+    """
+
+    for bookmark, node in bookmarks.iteritems():
+        yield struct.pack('>i', (len(bookmark)))
+        yield encoding.fromlocal(bookmark)
+        if node:
+            if len(node) != 20 and len(node) != 40:
+                raise ValueError(_('node must be 20 or bytes long'))
+            if len(node) == 40:
+                node = bin(node)
+            yield _NONEMPTYNODE
+            yield node
+        else:
+            yield _EMPTYNODE
+
+def decodebookmarks(buf):
+    """Decodes buffer into bookmark to node mapping.
+
+    Node is either 20 bytes or empty.
+    """
+
+    stream = StringIO.StringIO(buf)
+    bookmarks = {}
+    bookmarksize = _unpackbookmarksize(stream)
+    while bookmarksize:
+        bookmark = stream.read(bookmarksize)
+        if len(bookmark) != bookmarksize:
+            raise ValueError(
+                _('cannot decode bookmark: expected size: %d, '
+                'actual size: %d') % (bookmarksize, len(bookmark)))
+        bookmark = encoding.tolocal(bookmark)
+        packedemptynodeflag = stream.read(struct.calcsize('?'))
+        emptynode = struct.unpack('?', packedemptynodeflag)[0]
+        node = ''
+        if not emptynode:
+            node = stream.read(20)
+        bookmarks[bookmark] = node
+        bookmarksize = _unpackbookmarksize(stream)
+    return bookmarks
+
 def _getbkfile(repo):
     """Hook so that extensions that mess with the store can hook bm storage.