Patchwork D2948: wireproto: syntax for encoding CBOR into frames

login
register
mail settings
Submitter phabricator
Date March 28, 2018, 10:06 p.m.
Message ID <0b6e88b603f20dd4249dbd20dd716850@localhost.localdomain>
Download mbox | patch
Permalink /patch/29923/
State Not Applicable
Headers show

Comments

phabricator - March 28, 2018, 10:06 p.m.
indygreg updated this revision to Diff 7352.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D2948?vs=7327&id=7352

REVISION DETAIL
  https://phab.mercurial-scm.org/D2948

AFFECTED FILES
  mercurial/debugcommands.py
  mercurial/utils/stringutil.py
  mercurial/wireprotoframing.py
  tests/test-wireproto-serverreactor.py

CHANGE DETAILS




To: indygreg, #hg-reviewers
Cc: mercurial-devel
Josef 'Jeff' Sipek - March 29, 2018, 4:36 p.m.
On Wed, Mar 28, 2018 at 22:06:17 +0000, indygreg (Gregory Szorc) wrote:
> indygreg updated this revision to Diff 7352.
> 
> REPOSITORY
>   rHG Mercurial
> 
> CHANGES SINCE LAST UPDATE
>   https://phab.mercurial-scm.org/D2948?vs=7327&id=7352
> 
> REVISION DETAIL
>   https://phab.mercurial-scm.org/D2948
> 
> AFFECTED FILES
>   mercurial/debugcommands.py
>   mercurial/utils/stringutil.py
>   mercurial/wireprotoframing.py
>   tests/test-wireproto-serverreactor.py
> 
> CHANGE DETAILS
> 
...
> diff --git a/mercurial/wireprotoframing.py b/mercurial/wireprotoframing.py
> --- a/mercurial/wireprotoframing.py
> +++ b/mercurial/wireprotoframing.py
> @@ -16,6 +16,7 @@
>  from .i18n import _
>  from .thirdparty import (
>      attr,
> +    cbor,
>  )
>  from . import (
>      error,
> @@ -156,6 +157,9 @@
>  def makeframefromhumanstring(s):
>      """Create a frame from a human readable string
>  
> +    DANGER: NOT SAFE TO USE WITH UNTRUSTED INPUT BECAUSE OF POTENTIAL
> +    eval() USAGE. DO NOT USE IN CORE.
> +
>      Strings have the form:
>  
>          <request-id> <stream-id> <stream-flags> <type> <flags> <payload>
> @@ -169,6 +173,11 @@
>      named constant.
>  
>      Flags can be delimited by `|` to bitwise OR them together.
> +
> +    If the payload begins with ``cbor:``, the following string will be
> +    evaluated as Python code and the resulting object will be fed into
> +    a CBOR encoder. Otherwise, the payload is interpreted as a Python
> +    byte string literal.
>      """

Um, why?  Why not just *always* wrap byte strings in CBOR?  The overhead
will be 1-9 bytes depending on the length (short strings will use 1-2 bytes
of overhead), then there's no need to prefix anything with "cbor:".  Any
string <4GB in size will incur 5 bytes overhead.  Which is (IMO) acceptable.

Jeff.

>      fields = s.split(b' ', 5)
>      requestid, streamid, streamflags, frametype, frameflags, payload = fields
> @@ -196,7 +205,11 @@
>          else:
>              finalflags |= int(flag)
>  
> -    payload = stringutil.unescapestr(payload)
> +    if payload.startswith(b'cbor:'):
> +        payload = cbor.dumps(stringutil.evalpython(payload[5:]), canonical=True)
> +
> +    else:
> +        payload = stringutil.unescapestr(payload)
>  
>      return makeframe(requestid=requestid, streamid=streamid,
>                       streamflags=finalstreamflags, typeid=frametype,
> diff --git a/mercurial/utils/stringutil.py b/mercurial/utils/stringutil.py
> --- a/mercurial/utils/stringutil.py
> +++ b/mercurial/utils/stringutil.py
> @@ -9,6 +9,7 @@
>  
>  from __future__ import absolute_import
>  
> +import __future__
>  import codecs
>  import re as remod
>  import textwrap
> @@ -286,3 +287,29 @@
>      If s is not a valid boolean, returns None.
>      """
>      return _booleans.get(s.lower(), None)
> +
> +def evalpython(s):
> +    """Evaluate a string containing a Python expression.
> +
> +    THIS FUNCTION IS NOT SAFE TO USE ON UNTRUSTED INPUT. IT'S USE SHOULD BE
> +    LIMITED TO DEVELOPER-FACING FUNCTIONALITY.
> +    """
> +    globs = {
> +        r'__builtins__': {
> +            r'None': None,
> +            r'False': False,
> +            r'True': True,
> +            r'int': int,
> +            r'set': set,
> +            r'tuple': tuple,
> +            # Don't need to expose dict and list because we can use
> +            # literals.
> +        },
> +    }
> +
> +    # We can't use eval() directly because it inherits compiler
> +    # flags from this module and we need unicode literals for Python 3
> +    # compatibility.
> +    code = compile(s, r'<string>', r'eval',
> +                   __future__.unicode_literals.compiler_flag, True)
> +    return eval(code, globs, {})
> diff --git a/mercurial/debugcommands.py b/mercurial/debugcommands.py
> --- a/mercurial/debugcommands.py
> +++ b/mercurial/debugcommands.py
> @@ -2785,7 +2785,10 @@
>      or a flag name for stream flags or frame flags, respectively. Values are
>      resolved to integers and then bitwise OR'd together.
>  
> -    ``payload`` is is evaluated as a Python byte string literal.
> +    ``payload`` represents the raw frame payload. If it begins with
> +    ``cbor:``, the following string is evaluated as Python code and the
> +    resulting object is fed into a CBOR encoder. Otherwise it is interpreted
> +    as a Python byte string literal.
>      """
>      opts = pycompat.byteskwargs(opts)
>  
> 
> 
> 
> To: indygreg, #hg-reviewers
> Cc: mercurial-devel
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Gregory Szorc - March 29, 2018, 4:54 p.m.
On Thu, Mar 29, 2018 at 9:36 AM, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
wrote:

> On Wed, Mar 28, 2018 at 22:06:17 +0000, indygreg (Gregory Szorc) wrote:
> > indygreg updated this revision to Diff 7352.
> >
> > REPOSITORY
> >   rHG Mercurial
> >
> > CHANGES SINCE LAST UPDATE
> >   https://phab.mercurial-scm.org/D2948?vs=7327&id=7352
> >
> > REVISION DETAIL
> >   https://phab.mercurial-scm.org/D2948
> >
> > AFFECTED FILES
> >   mercurial/debugcommands.py
> >   mercurial/utils/stringutil.py
> >   mercurial/wireprotoframing.py
> >   tests/test-wireproto-serverreactor.py
> >
> > CHANGE DETAILS
> >
> ...
> > diff --git a/mercurial/wireprotoframing.py b/mercurial/wireprotoframing.
> py
> > --- a/mercurial/wireprotoframing.py
> > +++ b/mercurial/wireprotoframing.py
> > @@ -16,6 +16,7 @@
> >  from .i18n import _
> >  from .thirdparty import (
> >      attr,
> > +    cbor,
> >  )
> >  from . import (
> >      error,
> > @@ -156,6 +157,9 @@
> >  def makeframefromhumanstring(s):
> >      """Create a frame from a human readable string
> >
> > +    DANGER: NOT SAFE TO USE WITH UNTRUSTED INPUT BECAUSE OF POTENTIAL
> > +    eval() USAGE. DO NOT USE IN CORE.
> > +
> >      Strings have the form:
> >
> >          <request-id> <stream-id> <stream-flags> <type> <flags> <payload>
> > @@ -169,6 +173,11 @@
> >      named constant.
> >
> >      Flags can be delimited by `|` to bitwise OR them together.
> > +
> > +    If the payload begins with ``cbor:``, the following string will be
> > +    evaluated as Python code and the resulting object will be fed into
> > +    a CBOR encoder. Otherwise, the payload is interpreted as a Python
> > +    byte string literal.
> >      """
>
> Um, why?  Why not just *always* wrap byte strings in CBOR?  The overhead
> will be 1-9 bytes depending on the length (short strings will use 1-2 bytes
> of overhead), then there's no need to prefix anything with "cbor:".  Any
> string <4GB in size will incur 5 bytes overhead.  Which is (IMO)
> acceptable.
>

We may end up in that state. There was code emitting non-CBOR data and I
didn't want to scope bloat the commit.


>
> >      fields = s.split(b' ', 5)
> >      requestid, streamid, streamflags, frametype, frameflags, payload =
> fields
> > @@ -196,7 +205,11 @@
> >          else:
> >              finalflags |= int(flag)
> >
> > -    payload = stringutil.unescapestr(payload)
> > +    if payload.startswith(b'cbor:'):
> > +        payload = cbor.dumps(stringutil.evalpython(payload[5:]),
> canonical=True)
> > +
> > +    else:
> > +        payload = stringutil.unescapestr(payload)
> >
> >      return makeframe(requestid=requestid, streamid=streamid,
> >                       streamflags=finalstreamflags, typeid=frametype,
> > diff --git a/mercurial/utils/stringutil.py b/mercurial/utils/stringutil.
> py
> > --- a/mercurial/utils/stringutil.py
> > +++ b/mercurial/utils/stringutil.py
> > @@ -9,6 +9,7 @@
> >
> >  from __future__ import absolute_import
> >
> > +import __future__
> >  import codecs
> >  import re as remod
> >  import textwrap
> > @@ -286,3 +287,29 @@
> >      If s is not a valid boolean, returns None.
> >      """
> >      return _booleans.get(s.lower(), None)
> > +
> > +def evalpython(s):
> > +    """Evaluate a string containing a Python expression.
> > +
> > +    THIS FUNCTION IS NOT SAFE TO USE ON UNTRUSTED INPUT. IT'S USE
> SHOULD BE
> > +    LIMITED TO DEVELOPER-FACING FUNCTIONALITY.
> > +    """
> > +    globs = {
> > +        r'__builtins__': {
> > +            r'None': None,
> > +            r'False': False,
> > +            r'True': True,
> > +            r'int': int,
> > +            r'set': set,
> > +            r'tuple': tuple,
> > +            # Don't need to expose dict and list because we can use
> > +            # literals.
> > +        },
> > +    }
> > +
> > +    # We can't use eval() directly because it inherits compiler
> > +    # flags from this module and we need unicode literals for Python 3
> > +    # compatibility.
> > +    code = compile(s, r'<string>', r'eval',
> > +                   __future__.unicode_literals.compiler_flag, True)
> > +    return eval(code, globs, {})
> > diff --git a/mercurial/debugcommands.py b/mercurial/debugcommands.py
> > --- a/mercurial/debugcommands.py
> > +++ b/mercurial/debugcommands.py
> > @@ -2785,7 +2785,10 @@
> >      or a flag name for stream flags or frame flags, respectively.
> Values are
> >      resolved to integers and then bitwise OR'd together.
> >
> > -    ``payload`` is is evaluated as a Python byte string literal.
> > +    ``payload`` represents the raw frame payload. If it begins with
> > +    ``cbor:``, the following string is evaluated as Python code and the
> > +    resulting object is fed into a CBOR encoder. Otherwise it is
> interpreted
> > +    as a Python byte string literal.
> >      """
> >      opts = pycompat.byteskwargs(opts)
> >
> >
> >
> >
> > To: indygreg, #hg-reviewers
> > Cc: mercurial-devel
> > _______________________________________________
> > Mercurial-devel mailing list
> > Mercurial-devel@mercurial-scm.org
> > https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>
> --
> Si hoc legere scis nimium eruditionis habes.
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>

Patch

diff --git a/tests/test-wireproto-serverreactor.py b/tests/test-wireproto-serverreactor.py
--- a/tests/test-wireproto-serverreactor.py
+++ b/tests/test-wireproto-serverreactor.py
@@ -35,6 +35,59 @@ 
                       framing.createcommandframes(stream, rid, cmd, args,
                                                   datafh))
 
+class FrameHumanStringTests(unittest.TestCase):
+    def testbasic(self):
+        self.assertEqual(ffs(b'1 1 0 1 0 '),
+                         b'\x00\x00\x00\x01\x00\x01\x00\x10')
+
+        self.assertEqual(ffs(b'2 4 0 1 0 '),
+                         b'\x00\x00\x00\x02\x00\x04\x00\x10')
+
+        self.assertEqual(ffs(b'2 4 0 1 0 foo'),
+                         b'\x03\x00\x00\x02\x00\x04\x00\x10foo')
+
+    def testcborint(self):
+        self.assertEqual(ffs(b'1 1 0 1 0 cbor:15'),
+                         b'\x01\x00\x00\x01\x00\x01\x00\x10\x0f')
+
+        self.assertEqual(ffs(b'1 1 0 1 0 cbor:42'),
+                         b'\x02\x00\x00\x01\x00\x01\x00\x10\x18*')
+
+        self.assertEqual(ffs(b'1 1 0 1 0 cbor:1048576'),
+                         b'\x05\x00\x00\x01\x00\x01\x00\x10\x1a'
+                         b'\x00\x10\x00\x00')
+
+        self.assertEqual(ffs(b'1 1 0 1 0 cbor:0'),
+                         b'\x01\x00\x00\x01\x00\x01\x00\x10\x00')
+
+        self.assertEqual(ffs(b'1 1 0 1 0 cbor:-1'),
+                         b'\x01\x00\x00\x01\x00\x01\x00\x10 ')
+
+        self.assertEqual(ffs(b'1 1 0 1 0 cbor:-342542'),
+                         b'\x05\x00\x00\x01\x00\x01\x00\x10:\x00\x05:\r')
+
+    def testcborstrings(self):
+        # String literals should be unicode.
+        self.assertEqual(ffs(b"1 1 0 1 0 cbor:'foo'"),
+                         b'\x04\x00\x00\x01\x00\x01\x00\x10cfoo')
+
+        self.assertEqual(ffs(b"1 1 0 1 0 cbor:b'foo'"),
+                         b'\x04\x00\x00\x01\x00\x01\x00\x10Cfoo')
+
+        self.assertEqual(ffs(b"1 1 0 1 0 cbor:u'foo'"),
+                         b'\x04\x00\x00\x01\x00\x01\x00\x10cfoo')
+
+    def testcborlists(self):
+        self.assertEqual(ffs(b"1 1 0 1 0 cbor:[None, True, False, 42, b'foo']"),
+                         b'\n\x00\x00\x01\x00\x01\x00\x10\x85\xf6\xf5\xf4'
+                         b'\x18*Cfoo')
+
+    def testcbordicts(self):
+        self.assertEqual(ffs(b"1 1 0 1 0 "
+                             b"cbor:{b'foo': b'val1', b'bar': b'val2'}"),
+                         b'\x13\x00\x00\x01\x00\x01\x00\x10\xa2'
+                         b'CbarDval2CfooDval1')
+
 class FrameTests(unittest.TestCase):
     def testdataexactframesize(self):
         data = util.bytesio(b'x' * framing.DEFAULT_MAX_FRAME_SIZE)
diff --git a/mercurial/wireprotoframing.py b/mercurial/wireprotoframing.py
--- a/mercurial/wireprotoframing.py
+++ b/mercurial/wireprotoframing.py
@@ -16,6 +16,7 @@ 
 from .i18n import _
 from .thirdparty import (
     attr,
+    cbor,
 )
 from . import (
     error,
@@ -156,6 +157,9 @@ 
 def makeframefromhumanstring(s):
     """Create a frame from a human readable string
 
+    DANGER: NOT SAFE TO USE WITH UNTRUSTED INPUT BECAUSE OF POTENTIAL
+    eval() USAGE. DO NOT USE IN CORE.
+
     Strings have the form:
 
         <request-id> <stream-id> <stream-flags> <type> <flags> <payload>
@@ -169,6 +173,11 @@ 
     named constant.
 
     Flags can be delimited by `|` to bitwise OR them together.
+
+    If the payload begins with ``cbor:``, the following string will be
+    evaluated as Python code and the resulting object will be fed into
+    a CBOR encoder. Otherwise, the payload is interpreted as a Python
+    byte string literal.
     """
     fields = s.split(b' ', 5)
     requestid, streamid, streamflags, frametype, frameflags, payload = fields
@@ -196,7 +205,11 @@ 
         else:
             finalflags |= int(flag)
 
-    payload = stringutil.unescapestr(payload)
+    if payload.startswith(b'cbor:'):
+        payload = cbor.dumps(stringutil.evalpython(payload[5:]), canonical=True)
+
+    else:
+        payload = stringutil.unescapestr(payload)
 
     return makeframe(requestid=requestid, streamid=streamid,
                      streamflags=finalstreamflags, typeid=frametype,
diff --git a/mercurial/utils/stringutil.py b/mercurial/utils/stringutil.py
--- a/mercurial/utils/stringutil.py
+++ b/mercurial/utils/stringutil.py
@@ -9,6 +9,7 @@ 
 
 from __future__ import absolute_import
 
+import __future__
 import codecs
 import re as remod
 import textwrap
@@ -286,3 +287,29 @@ 
     If s is not a valid boolean, returns None.
     """
     return _booleans.get(s.lower(), None)
+
+def evalpython(s):
+    """Evaluate a string containing a Python expression.
+
+    THIS FUNCTION IS NOT SAFE TO USE ON UNTRUSTED INPUT. IT'S USE SHOULD BE
+    LIMITED TO DEVELOPER-FACING FUNCTIONALITY.
+    """
+    globs = {
+        r'__builtins__': {
+            r'None': None,
+            r'False': False,
+            r'True': True,
+            r'int': int,
+            r'set': set,
+            r'tuple': tuple,
+            # Don't need to expose dict and list because we can use
+            # literals.
+        },
+    }
+
+    # We can't use eval() directly because it inherits compiler
+    # flags from this module and we need unicode literals for Python 3
+    # compatibility.
+    code = compile(s, r'<string>', r'eval',
+                   __future__.unicode_literals.compiler_flag, True)
+    return eval(code, globs, {})
diff --git a/mercurial/debugcommands.py b/mercurial/debugcommands.py
--- a/mercurial/debugcommands.py
+++ b/mercurial/debugcommands.py
@@ -2785,7 +2785,10 @@ 
     or a flag name for stream flags or frame flags, respectively. Values are
     resolved to integers and then bitwise OR'd together.
 
-    ``payload`` is is evaluated as a Python byte string literal.
+    ``payload`` represents the raw frame payload. If it begins with
+    ``cbor:``, the following string is evaluated as Python code and the
+    resulting object is fed into a CBOR encoder. Otherwise it is interpreted
+    as a Python byte string literal.
     """
     opts = pycompat.byteskwargs(opts)