Patchwork [2,of,4] pycompat: add bytestr wrapper which mostly acts as a Python 2 str

login
register
mail settings
Submitter Yuya Nishihara
Date March 16, 2017, 3:31 p.m.
Message ID <aae1bd4b8a03ffe6cb47.1489678271@mimosa>
Download mbox | patch
Permalink /patch/19394/
State Accepted
Headers show

Comments

Yuya Nishihara - March 16, 2017, 3:31 p.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1488980906 -32400
#      Wed Mar 08 22:48:26 2017 +0900
# Node ID aae1bd4b8a03ffe6cb47c23da55b2f70ff5607ef
# Parent  f39291c08d86082579b891e844cbd56752b44a59
pycompat: add bytestr wrapper which mostly acts as a Python 2 str

This allows us to handle bytes in mostly the same manner as Python 2 str,
so we can get rid of ugly s[i:i + 1] hacks:

  s = bytestr(s)
  while i < len(s):
      c = s[i]
      ...

This is a simpler version of the previous RFC patch, which tried to preserve
the bytestr type where possible. The new version simply drops the bytestr
wrapping, so we aren't likely to pass a bytestr to a function that expects
Python 3 bytes.
via Mercurial-devel - March 16, 2017, 8:47 p.m.
On Thu, Mar 16, 2017 at 8:31 AM, Yuya Nishihara <yuya@tcha.org> wrote:
> # HG changeset patch
> # User Yuya Nishihara <yuya@tcha.org>
> # Date 1488980906 -32400
> #      Wed Mar 08 22:48:26 2017 +0900
> # Node ID aae1bd4b8a03ffe6cb47c23da55b2f70ff5607ef
> # Parent  f39291c08d86082579b891e844cbd56752b44a59
> pycompat: add bytestr wrapper which mostly acts as a Python 2 str
>
> This allows us to handle bytes in mostly the same manner as Python 2 str,
> so we can get rid of ugly s[i:i + 1] hacks:
>
>   s = bytestr(s)
>   while i < len(s):
>       c = s[i]
>       ...
>
> This is the simpler version of the previous RFC patch which tried to preserve
> the bytestr type if possible. New version simply drops the bytestr wrapping
> so we aren't likely to pass a bytestr to a function that expects Python 3
> bytes.
>
> diff --git a/mercurial/pycompat.py b/mercurial/pycompat.py
> --- a/mercurial/pycompat.py
> +++ b/mercurial/pycompat.py
> @@ -76,6 +76,67 @@ if ispy3:
>
>      bytechr = struct.Struct('>B').pack
>
> +    class bytestr(bytes):
> +        """A bytes which mostly acts as a Python 2 str
> +
> +        >>> bytestr(), bytestr(bytearray(b'foo')), bytestr(u'ascii'), bytestr(1)
> +        (b'', b'foo', b'ascii', b'1')
> +        >>> s = bytestr(b'foo')
> +        >>> assert s is bytestr(s)
> +
> +        There's no implicit conversion from non-ascii str as its encoding is
> +        unknown:
> +
> +        >>> bytestr(chr(0x80)) # doctest: +ELLIPSIS
> +        Traceback (most recent call last):
> +          ...
> +        UnicodeEncodeError: ...
> +
> +        Comparison between bytestr and bytes should work:
> +
> +        >>> assert bytestr(b'foo') == b'foo'
> +        >>> assert b'foo' == bytestr(b'foo')
> +        >>> assert b'f' in bytestr(b'foo')
> +        >>> assert bytestr(b'f') in b'foo'
> +
> +        Sliced elements should be bytes, not integer:
> +
> +        >>> s[1], s[:2]
> +        (b'o', b'fo')
> +        >>> list(s), list(reversed(s))
> +        ([b'f', b'o', b'o'], [b'o', b'o', b'f'])
> +
> +        As bytestr type isn't propagated across operations, you need to cast
> +        bytes to bytestr explicitly:
> +
> +        >>> s = bytestr(b'foo').upper()
> +        >>> t = bytestr(s)
> +        >>> s[0], t[0]
> +        (70, b'F')
> +
> +        Be careful to not pass a bytestr object to a function which expects
> +        bytearray-like behavior.
> +
> +        >>> t = bytes(t)  # cast to bytes
> +        >>> assert type(t) is bytes
> +        """
> +
> +        def __new__(cls, s=b''):
> +            if isinstance(s, bytestr):
> +                return s
> +            if not isinstance(s, (bytes, bytearray)):
> +                s = str(s).encode(u'ascii')
> +            return bytes.__new__(cls, s)
> +
> +        def __getitem__(self, key):
> +            s = bytes.__getitem__(self, key)
> +            if not isinstance(s, bytes):
> +                s = bytechr(s)
> +            return s
> +
> +        def __iter__(self):
> +            return iterbytestr(bytes.__iter__(self))
> +
>      def iterbytestr(s):
>          """Iterate bytes as if it were a str object of Python 2"""
>          return map(bytechr, s)
> @@ -146,6 +207,7 @@ else:
>      import cStringIO
>
>      bytechr = chr
> +    bytestr = str

Could this even be 'identity' (which apparently doesn't exist, so
would be 'lambda s: s' or something)? Will str(a) create a copy of the
underlying buffer too? I think I know that strings are immutable, so
it shouldn't have to?


>      iterbytestr = iter
>
>      def sysstr(s):
> diff --git a/tests/test-doctest.py b/tests/test-doctest.py
> --- a/tests/test-doctest.py
> +++ b/tests/test-doctest.py
> @@ -34,6 +34,7 @@ testmod('mercurial.minirst')
>  testmod('mercurial.patch')
>  testmod('mercurial.pathutil')
>  testmod('mercurial.parser')
> +testmod('mercurial.pycompat', py3=True)
>  testmod('mercurial.revsetlang')
>  testmod('mercurial.smartset')
>  testmod('mercurial.store')
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
via Mercurial-devel - March 16, 2017, 8:58 p.m.
On Thu, Mar 16, 2017 at 1:47 PM, Martin von Zweigbergk
<martinvonz@google.com> wrote:
> On Thu, Mar 16, 2017 at 8:31 AM, Yuya Nishihara <yuya@tcha.org> wrote:
>> # HG changeset patch
>> # User Yuya Nishihara <yuya@tcha.org>
>> # Date 1488980906 -32400
>> #      Wed Mar 08 22:48:26 2017 +0900
>> # Node ID aae1bd4b8a03ffe6cb47c23da55b2f70ff5607ef
>> # Parent  f39291c08d86082579b891e844cbd56752b44a59
>> pycompat: add bytestr wrapper which mostly acts as a Python 2 str
>>
>> This allows us to handle bytes in mostly the same manner as Python 2 str,
>> so we can get rid of ugly s[i:i + 1] hacks:
>>
>>   s = bytestr(s)
>>   while i < len(s):
>>       c = s[i]
>>       ...
>>
>> This is the simpler version of the previous RFC patch which tried to preserve
>> the bytestr type if possible. New version simply drops the bytestr wrapping
>> so we aren't likely to pass a bytestr to a function that expects Python 3
>> bytes.
>>
>> diff --git a/mercurial/pycompat.py b/mercurial/pycompat.py
>> --- a/mercurial/pycompat.py
>> +++ b/mercurial/pycompat.py
>> @@ -76,6 +76,67 @@ if ispy3:
>>
>>      bytechr = struct.Struct('>B').pack
>>
>> +    class bytestr(bytes):
>> +        """A bytes which mostly acts as a Python 2 str
>> +
>> +        >>> bytestr(), bytestr(bytearray(b'foo')), bytestr(u'ascii'), bytestr(1)
>> +        (b'', b'foo', b'ascii', b'1')
>> +        >>> s = bytestr(b'foo')
>> +        >>> assert s is bytestr(s)
>> +
>> +        There's no implicit conversion from non-ascii str as its encoding is
>> +        unknown:
>> +
>> +        >>> bytestr(chr(0x80)) # doctest: +ELLIPSIS
>> +        Traceback (most recent call last):
>> +          ...
>> +        UnicodeEncodeError: ...
>> +
>> +        Comparison between bytestr and bytes should work:
>> +
>> +        >>> assert bytestr(b'foo') == b'foo'
>> +        >>> assert b'foo' == bytestr(b'foo')
>> +        >>> assert b'f' in bytestr(b'foo')
>> +        >>> assert bytestr(b'f') in b'foo'
>> +
>> +        Sliced elements should be bytes, not integer:
>> +
>> +        >>> s[1], s[:2]
>> +        (b'o', b'fo')
>> +        >>> list(s), list(reversed(s))
>> +        ([b'f', b'o', b'o'], [b'o', b'o', b'f'])
>> +
>> +        As bytestr type isn't propagated across operations, you need to cast
>> +        bytes to bytestr explicitly:
>> +
>> +        >>> s = bytestr(b'foo').upper()
>> +        >>> t = bytestr(s)
>> +        >>> s[0], t[0]
>> +        (70, b'F')
>> +
>> +        Be careful to not pass a bytestr object to a function which expects
>> +        bytearray-like behavior.
>> +
>> +        >>> t = bytes(t)  # cast to bytes
>> +        >>> assert type(t) is bytes
>> +        """
>> +
>> +        def __new__(cls, s=b''):
>> +            if isinstance(s, bytestr):
>> +                return s
>> +            if not isinstance(s, (bytes, bytearray)):
>> +                s = str(s).encode(u'ascii')
>> +            return bytes.__new__(cls, s)
>> +
>> +        def __getitem__(self, key):
>> +            s = bytes.__getitem__(self, key)
>> +            if not isinstance(s, bytes):
>> +                s = bytechr(s)
>> +            return s
>> +
>> +        def __iter__(self):
>> +            return iterbytestr(bytes.__iter__(self))
>> +
>>      def iterbytestr(s):
>>          """Iterate bytes as if it were a str object of Python 2"""
>>          return map(bytechr, s)
>> @@ -146,6 +207,7 @@ else:
>>      import cStringIO
>>
>>      bytechr = chr
>> +    bytestr = str
>
> Could this even be 'identity' (which apparently doesn't exist, so
> would be 'lambda s: s' or something)? Will str(a) create a copy of the
> underlying buffer too. I think I know that strings are immutable, so
> it shouldn't have to?

The answer seems to be "no, nothing is copied". I could have sworn
that when I first tested it, "str(a) is b" was False, but I must have
been blind. Sorry about the noise.

>
>
>>      iterbytestr = iter
>>
>>      def sysstr(s):
>> diff --git a/tests/test-doctest.py b/tests/test-doctest.py
>> --- a/tests/test-doctest.py
>> +++ b/tests/test-doctest.py
>> @@ -34,6 +34,7 @@ testmod('mercurial.minirst')
>>  testmod('mercurial.patch')
>>  testmod('mercurial.pathutil')
>>  testmod('mercurial.parser')
>> +testmod('mercurial.pycompat', py3=True)
>>  testmod('mercurial.revsetlang')
>>  testmod('mercurial.smartset')
>>  testmod('mercurial.store')
>> _______________________________________________
>> Mercurial-devel mailing list
>> Mercurial-devel@mercurial-scm.org
>> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Yuya Nishihara - March 17, 2017, 12:52 p.m.
On Thu, 16 Mar 2017 13:58:20 -0700, Martin von Zweigbergk wrote:
> On Thu, Mar 16, 2017 at 1:47 PM, Martin von Zweigbergk
> >>
> >>      bytechr = chr
> >> +    bytestr = str
> >
> > Could this even be 'identity' (which apparently doesn't exist, so
> > would be 'lambda s: s' or something)? Will str(a) create a copy of the
> > underlying buffer too. I think I know that strings are immutable, so
> > it shouldn't have to?
> 
> The answer seems to be "no, nothing is copied".

Another reason for aliasing to str is that I wanted to make bytestr a real
type, so that it can be subclassed or tested with isinstance().

Patch

diff --git a/mercurial/pycompat.py b/mercurial/pycompat.py
--- a/mercurial/pycompat.py
+++ b/mercurial/pycompat.py
@@ -76,6 +76,67 @@  if ispy3:
 
     bytechr = struct.Struct('>B').pack
 
+    class bytestr(bytes):
+        """A bytes which mostly acts as a Python 2 str
+
+        >>> bytestr(), bytestr(bytearray(b'foo')), bytestr(u'ascii'), bytestr(1)
+        (b'', b'foo', b'ascii', b'1')
+        >>> s = bytestr(b'foo')
+        >>> assert s is bytestr(s)
+
+        There's no implicit conversion from non-ascii str as its encoding is
+        unknown:
+
+        >>> bytestr(chr(0x80)) # doctest: +ELLIPSIS
+        Traceback (most recent call last):
+          ...
+        UnicodeEncodeError: ...
+
+        Comparison between bytestr and bytes should work:
+
+        >>> assert bytestr(b'foo') == b'foo'
+        >>> assert b'foo' == bytestr(b'foo')
+        >>> assert b'f' in bytestr(b'foo')
+        >>> assert bytestr(b'f') in b'foo'
+
+        Sliced elements should be bytes, not integer:
+
+        >>> s[1], s[:2]
+        (b'o', b'fo')
+        >>> list(s), list(reversed(s))
+        ([b'f', b'o', b'o'], [b'o', b'o', b'f'])
+
+        As bytestr type isn't propagated across operations, you need to cast
+        bytes to bytestr explicitly:
+
+        >>> s = bytestr(b'foo').upper()
+        >>> t = bytestr(s)
+        >>> s[0], t[0]
+        (70, b'F')
+
+        Be careful to not pass a bytestr object to a function which expects
+        bytearray-like behavior.
+
+        >>> t = bytes(t)  # cast to bytes
+        >>> assert type(t) is bytes
+        """
+
+        def __new__(cls, s=b''):
+            if isinstance(s, bytestr):
+                return s
+            if not isinstance(s, (bytes, bytearray)):
+                s = str(s).encode(u'ascii')
+            return bytes.__new__(cls, s)
+
+        def __getitem__(self, key):
+            s = bytes.__getitem__(self, key)
+            if not isinstance(s, bytes):
+                s = bytechr(s)
+            return s
+
+        def __iter__(self):
+            return iterbytestr(bytes.__iter__(self))
+
     def iterbytestr(s):
         """Iterate bytes as if it were a str object of Python 2"""
         return map(bytechr, s)
@@ -146,6 +207,7 @@  else:
     import cStringIO
 
     bytechr = chr
+    bytestr = str
     iterbytestr = iter
 
     def sysstr(s):
diff --git a/tests/test-doctest.py b/tests/test-doctest.py
--- a/tests/test-doctest.py
+++ b/tests/test-doctest.py
@@ -34,6 +34,7 @@  testmod('mercurial.minirst')
 testmod('mercurial.patch')
 testmod('mercurial.pathutil')
 testmod('mercurial.parser')
+testmod('mercurial.pycompat', py3=True)
 testmod('mercurial.revsetlang')
 testmod('mercurial.smartset')
 testmod('mercurial.store')