Patchwork [1,of,3,V3] py3: make a bytes version of getopt.getopt()

login
register
mail settings
Submitter Pulkit Goyal
Date Dec. 8, 2016, 7:12 p.m.
Message ID <e6e1c531a879c091caea.1481224357@pulkit-goyal>
Download mbox | patch
Permalink /patch/17878/
State Superseded
Headers show

Comments

Pulkit Goyal - Dec. 8, 2016, 7:12 p.m.
# HG changeset patch
# User Pulkit Goyal <7895pulkit@gmail.com>
# Date 1480986396 -19800
#      Tue Dec 06 06:36:36 2016 +0530
# Node ID e6e1c531a879c091caeaf7597744e98bcfbb41c9
# Parent  a2b053b8d31aa01b1dcae2d3001b060ff59e8a68
py3: make a bytes version of getopt.getopt()

getopt.getopt() deals with unicodes on Python 3 internally and if bytes
arguments are passed, then it will return TypeError. So we have now
pycompat.getoptb() which takes bytes arguments, convert them to unicode, call
getopt.getopt() and then convert the returned value back to bytes and then
return those value.
All the instances of getopt.getopt() are replaced with pycompat.getoptb().
Yuya Nishihara - Dec. 9, 2016, 10:46 a.m.
On Fri, 09 Dec 2016 00:42:37 +0530, Pulkit Goyal wrote:
> # HG changeset patch
> # User Pulkit Goyal <7895pulkit@gmail.com>
> # Date 1480986396 -19800
> #      Tue Dec 06 06:36:36 2016 +0530
> # Node ID e6e1c531a879c091caeaf7597744e98bcfbb41c9
> # Parent  a2b053b8d31aa01b1dcae2d3001b060ff59e8a68
> py3: make a bytes version of getopt.getopt()

> +    # getopt.getopt() on Python 3 deals with unicodes internally so we cannot
> +    # pass bytes there. Passing unicodes will result in unicodes as return
> +    # values which we need to convert again to bytes.
> +    def getoptb(args, shortlist, namelist):
> +        # There are chances when args, shorlist or namelist variables can be
> +        # unicodes, because maybe they are result of sys.argv like in statprof
> +        # or some other reasons. So it's better to check instance rather than
> +        # getting an AttributeError.
> +        args = [a.decode('latin-1') if isinstance(a, bytes) else a
> +                                    for a in args]
> +        if isinstance(shortlist, bytes):
> +            shortlist = shortlist.decode('latin-1')
> +        namelist = [a.decode('latin-1') if isinstance(a, bytes) else a
> +                                    for a in namelist]

IMO, passing unicode variables is invalid use of getoptb(). If they were
unicode type, it would be wrong to convert them back by .encode('latin-1').
We have no idea what's the expected encoding.

> +        opts, args = getopt.getopt(args, shortlist, namelist)
> +        # Returned value is always str, so no need to check instance.
> +        opts = [(a[0].enocde('latin-1'), a[1].enocde('latin-1'))
> +                            for a in opts]
> +        args = [a.encode('latin-1') for a in args]
> +        return opts, args
Pulkit Goyal - Dec. 9, 2016, 11:01 a.m.
On Fri, Dec 9, 2016 at 4:16 PM, Yuya Nishihara <yuya@tcha.org> wrote:
> On Fri, 09 Dec 2016 00:42:37 +0530, Pulkit Goyal wrote:
>> # HG changeset patch
>> # User Pulkit Goyal <7895pulkit@gmail.com>
>> # Date 1480986396 -19800
>> #      Tue Dec 06 06:36:36 2016 +0530
>> # Node ID e6e1c531a879c091caeaf7597744e98bcfbb41c9
>> # Parent  a2b053b8d31aa01b1dcae2d3001b060ff59e8a68
>> py3: make a bytes version of getopt.getopt()
>
>> +    # getopt.getopt() on Python 3 deals with unicodes internally so we cannot
>> +    # pass bytes there. Passing unicodes will result in unicodes as return
>> +    # values which we need to convert again to bytes.
>> +    def getoptb(args, shortlist, namelist):
>> +        # There are chances when args, shorlist or namelist variables can be
>> +        # unicodes, because maybe they are result of sys.argv like in statprof
>> +        # or some other reasons. So it's better to check instance rather than
>> +        # getting an AttributeError.
>> +        args = [a.decode('latin-1') if isinstance(a, bytes) else a
>> +                                    for a in args]
>> +        if isinstance(shortlist, bytes):
>> +            shortlist = shortlist.decode('latin-1')
>> +        namelist = [a.decode('latin-1') if isinstance(a, bytes) else a
>> +                                    for a in namelist]
>
> IMO, passing unicode variables is invalid use of getoptb(). If they were
> unicode type, it would be wrong to convert them back by .encode('latin-1').
> We have no idea what's the expected encoding.

Okay, that means getoptb() is espected to get bytes only and since we
will be passing bytes, we can safely use .encode('latin-1') because we
decoded them the same way before passing into getopt.getopt(). I will
resend a V4.

>> +        opts, args = getopt.getopt(args, shortlist, namelist)
>> +        # Returned value is always str, so no need to check instance.
>> +        opts = [(a[0].enocde('latin-1'), a[1].enocde('latin-1'))
>> +                            for a in opts]
>> +        args = [a.encode('latin-1') for a in args]
>> +        return opts, args

Patch

diff -r a2b053b8d31a -r e6e1c531a879 mercurial/fancyopts.py
--- a/mercurial/fancyopts.py	Tue Dec 06 11:44:49 2016 +0000
+++ b/mercurial/fancyopts.py	Tue Dec 06 06:36:36 2016 +0530
@@ -7,10 +7,11 @@ 
 
 from __future__ import absolute_import
 
-import getopt
-
 from .i18n import _
-from . import error
+from . import (
+    error,
+    pycompat,
+)
 
 # Set of flags to not apply boolean negation logic on
 nevernegate = set([
@@ -34,13 +35,14 @@ 
         stopindex = args.index('--')
         extraargs = args[stopindex + 1:]
         args = args[:stopindex]
-    opts, parseargs = getopt.getopt(args, options, longoptions)
+    opts, parseargs = pycompat.getoptb(args, options, longoptions)
     args = []
     while parseargs:
         arg = parseargs.pop(0)
         if arg and arg[0] == '-' and len(arg) > 1:
             parseargs.insert(0, arg)
-            topts, newparseargs = getopt.getopt(parseargs, options, longoptions)
+            topts, newparseargs = pycompat.getoptb(parseargs,\
+                                            options, longoptions)
             opts = opts + topts
             parseargs = newparseargs
         else:
@@ -125,7 +127,7 @@ 
     if gnu:
         parse = gnugetopt
     else:
-        parse = getopt.getopt
+        parse = pycompat.getoptb
     opts, args = parse(args, shortlist, namelist)
 
     # transfer result to state
diff -r a2b053b8d31a -r e6e1c531a879 mercurial/pycompat.py
--- a/mercurial/pycompat.py	Tue Dec 06 11:44:49 2016 +0000
+++ b/mercurial/pycompat.py	Tue Dec 06 06:36:36 2016 +0530
@@ -10,6 +10,7 @@ 
 
 from __future__ import absolute_import
 
+import getopt
 import os
 import sys
 
@@ -87,6 +88,27 @@ 
     setattr = _wrapattrfunc(builtins.setattr)
     xrange = builtins.range
 
+    # getopt.getopt() on Python 3 deals with unicodes internally so we cannot
+    # pass bytes there. Passing unicodes will result in unicodes as return
+    # values which we need to convert again to bytes.
+    def getoptb(args, shortlist, namelist):
+        # There are chances when args, shorlist or namelist variables can be
+        # unicodes, because maybe they are result of sys.argv like in statprof
+        # or some other reasons. So it's better to check instance rather than
+        # getting an AttributeError.
+        args = [a.decode('latin-1') if isinstance(a, bytes) else a
+                                    for a in args]
+        if isinstance(shortlist, bytes):
+            shortlist = shortlist.decode('latin-1')
+        namelist = [a.decode('latin-1') if isinstance(a, bytes) else a
+                                    for a in namelist]
+        opts, args = getopt.getopt(args, shortlist, namelist)
+        # Returned value is always str, so no need to check instance.
+        opts = [(a[0].enocde('latin-1'), a[1].enocde('latin-1'))
+                            for a in opts]
+        args = [a.encode('latin-1') for a in args]
+        return opts, args
+
 else:
     def sysstr(s):
         return s
@@ -106,6 +128,9 @@ 
     def fsdecode(filename):
         return filename
 
+    def getoptb(args, shortlist, namelist):
+        return getopt.getopt(args, shortlist, namelist)
+
     osname = os.name
     ospathsep = os.pathsep
     ossep = os.sep
diff -r a2b053b8d31a -r e6e1c531a879 mercurial/statprof.py
--- a/mercurial/statprof.py	Tue Dec 06 11:44:49 2016 +0000
+++ b/mercurial/statprof.py	Tue Dec 06 06:36:36 2016 +0530
@@ -116,6 +116,10 @@ 
 import threading
 import time
 
+from . import (
+    pycompat,
+)
+
 defaultdict = collections.defaultdict
 contextmanager = contextlib.contextmanager
 
@@ -771,7 +775,7 @@ 
 
     # process options
     try:
-        opts, args = getopt.getopt(sys.argv[optstart:], "hl:f:o:p:",
+        opts, args = pycompat.getoptb(sys.argv[optstart:], "hl:f:o:p:",
                                    ["help", "limit=", "file=", "output-file=", "script-path="])
     except getopt.error as msg:
         print(msg)