Patchwork [06,of,10] py3: use unicode literals and unichr in revset.py

login
register
mail settings
Submitter Pulkit Goyal
Date Aug. 2, 2016, 8:27 p.m.
Message ID <da4a0ba184d3eff2819d.1470169648@pulkit-goyal>
Download mbox | patch
Permalink /patch/16045/
State Changes Requested
Headers show

Comments

Pulkit Goyal - Aug. 2, 2016, 8:27 p.m.
# HG changeset patch
# User Pulkit Goyal <7895pulkit@gmail.com>
# Date 1470168209 -19800
#      Wed Aug 03 01:33:29 2016 +0530
# Node ID da4a0ba184d3eff2819d73884770d342edce88c1
# Parent  4547ab529d26196dc40909693b5e9673763e9058
py3: use unicode literals and unichr in revset.py

The assignment of _syminitletters, _symletters, and _aliassyminitletters
didn't work under Python 3 because of mixed types. We rewrite the code
to work under both Python 2 and Python 3 by using unichr and
unicode literals.

We preserve the final type of elements in the sets as bytes.
Yuya Nishihara - Aug. 3, 2016, 2:01 p.m.
On Wed, 03 Aug 2016 01:57:28 +0530, Pulkit Goyal wrote:
> # HG changeset patch
> # User Pulkit Goyal <7895pulkit@gmail.com>
> # Date 1470168209 -19800
> #      Wed Aug 03 01:33:29 2016 +0530
> # Node ID da4a0ba184d3eff2819d73884770d342edce88c1
> # Parent  4547ab529d26196dc40909693b5e9673763e9058
> py3: use unicode literals and unichr in revset.py
> 
> The assignment of _syminitletters, _symletters, and _aliassyminitletters
> didn't work under Python 3 because of mixed types. We rewrite the code
> to work under both Python 2 and Python 3 by using unichr and
> unicode literals.
> 
> We preserve the final type of elements in the sets as bytes.
> 
> diff -r 4547ab529d26 -r da4a0ba184d3 mercurial/revset.py
> --- a/mercurial/revset.py	Wed Aug 03 01:20:15 2016 +0530
> +++ b/mercurial/revset.py	Wed Aug 03 01:33:29 2016 +0530
> @@ -9,6 +9,7 @@
>  
>  import heapq
>  import re
> +import sys
>  
>  from .i18n import _
>  from . import (
> @@ -27,6 +28,9 @@
>      util,
>  )
>  
> +if sys.version_info[0]>=3:
> +    unichr = chr
> +
>  def _revancestors(repo, revs, followfirst):
>      """Like revlog.ancestors(), but supports followfirst."""
>      if followfirst:
> @@ -175,12 +179,12 @@
>  keywords = set(['and', 'or', 'not'])
>  
>  # default set of valid characters for the initial letter of symbols
> -_syminitletters = set(c for c in [chr(i) for i in xrange(256)]
> -                      if c.isalnum() or c in '._@' or ord(c) > 127)
> +_syminitletters = set(c.encode("latin1") for c in [unichr(i) for i in xrange(256)]
> +                      if c.isalnum() or c in u'._@' or ord(c) > 127)
>  
>  # default set of valid characters for non-initial letters of symbols
> -_symletters = set(c for c in  [chr(i) for i in xrange(256)]
> -                  if c.isalnum() or c in '-._/@' or ord(c) > 127)
> +_symletters = set(c.encode("latin-1") for c in  [unichr(i) for i in xrange(256)]
> +                  if c.isalnum() or c in u'-._/@' or ord(c) > 127)
>  
>  def tokenize(program, lookup=None, syminitletters=None, symletters=None):
>      '''
> @@ -2450,8 +2454,8 @@
>  
>  # the set of valid characters for the initial letter of symbols in
>  # alias declarations and definitions
> -_aliassyminitletters = set(c for c in [chr(i) for i in xrange(256)]
> -                           if c.isalnum() or c in '._@$' or ord(c) > 127)
> +_aliassyminitletters = set(c.encode("latin-1") for c in [unichr(i) for i in xrange(256)]
> +                           if c.isalnum() or c in u'._@$' or ord(c) > 127)

This seems to be going in the opposite direction. Instead, could we have,
for example, a byteschr() helper on Py3?

Patch

diff -r 4547ab529d26 -r da4a0ba184d3 mercurial/revset.py
--- a/mercurial/revset.py	Wed Aug 03 01:20:15 2016 +0530
+++ b/mercurial/revset.py	Wed Aug 03 01:33:29 2016 +0530
@@ -9,6 +9,7 @@ 
 
 import heapq
 import re
+import sys
 
 from .i18n import _
 from . import (
@@ -27,6 +28,9 @@ 
     util,
 )
 
+if sys.version_info[0]>=3:
+    unichr = chr
+
 def _revancestors(repo, revs, followfirst):
     """Like revlog.ancestors(), but supports followfirst."""
     if followfirst:
@@ -175,12 +179,12 @@ 
 keywords = set(['and', 'or', 'not'])
 
 # default set of valid characters for the initial letter of symbols
-_syminitletters = set(c for c in [chr(i) for i in xrange(256)]
-                      if c.isalnum() or c in '._@' or ord(c) > 127)
+_syminitletters = set(c.encode("latin1") for c in [unichr(i) for i in xrange(256)]
+                      if c.isalnum() or c in u'._@' or ord(c) > 127)
 
 # default set of valid characters for non-initial letters of symbols
-_symletters = set(c for c in  [chr(i) for i in xrange(256)]
-                  if c.isalnum() or c in '-._/@' or ord(c) > 127)
+_symletters = set(c.encode("latin-1") for c in  [unichr(i) for i in xrange(256)]
+                  if c.isalnum() or c in u'-._/@' or ord(c) > 127)
 
 def tokenize(program, lookup=None, syminitletters=None, symletters=None):
     '''
@@ -2450,8 +2454,8 @@ 
 
 # the set of valid characters for the initial letter of symbols in
 # alias declarations and definitions
-_aliassyminitletters = set(c for c in [chr(i) for i in xrange(256)]
-                           if c.isalnum() or c in '._@$' or ord(c) > 127)
+_aliassyminitletters = set(c.encode("latin-1") for c in [unichr(i) for i in xrange(256)]
+                           if c.isalnum() or c in u'._@$' or ord(c) > 127)
 
 def _parsewith(spec, lookup=None, syminitletters=None):
     """Generate a parse tree of given spec with given tokenizing options