Patchwork [3,of,3,STABLE] py3: define and use json.loads polyfill

login
register
mail settings
Submitter Gregory Szorc
Date Nov. 2, 2019, 7:19 p.m.
Message ID <b4a9220a4eb5a6189102.1572722398@ubuntu-vm-main>
Download mbox | patch
Permalink /patch/42684/
State Accepted
Headers show

Comments

Gregory Szorc - Nov. 2, 2019, 7:19 p.m.
# HG changeset patch
# User Gregory Szorc <gregory.szorc@gmail.com>
# Date 1572721775 25200
#      Sat Nov 02 12:09:35 2019 -0700
# Branch stable
# Node ID b4a9220a4eb5a61891027d59cd0bded6dc5f7578
# Parent  0d0cd63ca1702b901df2cc021d1f51c77bc0bf61
py3: define and use json.loads polyfill

Python 3.5's json.loads() requires a str. Only Python 3.6+
supports passing a bytes or bytearray.

This commit implements a json.loads() polyfill on Python 3.5
so that we can use bytes. The added function to detect encodings
comes verbatim from Python 3.7.
Yuya Nishihara - Nov. 3, 2019, 2:13 a.m.
On Sat, 02 Nov 2019 12:19:58 -0700, Gregory Szorc wrote:
> # HG changeset patch
> # User Gregory Szorc <gregory.szorc@gmail.com>
> # Date 1572721775 25200
> #      Sat Nov 02 12:09:35 2019 -0700
> # Branch stable
> # Node ID b4a9220a4eb5a61891027d59cd0bded6dc5f7578
> # Parent  0d0cd63ca1702b901df2cc021d1f51c77bc0bf61
> py3: define and use json.loads polyfill

Queued for stable, thanks.

Just wondered if we can drop support for Python 3.5.

> +    # Python 3.5's json.load and json.loads require str. We polyfill its
> +    # code for detecting encoding from bytes.
> +    if sys.version_info[0:2] < (3, 6):
> +
> +        def _detect_encoding(b):
> +            bstartswith = b.startswith
> +            if bstartswith((codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE)):
> +                return 'utf-32'
> +            if bstartswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
> +                return 'utf-16'
> +            if bstartswith(codecs.BOM_UTF8):
> +                return 'utf-8-sig'
> +
> +            if len(b) >= 4:
> +                if not b[0]:
> +                    # 00 00 -- -- - utf-32-be
> +                    # 00 XX -- -- - utf-16-be
> +                    return 'utf-16-be' if b[1] else 'utf-32-be'
> +                if not b[1]:
> +                    # XX 00 00 00 - utf-32-le
> +                    # XX 00 00 XX - utf-16-le
> +                    # XX 00 XX -- - utf-16-le
> +                    return 'utf-16-le' if b[2] or b[3] else 'utf-32-le'
> +            elif len(b) == 2:
> +                if not b[0]:
> +                    # 00 XX - utf-16-be
> +                    return 'utf-16-be'
> +                if not b[1]:
> +                    # XX 00 - utf-16-le
> +                    return 'utf-16-le'
> +            # default
> +            return 'utf-8'

There should be no need to support encodings other than utf-8 (as Python 2.7
wouldn't), but it's copy-pasta so I don't care.
Gregory Szorc - Nov. 3, 2019, 3:45 a.m.
On Sat, Nov 2, 2019 at 7:14 PM Yuya Nishihara <yuya@tcha.org> wrote:

> On Sat, 02 Nov 2019 12:19:58 -0700, Gregory Szorc wrote:
> > # HG changeset patch
> > # User Gregory Szorc <gregory.szorc@gmail.com>
> > # Date 1572721775 25200
> > #      Sat Nov 02 12:09:35 2019 -0700
> > # Branch stable
> > # Node ID b4a9220a4eb5a61891027d59cd0bded6dc5f7578
> > # Parent  0d0cd63ca1702b901df2cc021d1f51c77bc0bf61
> > py3: define and use json.loads polyfill
>
> Queued for stable, thanks.
>
> Just wondered if we can drop support for Python 3.5.
>

I would entertain dropping Python 3.5. Are there any popular distros
shipping it? We're good on RHEL and Debian/Ubuntu, I think...


>
> > +    # Python 3.5's json.load and json.loads require str. We polyfill its
> > +    # code for detecting encoding from bytes.
> > +    if sys.version_info[0:2] < (3, 6):
> > +
> > +        def _detect_encoding(b):
> > +            bstartswith = b.startswith
> > +            if bstartswith((codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE)):
> > +                return 'utf-32'
> > +            if bstartswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
> > +                return 'utf-16'
> > +            if bstartswith(codecs.BOM_UTF8):
> > +                return 'utf-8-sig'
> > +
> > +            if len(b) >= 4:
> > +                if not b[0]:
> > +                    # 00 00 -- -- - utf-32-be
> > +                    # 00 XX -- -- - utf-16-be
> > +                    return 'utf-16-be' if b[1] else 'utf-32-be'
> > +                if not b[1]:
> > +                    # XX 00 00 00 - utf-32-le
> > +                    # XX 00 00 XX - utf-16-le
> > +                    # XX 00 XX -- - utf-16-le
> > +                    return 'utf-16-le' if b[2] or b[3] else 'utf-32-le'
> > +            elif len(b) == 2:
> > +                if not b[0]:
> > +                    # 00 XX - utf-16-be
> > +                    return 'utf-16-be'
> > +                if not b[1]:
> > +                    # XX 00 - utf-16-le
> > +                    return 'utf-16-le'
> > +            # default
> > +            return 'utf-8'
>
> There should be no need to support encodings other than utf-8 (as Python 2.7
> wouldn't), but it's copy-pasta so I don't care.
>

Patch

diff --git a/hgext/bugzilla.py b/hgext/bugzilla.py
--- a/hgext/bugzilla.py
+++ b/hgext/bugzilla.py
@@ -955,7 +955,7 @@  class bzrestapi(bzaccess):
     def _fetch(self, burl):
         try:
             resp = url.open(self.ui, burl)
-            return json.loads(resp.read())
+            return pycompat.json_loads(resp.read())
         except util.urlerr.httperror as inst:
             if inst.code == 401:
                 raise error.Abort(_(b'authorization failed'))
@@ -978,7 +978,7 @@  class bzrestapi(bzaccess):
         req = request_type(burl, data, {b'Content-Type': b'application/json'})
         try:
             resp = url.opener(self.ui).open(req)
-            return json.loads(resp.read())
+            return pycompat.json_loads(resp.read())
         except util.urlerr.httperror as inst:
             if inst.code == 401:
                 raise error.Abort(_(b'authorization failed'))
diff --git a/hgext/fix.py b/hgext/fix.py
--- a/hgext/fix.py
+++ b/hgext/fix.py
@@ -126,7 +126,6 @@  from __future__ import absolute_import
 
 import collections
 import itertools
-import json
 import os
 import re
 import subprocess
@@ -642,7 +641,7 @@  def fixfile(ui, repo, opts, fixers, fixc
             if fixer.shouldoutputmetadata():
                 try:
                     metadatajson, newerdata = stdout.split(b'\0', 1)
-                    metadata[fixername] = json.loads(metadatajson)
+                    metadata[fixername] = pycompat.json_loads(metadatajson)
                 except ValueError:
                     ui.warn(
                         _(b'ignored invalid output from fixer tool: %s\n')
diff --git a/hgext/lfs/blobstore.py b/hgext/lfs/blobstore.py
--- a/hgext/lfs/blobstore.py
+++ b/hgext/lfs/blobstore.py
@@ -363,7 +363,7 @@  class _gitlfsremote(object):
                 _(b'LFS error: %s') % _urlerrorreason(ex), hint=hint
             )
         try:
-            response = json.loads(rawjson)
+            response = pycompat.json_loads(rawjson)
         except ValueError:
             raise LfsRemoteError(
                 _(b'LFS server returns invalid JSON: %s')
diff --git a/hgext/lfs/wireprotolfsserver.py b/hgext/lfs/wireprotolfsserver.py
--- a/hgext/lfs/wireprotolfsserver.py
+++ b/hgext/lfs/wireprotolfsserver.py
@@ -133,7 +133,7 @@  def _processbatchrequest(repo, req, res)
         return True
 
     # XXX: specify an encoding?
-    lfsreq = json.loads(req.bodyfh.read())
+    lfsreq = pycompat.json_loads(req.bodyfh.read())
 
     # If no transfer handlers are explicitly requested, 'basic' is assumed.
     if r'basic' not in lfsreq.get(r'transfers', [r'basic']):
diff --git a/hgext/phabricator.py b/hgext/phabricator.py
--- a/hgext/phabricator.py
+++ b/hgext/phabricator.py
@@ -152,8 +152,8 @@  def vcrcommand(name, flags, spec, helpca
             value = r1params[key][0]
             # we want to compare json payloads without worrying about ordering
             if value.startswith(b'{') and value.endswith(b'}'):
-                r1json = json.loads(value)
-                r2json = json.loads(r2params[key][0])
+                r1json = pycompat.json_loads(value)
+                r2json = pycompat.json_loads(r2params[key][0])
                 if r1json != r2json:
                     return False
             elif r2params[key][0] != value:
@@ -307,7 +307,7 @@  def callconduit(ui, name, params):
         if isinstance(x, pycompat.unicode)
         else x,
         # json.loads only accepts bytes from py3.6+
-        json.loads(encoding.unifromlocal(body)),
+        pycompat.json_loads(encoding.unifromlocal(body)),
     )
     if parsed.get(b'error_code'):
         msg = _(b'Conduit Error (%s): %s') % (
@@ -332,7 +332,7 @@  def debugcallconduit(ui, repo, name):
         lambda x: encoding.unitolocal(x)
         if isinstance(x, pycompat.unicode)
         else x,
-        json.loads(rawparams),
+        pycompat.json_loads(rawparams),
     )
     # json.dumps only accepts unicode strings
     result = pycompat.rapply(
diff --git a/mercurial/pycompat.py b/mercurial/pycompat.py
--- a/mercurial/pycompat.py
+++ b/mercurial/pycompat.py
@@ -12,6 +12,7 @@  from __future__ import absolute_import
 
 import getopt
 import inspect
+import json
 import os
 import shlex
 import sys
@@ -88,6 +89,7 @@  def rapply(f, xs):
 
 if ispy3:
     import builtins
+    import codecs
     import functools
     import io
     import struct
@@ -340,6 +342,48 @@  if ispy3:
     iteritems = lambda x: x.items()
     itervalues = lambda x: x.values()
 
+    # Python 3.5's json.load and json.loads require str. We polyfill its
+    # code for detecting encoding from bytes.
+    if sys.version_info[0:2] < (3, 6):
+
+        def _detect_encoding(b):
+            bstartswith = b.startswith
+            if bstartswith((codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE)):
+                return 'utf-32'
+            if bstartswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
+                return 'utf-16'
+            if bstartswith(codecs.BOM_UTF8):
+                return 'utf-8-sig'
+
+            if len(b) >= 4:
+                if not b[0]:
+                    # 00 00 -- -- - utf-32-be
+                    # 00 XX -- -- - utf-16-be
+                    return 'utf-16-be' if b[1] else 'utf-32-be'
+                if not b[1]:
+                    # XX 00 00 00 - utf-32-le
+                    # XX 00 00 XX - utf-16-le
+                    # XX 00 XX -- - utf-16-le
+                    return 'utf-16-le' if b[2] or b[3] else 'utf-32-le'
+            elif len(b) == 2:
+                if not b[0]:
+                    # 00 XX - utf-16-be
+                    return 'utf-16-be'
+                if not b[1]:
+                    # XX 00 - utf-16-le
+                    return 'utf-16-le'
+            # default
+            return 'utf-8'
+
+        def json_loads(s, *args, **kwargs):
+            if isinstance(s, (bytes, bytearray)):
+                s = s.decode(_detect_encoding(s), 'surrogatepass')
+
+            return json.loads(s, *args, **kwargs)
+
+    else:
+        json_loads = json.loads
+
 else:
     import cStringIO
 
@@ -417,6 +461,7 @@  else:
     getargspec = inspect.getargspec
     iteritems = lambda x: x.iteritems()
     itervalues = lambda x: x.itervalues()
+    json_loads = json.loads
 
 isjython = sysplatform.startswith(b'java')
 
diff --git a/tests/get-with-headers.py b/tests/get-with-headers.py
--- a/tests/get-with-headers.py
+++ b/tests/get-with-headers.py
@@ -98,7 +98,7 @@  def request(host, path, show):
         if formatjson:
             # json.dumps() will print trailing newlines. Eliminate them
             # to make tests easier to write.
-            data = json.loads(data)
+            data = pycompat.json_loads(data)
             lines = json.dumps(data, sort_keys=True, indent=2).splitlines()
             for line in lines:
                 bodyfh.write(pycompat.sysbytes(line.rstrip()))