Patchwork [1,of,5] encoding: extract stub for fast JSON escape

login
register
mail settings
Submitter Yuya Nishihara
Date Aug. 18, 2017, 2:14 p.m.
Message ID <574c98346cfba33b4de0.1503065649@mimosa>
Download mbox | patch
Permalink /patch/23113/
State Accepted
Headers show

Comments

Yuya Nishihara - Aug. 18, 2017, 2:14 p.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1492931451 -32400
#      Sun Apr 23 16:10:51 2017 +0900
# Node ID 574c98346cfba33b4de02d089d47f4923f1ff47f
# Parent  6f6c87888b228948e202bd5967dc306bed56af7d
encoding: extract stub for fast JSON escape

This moves the JSON character maps to pure/charencode.py because they will be
used only when the fast path fails.

Patch

diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -7,7 +7,6 @@ 
 
 from __future__ import absolute_import
 
-import array
 import io
 import locale
 import os
@@ -19,10 +18,15 @@  from . import (
     pycompat,
 )
 
+from .pure import (
+    charencode as charencodepure,
+)
+
 charencode = policy.importmod(r'charencode')
 
 asciilower = charencode.asciilower
 asciiupper = charencode.asciiupper
+_jsonescapeu8fast = charencodepure.jsonescapeu8fast  # TODO: no "pure"
 
 _sysstr = pycompat.sysstr
 
@@ -383,22 +387,6 @@  class normcasespecs(object):
     upper = 1
     other = 0
 
-_jsonmap = []
-_jsonmap.extend("\\u%04x" % x for x in range(32))
-_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
-_jsonmap.append('\\u007f')
-_jsonmap[0x09] = '\\t'
-_jsonmap[0x0a] = '\\n'
-_jsonmap[0x22] = '\\"'
-_jsonmap[0x5c] = '\\\\'
-_jsonmap[0x08] = '\\b'
-_jsonmap[0x0c] = '\\f'
-_jsonmap[0x0d] = '\\r'
-_paranoidjsonmap = _jsonmap[:]
-_paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
-_paranoidjsonmap[0x3e] = '\\u003e'  # '>'
-_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
-
 def jsonescape(s, paranoid=False):
     '''returns a string suitable for JSON
 
@@ -440,20 +428,12 @@  def jsonescape(s, paranoid=False):
     '\\\\u003cfoo@example.org\\\\u003e'
     '''
 
-    if paranoid:
-        jm = _paranoidjsonmap
-    else:
-        jm = _jsonmap
-
     u8chars = toutf8b(s)
     try:
-        return ''.join(jm[x] for x in bytearray(u8chars))  # fast path
-    except IndexError:
+        return _jsonescapeu8fast(u8chars, paranoid)
+    except ValueError:
         pass
-    # non-BMP char is represented as UTF-16 surrogate pair
-    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
-    u16codes.pop(0)  # drop BOM
-    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
+    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
 
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
 
diff --git a/mercurial/pure/charencode.py b/mercurial/pure/charencode.py
--- a/mercurial/pure/charencode.py
+++ b/mercurial/pure/charencode.py
@@ -7,6 +7,12 @@ 
 
 from __future__ import absolute_import
 
+import array
+
+from .. import (
+    pycompat,
+)
+
 def asciilower(s):
     '''convert a string to lowercase if ASCII
 
@@ -20,3 +26,47 @@  def asciiupper(s):
     Raises UnicodeDecodeError if non-ASCII characters are found.'''
     s.decode('ascii')
     return s.upper()
+
+_jsonmap = []
+_jsonmap.extend("\\u%04x" % x for x in range(32))
+_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
+_jsonmap.append('\\u007f')
+_jsonmap[0x09] = '\\t'
+_jsonmap[0x0a] = '\\n'
+_jsonmap[0x22] = '\\"'
+_jsonmap[0x5c] = '\\\\'
+_jsonmap[0x08] = '\\b'
+_jsonmap[0x0c] = '\\f'
+_jsonmap[0x0d] = '\\r'
+_paranoidjsonmap = _jsonmap[:]
+_paranoidjsonmap[0x3c] = '\\u003c'  # '<' (e.g. escape "</script>")
+_paranoidjsonmap[0x3e] = '\\u003e'  # '>'
+_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
+
+def jsonescapeu8fast(u8chars, paranoid):
+    """Convert a UTF-8 byte string to JSON-escaped form (fast path)
+
+    Raises ValueError if non-ASCII characters have to be escaped.
+    """
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+    try:
+        return ''.join(jm[x] for x in bytearray(u8chars))
+    except IndexError:
+        raise ValueError
+
+def jsonescapeu8fallback(u8chars, paranoid):
+    """Convert a UTF-8 byte string to JSON-escaped form (slow path)
+
+    Escapes all non-ASCII characters no matter if paranoid is False.
+    """
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+    # non-BMP char is represented as UTF-16 surrogate pair
+    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+    u16codes.pop(0)  # drop BOM
+    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)