Patchwork [2,of,3] encoding: add option to escape non-ascii characters in JSON

login
register
mail settings
Submitter Yuya Nishihara
Date Jan. 16, 2016, 12:33 p.m.
Message ID <b3b1bef76d54a4755a1b.1452947634@mimosa>
Download mbox | patch
Permalink /patch/12792/
State Superseded
Commit 9ece901f7a19f4913c473a4da029ea17a73dbe98
Headers show

Comments

Yuya Nishihara - Jan. 16, 2016, 12:33 p.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1451212114 -32400
#      Sun Dec 27 19:28:34 2015 +0900
# Node ID b3b1bef76d54a4755a1b221a36b00253eefefd9a
# Parent  a3c68ee9fac119b70c0b515081f161fb708deca7
encoding: add option to escape non-ascii characters in JSON

This is necessary for hgweb to embed JSON data in HTML. It must be possible
to embed JSON data in a non-UTF-8 HTML page as long as the page encoding is
compatible with ASCII.

According to RFC 7159, a non-BMP character is represented as a UTF-16
surrogate pair. This function first splits an input string into an array of
UTF-16 code units.

https://tools.ietf.org/html/rfc7159.html#section-7

Patch

diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -7,6 +7,7 @@ 
 
 from __future__ import absolute_import
 
+import array
 import locale
 import os
 import unicodedata
@@ -379,8 +380,9 @@  class normcasespecs(object):
     other = 0
 
 _jsonmap = {}
+_paranoidjsonmap = {}
 
-def jsonescape(s):
+def jsonescape(s, paranoid=False):
     '''returns a string suitable for JSON
 
     JSON is problematic for us because it doesn't support non-Unicode
@@ -405,12 +407,24 @@  def jsonescape(s):
     'utf-8: caf\\xc3\\xa9'
     >>> jsonescape('')
     ''
+
+    If paranoid, non-ascii characters are also escaped. This is suitable for
+    web output.
+
+    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
+    'escape boundary: ~ \\\\u007f \\\\u0080'
+    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
+    'a weird byte: \\\\udcdd'
+    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
+    'utf-8: caf\\\\u00e9'
+    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
+    'non-BMP: \\\\ud834\\\\udd1e'
     '''
 
     if not _jsonmap:
         for x in xrange(32):
             _jsonmap[chr(x)] = "\\u%04x" % x
-        for x in xrange(32, 256):
+        for x in xrange(32, 127):
             c = chr(x)
             _jsonmap[c] = c
         _jsonmap['\x7f'] = '\\u007f'
@@ -421,8 +435,27 @@  def jsonescape(s):
         _jsonmap['\b'] = '\\b'
         _jsonmap['\f'] = '\\f'
         _jsonmap['\r'] = '\\r'
+        _paranoidjsonmap.update(_jsonmap)
+        for x in xrange(128, 256):
+            c = chr(x)
+            _jsonmap[c] = c
 
-    return ''.join(_jsonmap[c] for c in toutf8b(s))
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+
+    u8chars = toutf8b(s)
+    try:
+        return ''.join(jm[c] for c in u8chars)  # fast path
+    except KeyError:
+        pass
+    # non-BMP char is represented as UTF-16 surrogate pair
+    u16codes = array.array('H')
+    u16codes.fromstring(u8chars.decode('utf-8').encode('utf-16'))
+    u16codes.pop(0)  # drop BOM
+    return ''.join(jm[chr(x)] if x < 128 else '\\u%04x' % x
+                   for x in u16codes)
 
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]