Patchwork [3,of,4,V2] encoding: add option to escape non-ascii characters in JSON

login
register
mail settings
Submitter Yuya Nishihara
Date Feb. 9, 2016, 3:40 p.m.
Message ID <16123be761e3f990f7e1.1455032418@mimosa>
Download mbox | patch
Permalink /patch/13063/
State Accepted
Headers show

Comments

Yuya Nishihara - Feb. 9, 2016, 3:40 p.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1451212114 -32400
#      Sun Dec 27 19:28:34 2015 +0900
# Node ID 16123be761e3f990f7e112be88cc73f9d21893b7
# Parent  13a9f3f8ffe0487e3ad2b2bc31ff8e4cfa755d5c
encoding: add option to escape non-ascii characters in JSON

This is necessary for hgweb to embed JSON data in HTML. It must be possible
to embed JSON data in a non-UTF-8 HTML page, as long as the page encoding is
compatible with ASCII.

According to RFC 7159, a non-BMP character is represented as a UTF-16
surrogate pair. This function first splits an input string into an array of
UTF-16 code units.

https://tools.ietf.org/html/rfc7159.html#section-7

Patch

diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -7,6 +7,7 @@ 
 
 from __future__ import absolute_import
 
+import array
 import locale
 import os
 import unicodedata
@@ -380,8 +381,8 @@  class normcasespecs(object):
 
 _jsonmap = []
 _jsonmap.extend("\\u%04x" % x for x in xrange(32))
-_jsonmap.extend(chr(x) for x in xrange(32, 256))
-_jsonmap[0x7f] = '\\u007f'
+_jsonmap.extend(chr(x) for x in xrange(32, 127))
+_jsonmap.append('\\u007f')
 _jsonmap[0x09] = '\\t'
 _jsonmap[0x0a] = '\\n'
 _jsonmap[0x22] = '\\"'
@@ -389,8 +390,10 @@  class normcasespecs(object):
 _jsonmap[0x08] = '\\b'
 _jsonmap[0x0c] = '\\f'
 _jsonmap[0x0d] = '\\r'
+_paranoidjsonmap = _jsonmap[:]
+_jsonmap.extend(chr(x) for x in xrange(128, 256))
 
-def jsonescape(s):
+def jsonescape(s, paranoid=False):
     '''returns a string suitable for JSON
 
     JSON is problematic for us because it doesn't support non-Unicode
@@ -415,9 +418,34 @@  def jsonescape(s):
     'utf-8: caf\\xc3\\xa9'
     >>> jsonescape('')
     ''
+
+    If paranoid, non-ascii characters are also escaped. This is suitable for
+    web output.
+
+    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
+    'escape boundary: ~ \\\\u007f \\\\u0080'
+    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
+    'a weird byte: \\\\udcdd'
+    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
+    'utf-8: caf\\\\u00e9'
+    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
+    'non-BMP: \\\\ud834\\\\udd1e'
     '''
 
-    return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s)))
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+
+    u8chars = toutf8b(s)
+    try:
+        return ''.join(jm[x] for x in bytearray(u8chars))  # fast path
+    except IndexError:
+        pass
+    # non-BMP char is represented as UTF-16 surrogate pair
+    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+    u16codes.pop(0)  # drop BOM
+    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
 
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]