Patchwork [2,of,5] encoding: add fast path of jsonescape() (issue5533)

login
register
mail settings
Submitter Yuya Nishihara
Date Aug. 18, 2017, 2:14 p.m.
Message ID <a72635d28242bec2c516.1503065650@mimosa>
Download mbox | patch
Permalink /patch/23114/
State Accepted
Headers show

Comments

Yuya Nishihara - Aug. 18, 2017, 2:14 p.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1492926472 -32400
#      Sun Apr 23 14:47:52 2017 +0900
# Node ID a72635d28242bec2c5165ef938bc4abc4a23e7b4
# Parent  574c98346cfba33b4de02d089d47f4923f1ff47f
encoding: add fast path of jsonescape() (issue5533)

This isn't highly optimized, as it copies characters one by one, but it seems
reasonably simple and is not slow.

  (with mercurial repo)
  $ export HGRCPATH=/dev/null HGPLAIN=
  $ hg log --time --config experimental.stabilization=all -Tjson > /dev/null

  (original)
  time: real 6.830 secs (user 6.740+0.000 sys 0.080+0.000)
  time: real 6.690 secs (user 6.650+0.000 sys 0.040+0.000)
  time: real 6.700 secs (user 6.640+0.000 sys 0.060+0.000)

  (this patch)
  time: real 5.630 secs (user 5.550+0.000 sys 0.070+0.000)
  time: real 5.700 secs (user 5.650+0.000 sys 0.050+0.000)
  time: real 5.690 secs (user 5.640+0.000 sys 0.050+0.000)

Patch

diff --git a/mercurial/cext/charencode.c b/mercurial/cext/charencode.c
--- a/mercurial/cext/charencode.c
+++ b/mercurial/cext/charencode.c
@@ -9,6 +9,7 @@ 
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+#include <assert.h>
 
 #include "charencode.h"
 #include "util.h"
@@ -63,6 +64,42 @@  static const char uppertable[128] = {
 				'\x7b', '\x7c', '\x7d', '\x7e', '\x7f'
 };
 
+/* non-paranoid escaped length per byte -- 1: no escape, 2: \<c>, 6: \u<x> */
+static const uint8_t jsonlentable[256] = {
+	6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 6, 2, 2, 6, 6, /* \b, \t, \n, \f, \r */
+	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+	1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* " */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, /* \\ */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, /* DEL */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0xff: length 1 */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+};
+
+static const uint8_t jsonparanoidlentable[128] = { /* paranoid: ASCII only */
+	6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 6, 2, 2, 6, 6, /* \b, \t, \n, \f, \r */
+	6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+	1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* " */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 6, 1, /* <, > as 6-byte escapes */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, /* \\ */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, /* DEL */
+};
+
+static const char hexchartable[16] = { /* nibble value -> lowercase hex digit */
+	'0', '1', '2', '3', '4', '5', '6', '7',
+	'8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+};
+
 /*
  * Turn a hex-encoded string into binary.
  */
@@ -217,3 +254,105 @@  quit:
 	Py_XDECREF(file_foldmap);
 	return NULL;
 }
+
+/* sum escaped length of buf[0..len); -1 + ValueError if paranoid and non-ASCII */
+static Py_ssize_t jsonescapelen(const char *buf, Py_ssize_t len, bool paranoid)
+{
+	Py_ssize_t i, esclen = 0; /* NOTE(review): sum is not overflow-checked */
+
+	if (paranoid) {
+		/* don't want to process multi-byte escapes in C */
+		for (i = 0; i < len; i++) {
+			char c = buf[i];
+			if (c & 0x80) { /* high bit set: no entry in 128-byte table */
+				PyErr_SetString(PyExc_ValueError,
+						"cannot process non-ascii str");
+				return -1;
+			}
+			esclen += jsonparanoidlentable[(unsigned char)c];
+		}
+	} else { /* every byte value has an entry in the 256-byte table */
+		for (i = 0; i < len; i++) {
+			char c = buf[i];
+			esclen += jsonlentable[(unsigned char)c];
+		}
+	}
+
+	return esclen; /* equals len when no byte needs escaping */
+}
+
+/* map a byte to the character that follows '\' in its two-byte escape */
+static char jsonescapechar2(char c)
+{
+	switch (c) {
+	case '\b': return 'b';
+	case '\t': return 't';
+	case '\n': return 'n';
+	case '\f': return 'f';
+	case '\r': return 'r';
+	case '"':  return '"';
+	case '\\': return '\\';
+	}
+	return '\0';  /* should not happen: only length-2 table bytes are passed */
+}
+
+/* write the JSON-escaped form of 'origbuf' into 'escbuf' ('esclen' bytes);
+   'origbuf' must only hold bytes mappable by json(paranoid)lentable */
+static void encodejsonescape(char *escbuf, Py_ssize_t esclen,
+			     const char *origbuf, Py_ssize_t origlen,
+			     bool paranoid)
+{
+	const uint8_t *lentable =
+		(paranoid) ? jsonparanoidlentable : jsonlentable;
+	Py_ssize_t i, j; /* i: read index in origbuf, j: write index in escbuf */
+
+	for (i = 0, j = 0; i < origlen; i++) {
+		char c = origbuf[i];
+		uint8_t l = lentable[(unsigned char)c]; /* output bytes for c */
+		assert(j + l <= esclen); /* bounds check; no-op under NDEBUG */
+		switch (l) {
+		case 1: /* copied through unchanged */
+			escbuf[j] = c;
+			break;
+		case 2: /* backslash followed by a single character */
+			escbuf[j] = '\\';
+			escbuf[j + 1] = jsonescapechar2(c);
+			break;
+		case 6: /* backslash, 'u', "00", then two lowercase hex digits */
+			memcpy(escbuf + j, "\\u00", 4);
+			escbuf[j + 4] = hexchartable[(unsigned char)c >> 4];
+			escbuf[j + 5] = hexchartable[(unsigned char)c & 0xf];
+			break;
+		}
+		j += l;
+	}
+}
+
+PyObject *jsonescapeu8fast(PyObject *self, PyObject *args)
+{ /* args: (bytes, int paranoid); returns escaped bytes, or NULL on error */
+	PyObject *origstr, *escstr;
+	const char *origbuf;
+	Py_ssize_t origlen, esclen;
+	int paranoid;
+	if (!PyArg_ParseTuple(args, "O!i:jsonescapeu8fast",
+			      &PyBytes_Type, &origstr, &paranoid))
+		return NULL;
+
+	origbuf = PyBytes_AS_STRING(origstr);
+	origlen = PyBytes_GET_SIZE(origstr);
+	esclen = jsonescapelen(origbuf, origlen, paranoid); /* sizing pass */
+	if (esclen < 0)
+		return NULL;  /* unsupported char found; ValueError already set */
+	if (origlen == esclen) { /* nothing to escape: hand back the input */
+		Py_INCREF(origstr);
+		return origstr;
+	}
+
+	escstr = PyBytes_FromStringAndSize(NULL, esclen); /* filled in below */
+	if (!escstr)
+		return NULL;
+	encodejsonescape(PyBytes_AS_STRING(escstr), esclen, origbuf, origlen,
+			 paranoid);
+
+	return escstr;
+}
diff --git a/mercurial/cext/charencode.h b/mercurial/cext/charencode.h
--- a/mercurial/cext/charencode.h
+++ b/mercurial/cext/charencode.h
@@ -22,6 +22,7 @@  PyObject *unhexlify(const char *str, Py_
 PyObject *asciilower(PyObject *self, PyObject *args);
 PyObject *asciiupper(PyObject *self, PyObject *args);
 PyObject *make_file_foldmap(PyObject *self, PyObject *args);
+PyObject *jsonescapeu8fast(PyObject *self, PyObject *args);
 
 static const int8_t hextable[256] = {
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
diff --git a/mercurial/cext/parsers.c b/mercurial/cext/parsers.c
--- a/mercurial/cext/parsers.c
+++ b/mercurial/cext/parsers.c
@@ -702,6 +702,8 @@  static PyMethodDef methods[] = {
 	 "construct a dict with an expected size\n"},
 	{"make_file_foldmap", make_file_foldmap, METH_VARARGS,
 	 "make file foldmap\n"},
+	{"jsonescapeu8fast", jsonescapeu8fast, METH_VARARGS,
+	 "escape a UTF-8 byte string to JSON (fast path)\n"},
 	{"encodedir", encodedir, METH_VARARGS, "encodedir a path\n"},
 	{"pathencode", pathencode, METH_VARARGS, "fncache-encode a path\n"},
 	{"lowerencode", lowerencode, METH_VARARGS, "lower-encode a path\n"},
@@ -714,7 +716,7 @@  void dirs_module_init(PyObject *mod);
 void manifest_module_init(PyObject *mod);
 void revlog_module_init(PyObject *mod);
 
-static const int version = 1;
+static const int version = 2;
 
 static void module_init(PyObject *mod)
 {
diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -26,7 +26,7 @@  charencode = policy.importmod(r'charenco
 
 asciilower = charencode.asciilower
 asciiupper = charencode.asciiupper
-_jsonescapeu8fast = charencodepure.jsonescapeu8fast  # TODO: no "pure"
+_jsonescapeu8fast = charencode.jsonescapeu8fast
 
 _sysstr = pycompat.sysstr
 
@@ -404,8 +404,8 @@  def jsonescape(s, paranoid=False):
     'this is a test'
     >>> jsonescape('escape characters: \\0 \\x0b \\x7f')
     'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
-    >>> jsonescape('escape characters: \\t \\n \\r \\" \\\\')
-    'escape characters: \\\\t \\\\n \\\\r \\\\" \\\\\\\\'
+    >>> jsonescape('escape characters: \\b \\t \\n \\f \\r \\" \\\\')
+    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
     >>> jsonescape('a weird byte: \\xdd')
     'a weird byte: \\xed\\xb3\\x9d'
     >>> jsonescape('utf-8: caf\\xc3\\xa9')
@@ -416,6 +416,10 @@  def jsonescape(s, paranoid=False):
     If paranoid, non-ascii and common troublesome characters are also escaped.
     This is suitable for web output.
 
+    >>> s = 'escape characters: \\0 \\x0b \\x7f'
+    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
+    >>> s = 'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
+    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
     >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
     'escape boundary: ~ \\\\u007f \\\\u0080'
     >>> jsonescape('a weird byte: \\xdd', paranoid=True)
diff --git a/mercurial/policy.py b/mercurial/policy.py
--- a/mercurial/policy.py
+++ b/mercurial/policy.py
@@ -75,7 +75,7 @@  def _importfrom(pkgname, modname):
     (r'cext', r'diffhelpers'): 1,
     (r'cext', r'mpatch'): 1,
     (r'cext', r'osutil'): 1,
-    (r'cext', r'parsers'): 1,
+    (r'cext', r'parsers'): 2,
 }
 
 # map import request to other package or module