Patchwork [3,of,5] encoding: add function to test if a str consists of ASCII characters

login
register
mail settings
Submitter Yuya Nishihara
Date Aug. 18, 2017, 2:14 p.m.
Message ID <394f90f7dacb585c23ec.1503065651@mimosa>
Download mbox | patch
Permalink /patch/23115/
State Accepted
Headers show

Comments

Yuya Nishihara - Aug. 18, 2017, 2:14 p.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1492919982 -32400
#      Sun Apr 23 12:59:42 2017 +0900
# Node ID 394f90f7dacb585c23ec5efa8958c37620ce2418
# Parent  a72635d28242bec2c5165ef938bc4abc4a23e7b4
encoding: add function to test if a str consists of ASCII characters

Most strings are ASCII. Let's optimize for it.

Using uint64_t is slightly faster than uint32_t on 64-bit systems, but there
isn't a huge difference.

Patch

diff --git a/contrib/python3-whitelist b/contrib/python3-whitelist
--- a/contrib/python3-whitelist
+++ b/contrib/python3-whitelist
@@ -18,6 +18,7 @@  test-doctest.py
 test-duplicateoptions.py
 test-empty-dir.t
 test-empty.t
+test-encoding-func.py
 test-excessive-merge.t
 test-hghave.t
 test-issue1089.t
diff --git a/mercurial/cext/charencode.c b/mercurial/cext/charencode.c
--- a/mercurial/cext/charencode.c
+++ b/mercurial/cext/charencode.c
@@ -12,6 +12,7 @@ 
 #include <assert.h>
 
 #include "charencode.h"
+#include "compat.h"
 #include "util.h"
 
 #ifdef IS_PY3K
@@ -125,6 +126,29 @@  PyObject *unhexlify(const char *str, Py_
 	return ret;
 }
 
+PyObject *isasciistr(PyObject *self, PyObject *args)
+{
+	const char *buf;
+	Py_ssize_t i, len;
+	if (!PyArg_ParseTuple(args, "s#:isasciistr", &buf, &len))
+		return NULL;
+	i = 0;
+	/* char array in PyStringObject should be at least 4-byte aligned */
+	if (((uintptr_t)buf & 3) == 0) {
+		const uint32_t *p = (const uint32_t *)buf;
+		for (; i < len / 4; i++) {
+			if (p[i] & 0x80808080U)
+				Py_RETURN_FALSE;
+		}
+		i *= 4;
+	}
+	for (; i < len; i++) {
+		if (buf[i] & 0x80)
+			Py_RETURN_FALSE;
+	}
+	Py_RETURN_TRUE;
+}
+
 static inline PyObject *_asciitransform(PyObject *str_obj,
 					const char table[128],
 					PyObject *fallback_fn)
diff --git a/mercurial/cext/charencode.h b/mercurial/cext/charencode.h
--- a/mercurial/cext/charencode.h
+++ b/mercurial/cext/charencode.h
@@ -19,6 +19,7 @@  enum normcase_spec {
 };
 
 PyObject *unhexlify(const char *str, Py_ssize_t len);
+PyObject *isasciistr(PyObject *self, PyObject *args);
 PyObject *asciilower(PyObject *self, PyObject *args);
 PyObject *asciiupper(PyObject *self, PyObject *args);
 PyObject *make_file_foldmap(PyObject *self, PyObject *args);
diff --git a/mercurial/cext/parsers.c b/mercurial/cext/parsers.c
--- a/mercurial/cext/parsers.c
+++ b/mercurial/cext/parsers.c
@@ -696,6 +696,7 @@  static PyMethodDef methods[] = {
 	{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"},
 	{"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"},
 	{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
+	{"isasciistr", isasciistr, METH_VARARGS, "check if an ASCII string\n"},
 	{"asciilower", asciilower, METH_VARARGS, "lowercase an ASCII string\n"},
 	{"asciiupper", asciiupper, METH_VARARGS, "uppercase an ASCII string\n"},
 	{"dict_new_presized", dict_new_presized, METH_VARARGS,
@@ -716,7 +717,7 @@  void dirs_module_init(PyObject *mod);
 void manifest_module_init(PyObject *mod);
 void revlog_module_init(PyObject *mod);
 
-static const int version = 2;
+static const int version = 3;
 
 static void module_init(PyObject *mod)
 {
diff --git a/mercurial/compat.h b/mercurial/compat.h
--- a/mercurial/compat.h
+++ b/mercurial/compat.h
@@ -7,8 +7,10 @@ 
 #define inline __inline
 #if defined(_WIN64)
 typedef __int64 ssize_t;
+typedef unsigned __int64 uintptr_t;
 #else
 typedef int ssize_t;
+typedef unsigned int uintptr_t;
 #endif
 typedef signed char int8_t;
 typedef short int16_t;
diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -24,6 +24,7 @@  from .pure import (
 
 charencode = policy.importmod(r'charencode')
 
+isasciistr = charencode.isasciistr
 asciilower = charencode.asciilower
 asciiupper = charencode.asciiupper
 _jsonescapeu8fast = charencode.jsonescapeu8fast
diff --git a/mercurial/policy.py b/mercurial/policy.py
--- a/mercurial/policy.py
+++ b/mercurial/policy.py
@@ -75,7 +75,7 @@  def _importfrom(pkgname, modname):
     (r'cext', r'diffhelpers'): 1,
     (r'cext', r'mpatch'): 1,
     (r'cext', r'osutil'): 1,
-    (r'cext', r'parsers'): 2,
+    (r'cext', r'parsers'): 3,
 }
 
 # map import request to other package or module
diff --git a/mercurial/pure/charencode.py b/mercurial/pure/charencode.py
--- a/mercurial/pure/charencode.py
+++ b/mercurial/pure/charencode.py
@@ -13,6 +13,13 @@  from .. import (
     pycompat,
 )
 
+def isasciistr(s):
+    try:
+        s.decode('ascii')
+        return True
+    except UnicodeDecodeError:
+        return False
+
 def asciilower(s):
     '''convert a string to lowercase if ASCII
 
diff --git a/tests/test-encoding-func.py b/tests/test-encoding-func.py
new file mode 100644
--- /dev/null
+++ b/tests/test-encoding-func.py
@@ -0,0 +1,33 @@ 
+from __future__ import absolute_import
+
+import unittest
+
+from mercurial import (
+    encoding,
+)
+
+class IsasciistrTest(unittest.TestCase):
+    asciistrs = [
+        b'a',
+        b'ab',
+        b'abc',
+        b'abcd',
+        b'abcde',
+        b'abcdefghi',
+        b'abcd\0fghi',
+    ]
+
+    def testascii(self):
+        for s in self.asciistrs:
+            self.assertTrue(encoding.isasciistr(s))
+
+    def testnonasciichar(self):
+        for s in self.asciistrs:
+            for i in range(len(s)):
+                t = bytearray(s)
+                t[i] |= 0x80
+                self.assertFalse(encoding.isasciistr(bytes(t)))
+
+if __name__ == '__main__':
+    import silenttestrunner
+    silenttestrunner.main(__name__)