Patchwork [6,of,6] dirs: use mutable strings internally

login
register
mail settings
Submitter Bryan O'Sullivan
Date April 1, 2013, 8:49 p.m.
Message ID <65a67c7aefed9d86aa24.1364849359@australite.local>
Download mbox | patch
Permalink /patch/1236/
State Accepted, archived
Headers show

Comments

Bryan O'Sullivan - April 1, 2013, 8:49 p.m.
# HG changeset patch
# User Bryan O'Sullivan <bryano@fb.com>
# Date 1364849337 25200
#      Mon Apr 01 13:48:57 2013 -0700
# Node ID 65a67c7aefed9d86aa24758aaa7529445856db9f
# Parent  34666bf45a79c688f24fe15ce6a73894a6adfee6
dirs: use mutable strings internally

perfdirs results for a working dir with 170,000 files:
  Python     638 msec
  C          244
  C+int      192
  C+int+str  168

In the large repo above, the nearly 0.5 second time improvement is
visible in commands like "hg add" and "hg update".

hg add
  Python    1100 msec
  C+int+str  600

hg update (with nothing to do)
  Python    2800 msec
  C+int+str 2240
Bryan O'Sullivan - April 1, 2013, 9:15 p.m.
On Mon, Apr 1, 2013 at 1:49 PM, Bryan O'Sullivan <bos@serpentine.com> wrote:

> hg update (with nothing to do)
>   Python    2800 msec
>   C+int+str 2240
>

This measurement is incorrect - there are cases where update will speed up
by this amount, but I simply measured the wrong baseline here.
Augie Fackler - April 10, 2013, 8:33 p.m.
On Mon, Apr 01, 2013 at 01:49:19PM -0700, Bryan O'Sullivan wrote:
> # HG changeset patch
> # User Bryan O'Sullivan <bryano@fb.com>
> # Date 1364849337 25200
> #      Mon Apr 01 13:48:57 2013 -0700
> # Node ID 65a67c7aefed9d86aa24758aaa7529445856db9f
> # Parent  34666bf45a79c688f24fe15ce6a73894a6adfee6
> dirs: use mutable strings internally

Other than the doc/naming nit Kevin and I had on patch 2 (I think),
and the correction in the log message here, I'm happy with this
series. Resend and/or push it?

>
> perfdirs results for a working dir with 170,000 files:
>   Python     638 msec
>   C          244
>   C+int      192
>   C+int+str  168
>
> In the large repo above, the nearly 0.5 second time improvement is
> visible in commands like "hg add" and "hg update".
>
> hg add
>   Python    1100 msec
>   C+int+str  600
>
> hg update (with nothing to do)
>   Python    2800 msec
>   C+int+str 2240
>
> diff --git a/mercurial/dirs.c b/mercurial/dirs.c
> --- a/mercurial/dirs.c
> +++ b/mercurial/dirs.c
> @@ -16,6 +16,9 @@
>   *
>   * We modify Python integers for refcounting, but those integers are
>   * never visible to Python code.
> + *
> + * We mutate strings in-place, but leave them immutable once they can
> + * be seen by Python code.
>   */
>
>  typedef struct {
> @@ -38,6 +41,7 @@ static inline Py_ssize_t _finddir(PyObje
>
>  static int _addpath(PyObject *dirs, PyObject *path)
>  {
> +	const char *cpath = PyString_AS_STRING(path);
>       Py_ssize_t pos = PyString_GET_SIZE(path);
>       PyObject *key = NULL;
>       int ret = -1;
> @@ -45,15 +49,24 @@ static int _addpath(PyObject *dirs, PyOb
>       while ((pos = _finddir(path, pos - 1)) != -1) {
>               PyObject *val;
>
> -		key = PyString_FromStringAndSize(PyString_AS_STRING(path), pos);
> -
> -		if (key == NULL)
> -			goto bail;
> +		/* It's likely that every prefix already has an entry
> +                in our dict. Try to avoid allocating and
> +                deallocating a string for each prefix we check. */
> +		if (key != NULL)
> +			((PyStringObject *)key)->ob_shash = -1;
> +		else {
> +			/* Force Python to not reuse a small shared string. */
> +			key = PyString_FromStringAndSize(cpath,
> +                                                      pos < 2 ? 2 : pos);
> +			if (key == NULL)
> +				goto bail;
> +		}
> +		PyString_GET_SIZE(key) = pos;
> +		PyString_AS_STRING(key)[pos] = '\0';
>
>               val = PyDict_GetItem(dirs, key);
>               if (val != NULL) {
>                       PyInt_AS_LONG(val) += 1;
> -			Py_CLEAR(key);
>                       continue;
>               }
>
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel

Patch

diff --git a/mercurial/dirs.c b/mercurial/dirs.c
--- a/mercurial/dirs.c
+++ b/mercurial/dirs.c
@@ -16,6 +16,9 @@ 
  *
  * We modify Python integers for refcounting, but those integers are
  * never visible to Python code.
+ *
+ * We mutate strings in-place, but leave them immutable once they can
+ * be seen by Python code.
  */
 
 typedef struct {
@@ -38,6 +41,7 @@  static inline Py_ssize_t _finddir(PyObje
 
 static int _addpath(PyObject *dirs, PyObject *path)
 {
+	const char *cpath = PyString_AS_STRING(path);
 	Py_ssize_t pos = PyString_GET_SIZE(path);
 	PyObject *key = NULL;
 	int ret = -1;
@@ -45,15 +49,24 @@  static int _addpath(PyObject *dirs, PyOb
 	while ((pos = _finddir(path, pos - 1)) != -1) {
 		PyObject *val;
 
-		key = PyString_FromStringAndSize(PyString_AS_STRING(path), pos);
-
-		if (key == NULL)
-			goto bail;
+		/* It's likely that every prefix already has an entry
+		   in our dict. Try to avoid allocating and
+		   deallocating a string for each prefix we check. */
+		if (key != NULL)
+			((PyStringObject *)key)->ob_shash = -1;
+		else {
+			/* Force Python to not reuse a small shared string. */
+			key = PyString_FromStringAndSize(cpath,
+							 pos < 2 ? 2 : pos);
+			if (key == NULL)
+				goto bail;
+		}
+		PyString_GET_SIZE(key) = pos;
+		PyString_AS_STRING(key)[pos] = '\0';
 
 		val = PyDict_GetItem(dirs, key);
 		if (val != NULL) {
 			PyInt_AS_LONG(val) += 1;
-			Py_CLEAR(key);
 			continue;
 		}