Patchwork [3,of,7] parsers: avoid PyList_Append when parsing obs markers

login
register
mail settings
Submitter Gregory Szorc
Date March 14, 2017, 5:15 a.m.
Message ID <6d3b13f243bea2c9200e.1489468545@ubuntu-vm-main>
Download mbox | patch
Permalink /patch/19319/
State Accepted
Headers show

Comments

Gregory Szorc - March 14, 2017, 5:15 a.m.
# HG changeset patch
# User Gregory Szorc <gregory.szorc@gmail.com>
# Date 1489454332 25200
#      Mon Mar 13 18:18:52 2017 -0700
# Node ID 6d3b13f243bea2c9200eecb827d7ec0ea771fc54
# Parent  3b997adb7efece7395fb585a70b2d70522626648
parsers: avoid PyList_Append when parsing obs markers

PyList_Append() has to continuously grow the underlying list as
available storage is exhausted. This adds overhead within tight
loops.

It is better to create a new list of known size then use the
PyList_SET_ITEM macro to populate each element of the list
without error checking. This commit switches us to that
mechanism.

The new code establishes an array to hold PyObject* instances
then iterates the array at the end. The bulk of the new code
is for memory management of the new array. There are some
inline comments explaining how the array is sized.

`hg perfloadmarkers` shows an improvement with this change:

! result: 130264
! wall 0.125479 comb 0.130000 user 0.120000 sys 0.010000 (best of 76)
! wall 0.120487 comb 0.130000 user 0.120000 sys 0.010000 (best of 79)
Gregory Szorc - March 14, 2017, 5:24 a.m.
On Mon, Mar 13, 2017 at 10:15 PM, Gregory Szorc <gregory.szorc@gmail.com>
wrote:

> # HG changeset patch
> # User Gregory Szorc <gregory.szorc@gmail.com>
> # Date 1489454332 25200
> #      Mon Mar 13 18:18:52 2017 -0700
> # Node ID 6d3b13f243bea2c9200eecb827d7ec0ea771fc54
> # Parent  3b997adb7efece7395fb585a70b2d70522626648
> parsers: avoid PyList_Append when parsing obs markers
>
> PyList_Append() has to continuously grow the underlying list as
> available storage is exhausted. This adds overhead within tight
> loops.
>
> It is better to create a new list of known size then use the
> PyList_SET_ITEM macro to populate each element of the list
> without error checking. This commit switches us to that
> mechanism.
>
> The new code establishes an array to hold PyObject* instances
> then iterates the array at the end. The bulk of the new code
> is for memory management of the new array. There are some
> inline comments explaining how the array is sized.
>
> `hg perfloadmarkers` show an improvement with this change:
>
> ! result: 130264
> ! wall 0.125479 comb 0.130000 user 0.120000 sys 0.010000 (best of 76)
> ! wall 0.120487 comb 0.130000 user 0.120000 sys 0.010000 (best of 79)
>

This series makes obs marker loading (`hg perfloadmarkers`) ~4x faster.

However, on that same repo where this series made just loading ~95ms faster
(which is nothing to sneeze at), `hg log -r .` decreased from ~740ms to
~440ms. So I guess the lazy loading of obs markers has benefits to other
operations. Perhaps we're loading the obs store multiple times? Or perhaps
my series busted things so much in the RFC parts that the numbers are
misleading.

Assuming there is some truth to the numbers, holy cow the overhead of
PyObject creation is crazy. I suddenly feel like auditing the C code for
everywhere we're instantiating PyObjects before it is necessary. I know we
do this in the revlog index. I wonder if dirstate is impacted...


>
> diff --git a/mercurial/parsers.c b/mercurial/parsers.c
> --- a/mercurial/parsers.c
> +++ b/mercurial/parsers.c
> @@ -2792,33 +2792,77 @@ static PyObject *fm1readmarkers(PyObject
>         int datalen;
>         Py_ssize_t offset, stop;
>         PyObject *markers = NULL;
> +       PyObject **tmpmarkers = NULL;
> +       Py_ssize_t tmpmarkerssize;
> +       Py_ssize_t tmpmarkersoffset = 0;
>
>         if (!PyArg_ParseTuple(args, "s#nn", &data, &datalen, &offset,
> &stop)) {
>                 return NULL;
>         }
>         dataend = data + datalen;
>         data += offset;
> -       markers = PyList_New(0);
> -       if (!markers) {
> +
> +       /*
> +          We store an array of pointers to markers then build a list from
> +          it at the end. This is faster than appending to a PyList for
> each
> +          element.
> +
> +          Reallocating the array constantly could be expensive. We
> estimate
> +          the size of the array by assuming 90 bytes per marker, which is
> +          based on data seen in the wild.
> +       */
> +       tmpmarkerssize = datalen / 90;
> +       /* Handle datalen < 90 */
> +       tmpmarkerssize = tmpmarkerssize ? tmpmarkerssize : 10;
> +
> +       tmpmarkers = (PyObject **)PyMem_Malloc(
> +               tmpmarkerssize * sizeof(PyObject *));
> +       if (!tmpmarkers) {
> +               PyErr_NoMemory();
>                 return NULL;
>         }
> +
>         while (offset < stop) {
>                 uint32_t msize;
> -               int error;
> +               void *tmpbuffer;
>                 PyObject *record = fm1readmarker(data, dataend, &msize);
>                 if (!record) {
>                         goto bail;
>                 }
> -               error = PyList_Append(markers, record);
> -               Py_DECREF(record);
> -               if (error) {
> -                       goto bail;
> +
> +               /* Allocate more array storage if needed */
> +               if (tmpmarkersoffset + 1 == tmpmarkerssize) {
> +                       /* 5000 chosen somewhat arbitrarily. */
> +                       tmpmarkerssize += 5000;
> +                       tmpbuffer = PyMem_Realloc(tmpmarkers,
> +                               tmpmarkerssize * sizeof(PyObject*));
> +                       if (!tmpbuffer) {
> +                               PyErr_NoMemory();
> +                               goto bail;
> +                       }
> +                       tmpmarkers = (PyObject **)tmpbuffer;
>                 }
> +
> +               tmpmarkers[tmpmarkersoffset++] = record;
>                 data += msize;
>                 offset += msize;
>         }
> +
> +       /* Convert our array of pointers to a PyList */
> +       markers = PyList_New(tmpmarkersoffset);
> +       if (!markers) {
> +               goto bail;
> +       }
> +
> +       for (offset = 0; offset < tmpmarkersoffset; offset++) {
> +               PyList_SET_ITEM(markers, offset, tmpmarkers[offset]);
> +       }
> +
> +       free(tmpmarkers);
> +
>         return markers;
>  bail:
> +       free(tmpmarkers);
>         Py_DECREF(markers);
>         return NULL;
>  }
>
Yuya Nishihara - March 16, 2017, 2:20 p.m.
On Mon, 13 Mar 2017 22:15:45 -0700, Gregory Szorc wrote:
> # HG changeset patch
> # User Gregory Szorc <gregory.szorc@gmail.com>
> # Date 1489454332 25200
> #      Mon Mar 13 18:18:52 2017 -0700
> # Node ID 6d3b13f243bea2c9200eecb827d7ec0ea771fc54
> # Parent  3b997adb7efece7395fb585a70b2d70522626648
> parsers: avoid PyList_Append when parsing obs markers
> 
> PyList_Append() has to continuously grow the underlying list as
> available storage is exhausted. This adds overhead within tight
> loops.
> 
> It is better to create a new list of known size then use the
> PyList_SET_ITEM macro to populate each element of the list
> without error checking. This commit switches us to that
> mechanism.
> 
> The new code establishes an array to hold PyObject* instances
> then iterates the array at the end. The bulk of the new code
> is for memory management of the new array. There are some
> inline comments explaining how the array is sized.
> 
> `hg perfloadmarkers` show an improvement with this change:
> 
> ! result: 130264
> ! wall 0.125479 comb 0.130000 user 0.120000 sys 0.010000 (best of 76)
> ! wall 0.120487 comb 0.130000 user 0.120000 sys 0.010000 (best of 79)

I'm not so enthusiastic about this perf number compared to the complexity
of additional memory management. Does this contribute to the net perf win?

> --- a/mercurial/parsers.c
> +++ b/mercurial/parsers.c
> @@ -2792,33 +2792,77 @@ static PyObject *fm1readmarkers(PyObject
>  	int datalen;
>  	Py_ssize_t offset, stop;
>  	PyObject *markers = NULL;
> +	PyObject **tmpmarkers = NULL;
> +	Py_ssize_t tmpmarkerssize;
> +	Py_ssize_t tmpmarkersoffset = 0;
>  
>  	if (!PyArg_ParseTuple(args, "s#nn", &data, &datalen, &offset, &stop)) {
>  		return NULL;
>  	}
>  	dataend = data + datalen;
>  	data += offset;
> -	markers = PyList_New(0);
> -	if (!markers) {
> +
> +	/*
> +	   We store an array of pointers to markers then build a list from
> +	   it at the end. This is faster than appending to a PyList for each
> +	   element.
> +
> +	   Reallocating the array constantly could be expensive. We estimate
> +	   the size of the array by assuming 90 bytes per marker, which is
> +	   based on data seen in the wild.
> +	*/
> +	tmpmarkerssize = datalen / 90;
> +	/* Handle datalen < 90 */
> +	tmpmarkerssize = tmpmarkerssize ? tmpmarkerssize : 10;
> +
> +	tmpmarkers = (PyObject **)PyMem_Malloc(
> +		tmpmarkerssize * sizeof(PyObject *));
> +	if (!tmpmarkers) {
> +		PyErr_NoMemory();
>  		return NULL;
>  	}

Maybe we could do PyList_New(estimated_capacity) and shrink it at the end by
PyList_SetSlice(). The API doc doesn't state that PyList_SetSlice() can handle
NULL items, but builtin_zip() appears to do that.

  Python/bltinmodule.c:builtin_zip()

> +	free(tmpmarkers);

This should be PyMem_Free() for consistency.
Augie Fackler - March 16, 2017, 4:50 p.m.
On Mon, Mar 13, 2017 at 10:24:21PM -0700, Gregory Szorc wrote:
> On Mon, Mar 13, 2017 at 10:15 PM, Gregory Szorc <gregory.szorc@gmail.com>
> wrote:
> Assuming there is some truth to the numbers, holy cow the overhead of
> PyObject creation is crazy. I suddenly feel like auditing the C code for
> everywhere we're instantiating PyObjects before it is necessary. I know we
> do this in the revlog index. I wonder if dirstate is impacted...

Yeah, PyObject allocation can be pretty nasty. In my toy rust version
of fm1readmarkers, it's 100x faster than the C code was, then 3x
slower overall due to time spent copying data into PyObjects.
Gregory Szorc - March 16, 2017, 5:09 p.m.
On Thu, Mar 16, 2017 at 9:50 AM, Augie Fackler <raf@durin42.com> wrote:

> On Mon, Mar 13, 2017 at 10:24:21PM -0700, Gregory Szorc wrote:
> > On Mon, Mar 13, 2017 at 10:15 PM, Gregory Szorc <gregory.szorc@gmail.com
> >
> > wrote:
> > Assuming there is some truth to the numbers, holy cow the overhead of
> > PyObject creation is crazy. I suddenly feel like auditing the C code for
> > everywhere we're instantiating PyObjects before it is necessary. I know
> we
> > do this in the revlog index. I wonder if dirstate is impacted...
>
> Yeah, PyObject allocation can be pretty nasty. In my toy rust version
> of fm1readmarkers, it's 100x faster than the C code was, then 3x
> slower overall due to time spent copying data into PyObjects.
>

!!!

If it is 3x slower in the end, I highly suspect something sub-optimal with
the Rust -> PyObject code path. My guess would be having to allocate memory
and copy data for the hashes into PyBytes instances. We can leverage
memoryview or any object implementing the buffer protocol so we can avoid
that copy. You still pay for the PyObject overhead. But at least you aren't
making extra copies of data.

Also, I'm really curious when obs markers actually need to be realized.
There is validation after parsing that successors aren't nullid. That can
easily be inlined into C or deferred until first access. In theory, we
could create our own Python type behaving as a sequence that turned C
structs to PyObjects on demand. I'm just not sure if that buys us anything.
The second you iterate over markers from Python, you throw away that
optimization.

Perhaps it would be better for the obs marker API to conceptually be a
black box store that exposed primitives like "find X for <set>." That way,
all the expensive computation can be done in C on raw data structures and
we only realize PyObjects as needed. For cases where you are dealing with
lists of fixed-width values, you could return an array of packed data
instead of N PyObjects. Come to think of it, the computation of just hidden
nodes might be the only super critical part to be all in Python. If we can
do that in C without realizing tons of PyObjects, I think a lot of read-only
operations/commands just got a lot faster.
Augie Fackler - March 16, 2017, 5:12 p.m.
> On Mar 16, 2017, at 10:09, Gregory Szorc <gregory.szorc@gmail.com> wrote:
> 
>> On Thu, Mar 16, 2017 at 9:50 AM, Augie Fackler <raf@durin42.com> wrote:
>> On Mon, Mar 13, 2017 at 10:24:21PM -0700, Gregory Szorc wrote:
>> > On Mon, Mar 13, 2017 at 10:15 PM, Gregory Szorc <gregory.szorc@gmail.com>
>> > wrote:
>> > Assuming there is some truth to the numbers, holy cow the overhead of
>> > PyObject creation is crazy. I suddenly feel like auditing the C code for
>> > everywhere we're instantiating PyObjects before it is necessary. I know we
>> > do this in the revlog index. I wonder if dirstate is impacted...
>> 
>> Yeah, PyObject allocation can be pretty nasty. In my toy rust version
>> of fm1readmarkers, it's 100x faster than the C code was, then 3x
>> slower overall due to time spent copying data into PyObjects.
>> 
> !!!

Yeah, the actual parsing step was so fast I almost didn't believe it. Zero-copy parsers are pretty great.

> 
> If it is 3x slower in the end, I highly suspect something sub-optimal with the Rust -> PyObject code path. My guess would be having to allocate memory and copy data for the hashes into PyBytes instances. We can leverage memoryview or any object implementing the buffer protocol so we can avoid that copy. You still pay for the PyObject overhead. But at least you aren't making extra copies of data.

My suspicion is that it's all the string copies, yeah. I also have wondered about the bindings I'm using doing something suboptimal with the GIL, but I haven't had time to wade through it or try and profile it.

> 
> Also, I'm really curious when obs markers actually need to be realized. There is validation after parsing that successors aren't nullid. That can easily be inlined into C or deferred until first access. In theory, we could create our own Python type behaving as a sequence that turned C structs to PyObjects on demand. I'm just not sure if that buys us anything. The second you iterate over markers from Python, you throw away that optimization. 
> 
> Perhaps it would be better for the obs marker API to conceptually be a black box store that exposed primitives like "find X for <set>." That way, all the expensive computation can be done in C on raw data structures and we only realize PyObjects as needed. For cases where you are dealing with lists of fixed-width values, you could return an array of packed data instead of N PyObjects. Come to think of it, the computation of just hidden nodes might be the only super critical part to be all in Python. If we can do that in C without realizing tons PyObjects, I think a lot of read-only operations/commands just got a lot faster.

If we build that interface, the rust code I've got sitting around (using nom) is probably actually worth exploring in more detail.
Gregory Szorc - March 16, 2017, 5:43 p.m.
On Thu, Mar 16, 2017 at 10:12 AM, Augie Fackler <raf@durin42.com> wrote:

>
> > On Mar 16, 2017, at 10:09, Gregory Szorc <gregory.szorc@gmail.com>
> wrote:
> >
> >> On Thu, Mar 16, 2017 at 9:50 AM, Augie Fackler <raf@durin42.com> wrote:
> >> On Mon, Mar 13, 2017 at 10:24:21PM -0700, Gregory Szorc wrote:
> >> > On Mon, Mar 13, 2017 at 10:15 PM, Gregory Szorc <
> gregory.szorc@gmail.com>
> >> > wrote:
> >> > Assuming there is some truth to the numbers, holy cow the overhead of
> >> > PyObject creation is crazy. I suddenly feel like auditing the C code
> for
> >> > everywhere we're instantiating PyObjects before it is necessary. I
> know we
> >> > do this in the revlog index. I wonder if dirstate is impacted...
> >>
> >> Yeah, PyObject allocation can be pretty nasty. In my toy rust version
> >> of fm1readmarkers, it's 100x faster than the C code was, then 3x
> >> slower overall due to time spent copying data into PyObjects.
> >>
> > !!!
>
> Yeah, the actual parsing step was so fast I almost didn't believe it.
> Zero-copy parsers are pretty great.
>
> >
> > If it is 3x slower in the end, I highly suspect something sub-optimal
> with the Rust -> PyObject code path. My guess would be having to allocate
> memory and copy data for the hashes into PyBytes instances. We can leverage
> memoryview or any object implementing the buffer protocol so we can avoid
> that copy. You still pay for the PyObject overhead. But at least you aren't
> making extra copies of data.
>
> My suspicion is that it's all the string copies, yeah. I also have
> wondered about the bindings I'm using doing something suboptimal with the
> GIL, but I haven't had time to wade through it or try and profile it.
>
> >
> > Also, I'm really curious when obs markers actually need to be realized.
> There is validation after parsing that successors aren't nullid. That can
> easily be inlined into C or deferred until first access. In theory, we
> could create our own Python type behaving as a sequence that turned C
> structs to PyObjects on demand. I'm just not sure if that buys us anything.
> The second you iterate over markers from Python, you throw away that
> optimization.
> >
> > Perhaps it would be better for the obs marker API to conceptually be a
> black box store that exposed primitives like "find X for <set>." That way,
> all the expensive computation can be done in C on raw data structures and
> we only realize PyObjects as needed. For cases where you are dealing with
> lists of fixed-width values, you could return an array of packed data
> instead of N PyObjects. Come to think of it, the computation of just hidden
> nodes might be the only super critical part to be all in Python. If we can
> do that in C without realizing tons PyObjects, I think a lot of read-only
> operations/commands just got a lot faster.
>
> If we build that interface, the rust code I've got sitting around (using
> nom) is probably actually worth exploring in more detail.


Last time we had the Rust discussion, we couldn't ship Rust due to distro
packaging support. Has that changed?

FWIW, Firefox appears to be driving a lot of distro support for Rust.
Firefox 54+ now require Rust to build. Furthermore, Firefox is pretty
aggressive about adopting new versions of Rust. What this means is that the
next Firefox ESR (after 52, which was released last week) will effectively
require distros to support whatever version of Rust that ESR is using. I'm
not exactly sure where distros are w.r.t. Rust support today. But the major ones
all know they need to get their Rust story in order for the next Firefox
ESR, which I think is sometime in 2018. Ask me over beers what those
conversations were like :)
Augie Fackler - March 16, 2017, 5:47 p.m.
> On Mar 16, 2017, at 10:43, Gregory Szorc <gregory.szorc@gmail.com> wrote:
> 
> Last time we had the Rust discussion, we couldn't ship Rust due to distro packaging support. Has that changed?

My gut feeling with my maintainer hat on is that any rust speedups would need to be opt-in, much like today's C speedups. My understanding of Rust is that it'll have significantly fewer ABI woes for our needs than C++ might.

> FWIW, Firefox appears to be driving a lot of distro support for Rust. Firefox 54+ now require Rust to build. Furthermore, Firefox is pretty aggressive about adopting new versions of Rust. What this means is that the next Firefox ESR (after 52, which was released last week) will effectively require distros to support whatever version of Rust that ESR is using. I'm not exactly where distros are w.r.t. Rust support today. But the major ones all know they need to get their Rust story in order for the next Firefox ESR, which I think is sometime in 2018. Ask me over beers what those conversations were like :)
Gregory Szorc - March 18, 2017, 5:06 p.m.
On Thu, Mar 16, 2017 at 10:47 AM, Augie Fackler <raf@durin42.com> wrote:

>
> On Mar 16, 2017, at 10:43, Gregory Szorc <gregory.szorc@gmail.com> wrote:
>
> Last time we had the Rust discussion, we couldn't ship Rust due to distro
> packaging support. Has that changed?
>
>
> My gut feeling with my maintainer hat on is that any rust speedups would
> need to be opt-in, much like today's C speedups. My understanding of Rust
> is that it'll have significantly fewer ABI woes for our needs than C++
> might.
>

I also agree I'd rather start using Rust before C++.

It's worth noting that while our C speedups are strictly speaking optional,
setup.py tries to build them by default and fails if it can't. Furthermore,
our default module import policy requires the C extensions by default. This
isn't a huge problem today because many systems can build Python C
extensions (or at least can easily after installing a python-dev or
python-devel package). But Rust support in distros isn't quite there. So I
fear requiring Rust today will make it more difficult to get the speedups
than the corresponding C versions.


>
> FWIW, Firefox appears to be driving a lot of distro support for Rust.
> Firefox 54+ now require Rust to build. Furthermore, Firefox is pretty
> aggressive about adopting new versions of Rust. What this means is that the
> next Firefox ESR (after 52, which was released last week) will effectively
> require distros to support whatever version of Rust that ESR is using. I'm
> not exactly where distros are w.r.t. Rust support today. But the major ones
> all know they need to get their Rust story in order for the next Firefox
> ESR, which I think is sometime in 2018. Ask me over beers what those
> conversations were like :)
>
>

Patch

diff --git a/mercurial/parsers.c b/mercurial/parsers.c
--- a/mercurial/parsers.c
+++ b/mercurial/parsers.c
@@ -2792,33 +2792,77 @@  static PyObject *fm1readmarkers(PyObject
 	int datalen;
 	Py_ssize_t offset, stop;
 	PyObject *markers = NULL;
+	PyObject **tmpmarkers = NULL;
+	Py_ssize_t tmpmarkerssize;
+	Py_ssize_t tmpmarkersoffset = 0;
 
 	if (!PyArg_ParseTuple(args, "s#nn", &data, &datalen, &offset, &stop)) {
 		return NULL;
 	}
 	dataend = data + datalen;
 	data += offset;
-	markers = PyList_New(0);
-	if (!markers) {
+
+	/*
+	   We store an array of pointers to markers then build a list from
+	   it at the end. This is faster than appending to a PyList for each
+	   element.
+
+	   Reallocating the array constantly could be expensive. We estimate
+	   the size of the array by assuming 90 bytes per marker, which is
+	   based on data seen in the wild.
+	*/
+	tmpmarkerssize = datalen / 90;
+	/* Handle datalen < 90 */
+	tmpmarkerssize = tmpmarkerssize ? tmpmarkerssize : 10;
+
+	tmpmarkers = (PyObject **)PyMem_Malloc(
+		tmpmarkerssize * sizeof(PyObject *));
+	if (!tmpmarkers) {
+		PyErr_NoMemory();
 		return NULL;
 	}
+
 	while (offset < stop) {
 		uint32_t msize;
-		int error;
+		void *tmpbuffer;
 		PyObject *record = fm1readmarker(data, dataend, &msize);
 		if (!record) {
 			goto bail;
 		}
-		error = PyList_Append(markers, record);
-		Py_DECREF(record);
-		if (error) {
-			goto bail;
+
+		/* Allocate more array storage if needed */
+		if (tmpmarkersoffset + 1 == tmpmarkerssize) {
+			/* 5000 chosen somewhat arbitrarily. */
+			tmpmarkerssize += 5000;
+			tmpbuffer = PyMem_Realloc(tmpmarkers,
+				tmpmarkerssize * sizeof(PyObject*));
+			if (!tmpbuffer) {
+				PyErr_NoMemory();
+				goto bail;
+			}
+			tmpmarkers = (PyObject **)tmpbuffer;
 		}
+
+		tmpmarkers[tmpmarkersoffset++] = record;
 		data += msize;
 		offset += msize;
 	}
+
+	/* Convert our array of pointers to a PyList */
+	markers = PyList_New(tmpmarkersoffset);
+	if (!markers) {
+		goto bail;
+	}
+
+	for (offset = 0; offset < tmpmarkersoffset; offset++) {
+		PyList_SET_ITEM(markers, offset, tmpmarkers[offset]);
+	}
+
+	free(tmpmarkers);
+
 	return markers;
 bail:
+	free(tmpmarkers);
 	Py_DECREF(markers);
 	return NULL;
 }