Patchwork [1,of,2,V2] perf: introduce a perfrevlogwrite command

login
register
mail settings
Submitter Boris Feld
Date Oct. 19, 2018, 12:25 p.m.
Message ID <78ffc55c0d6034972ea3.1539951913@localhost.localdomain>
Download mbox | patch
Permalink /patch/36168/
State Superseded
Headers show

Comments

Boris Feld - Oct. 19, 2018, 12:25 p.m.
# HG changeset patch
# User Boris Feld <boris.feld@octobus.net>
# Date 1538556809 -7200
#      Wed Oct 03 10:53:29 2018 +0200
# Node ID 78ffc55c0d6034972ea3d49b8c565c563ef27121
# Parent  824b687ff6af49622a18dfea760cf41a7abe5aa7
# EXP-Topic revlog-perf
# Available At https://bitbucket.org/octobus/mercurial-devel/
#              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r 78ffc55c0d60
perf: introduce a perfrevlogwrite command

The command record times taken by adding many revisions to a revlog. Timing
each addition, individually. The "added revision" are recreations of the
original ones.

To time each addition individually, we have to handle the timing and the
reporting ourselves.

This command is introduced to track the impact of sparse-revlog format on
delta computations at initial storage time. It starts with the full text, a
situation similar to the "commit". Additions from an existing delta are better
timed with bundles.

The complaints from `check-perf-code.py` are not relevant. We are accessing
and "revlog" opener, not a repository opener.
Augie Fackler - Oct. 22, 2018, 3:54 p.m.
On Fri, Oct 19, 2018 at 02:25:13PM +0200, Boris Feld wrote:
> # HG changeset patch
> # User Boris Feld <boris.feld@octobus.net>
> # Date 1538556809 -7200
> #      Wed Oct 03 10:53:29 2018 +0200
> # Node ID 78ffc55c0d6034972ea3d49b8c565c563ef27121
> # Parent  824b687ff6af49622a18dfea760cf41a7abe5aa7
> # EXP-Topic revlog-perf
> # Available At https://bitbucket.org/octobus/mercurial-devel/
> #              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r 78ffc55c0d60
> perf: introduce a perfrevlogwrite command

Unless you can give an /extremely/ compelling reason, I'm going to
punt this to 4.9.

>
> The command record times taken by adding many revisions to a revlog. Timing
> each addition, individually. The "added revision" are recreations of the
> original ones.
>
> To time each addition individually, we have to handle the timing and the
> reporting ourselves.
>
> This command is introduced to track the impact of sparse-revlog format on
> delta computations at initial storage time. It starts with the full text, a
> situation similar to the "commit". Additions from an existing delta are better
> timed with bundles.
>
> The complaints from `check-perf-code.py` are not relevant. We are accessing
> and "revlog" opener, not a repository opener.
>
> diff --git a/contrib/perf.py b/contrib/perf.py
> --- a/contrib/perf.py
> +++ b/contrib/perf.py
> @@ -24,8 +24,10 @@ import functools
>  import gc
>  import os
>  import random
> +import shutil
>  import struct
>  import sys
> +import tempfile
>  import threading
>  import time
>  from mercurial import (
> @@ -1565,6 +1567,149 @@ def perfrevlogrevisions(ui, repo, file_=
>      timer(d)
>      fm.end()
>
> +@command(b'perfrevlogwrite', revlogopts + formatteropts +
> +         [(b's', b'startrev', 1000, b'revision to start writing at'),
> +          (b'', b'stoprev', -1, b'last revision to write'),
> +          (b'', b'count', 3, b'last revision to write'),
> +         ],
> +         b'-c|-m|FILE')
> +def perfrevlogwrite(ui, repo, file_=None, startrev=0, stoprev=-1, **opts):
> +    """Benchmark writing a series of revisions to a revlog.
> +    """
> +    opts = _byteskwargs(opts)
> +
> +    rl = cmdutil.openrevlog(repo, b'perfrevlogwrite', file_, opts)
> +    rllen = getlen(ui)(rl)
> +    if startrev < 0:
> +        startrev = rllen + startrev
> +    if stoprev < 0:
> +        stoprev = rllen + stoprev
> +
> +    ### actually gather results
> +    count = opts['count']
> +    if count <= 0:
> +        raise error.Abort('invalide run count: %d' % count)

typo: invalid (spurious e on the end in yours)

> +    allresults = []
> +    for c in range(count):
> +        allresults.append(_timeonewrite(ui, rl, startrev, stoprev, c + 1))
> +
> +    ### consolidate the results in a single list
> +    results = []
> +    for idx, (rev, t) in enumerate(allresults[0]):
> +        ts = [t]
> +        for other in allresults[1:]:
> +            orev, ot = other[idx]
> +            assert orev == rev
> +            ts.append(ot)
> +        results.append((rev, ts))
> +    resultcount = len(results)
> +
> +    ### Compute and display relevant statistics
> +
> +    # get a formatter
> +    fm = ui.formatter(b'perf', opts)
> +    displayall = ui.configbool(b"perf", b"all-timing", False)
> +
> +    # sorts results by median time
> +    results.sort(key=lambda x: sorted(x[1])[len(x[1]) // 2])
> +    # list of (name, index) to display)
> +    relevants = [
> +        ("min", 0),
> +        ("10%", resultcount * 10 // 100),
> +        ("25%", resultcount * 25 // 100),
> +        ("50%", resultcount * 70 // 100),
> +        ("75%", resultcount * 75 // 100),
> +        ("90%", resultcount * 90 // 100),
> +        ("95%", resultcount * 95 // 100),
> +        ("99%", resultcount * 99 // 100),
> +        ("max", -1),
> +    ]

Clarifying what this block does: you sort results by median time
(median of what? I'm a little lost) and then use integer division to
jump into the result set at the right position for emulating the
various %ile latencies, right?

Some sample output in the commit message would be nice.

> +    for name, idx in relevants:
> +        data = results[idx]
> +        title = '%s of %d, rev %d' % (name, resultcount, data[0])
> +        formatone(fm, data[1], title=title, displayall=displayall)
> +
> +    # XXX summing that many float will not be very precise, we ignore this fact
> +    # for now

Make this a TODO, or just a NOTE? it seems extremely unlikely we'll
ever do it, so maybe just make it "note that summing this many flaots
will ..." and make peace with that?

> +    totaltime = []
> +    for item in allresults:
> +        totaltime.append((sum(x[1][0] for x in item),
> +                          sum(x[1][1] for x in item),
> +                          sum(x[1][2] for x in item),)
> +        )
> +    formatone(fm, totaltime, title="total time (%d revs)" % resultcount,
> +              displayall=displayall)
> +    fm.end()
> +
> +def _timeonewrite(ui, orig, startrev, stoprev, runidx=None):
> +    timings = []
> +    tr = type('faketr', (object,) , {'add': lambda s, x, y, z=None:None})()
> +    with _temprevlog(ui, orig, startrev) as dest:
> +        revs = list(orig.revs(startrev, stoprev))
> +        total = len(revs)
> +        topic = 'adding'
> +        if runidx is not None:
> +            topic += ' (run #%d)' % runidx
> +        for idx, rev in enumerate(revs):
> +            ui.progress(topic, idx, unit='revs', total=total)
> +            text = orig.revision(rev)
> +            linkrev = orig.linkrev(rev)
> +            node = orig.node(rev)
> +            p1, p2 = orig.parents(node)
> +            with timeone() as r:
> +                dest.addrevision(text, tr, linkrev, p1, p2, node=node)
> +            timings.append((rev, r[0]))
> +        ui.progress(topic, total, unit='revs', total=total)
> +        ui.progress(topic, None, unit='revs', total=total)
> +    return timings
> +
> +@contextlib.contextmanager
> +def _temprevlog(ui, orig, truncaterev):
> +    from mercurial import vfs as vfsmod
> +
> +    if orig._inline:
> +        raise error.Abort('not supporting inline revlog (yet)')
> +
> +    origindexpath = orig.opener.join(orig.indexfile)
> +    origdatapath = orig.opener.join(orig.datafile)
> +    indexname = 'revlog.i'
> +    dataname = 'revlog.d'
> +
> +    tmpdir = tempfile.mkdtemp(prefix='tmp-hgperf-')
> +    try:
> +        # copy the data file in a temporary directory
> +        ui.debug('copying data in %s\n' % tmpdir)
> +        destindexpath = os.path.join(tmpdir, 'revlog.i')
> +        destdatapath = os.path.join(tmpdir, 'revlog.d')
> +        shutil.copyfile(origindexpath, destindexpath)
> +        shutil.copyfile(origdatapath, destdatapath)
> +
> +        # remove the data we want to add again
> +        ui.debug('truncating data to be rewritten\n')
> +        with open(destindexpath, 'ab') as index:
> +            index.seek(0)
> +            index.truncate(truncaterev * orig._io.size)
> +        with open(destdatapath, 'ab') as data:
> +            data.seek(0)
> +            data.truncate(orig.start(truncaterev))
> +
> +        # instantiate a new revlog from the temporary copy
> +        ui.debug('truncating adding to be rewritten\n')
> +        vfs = vfsmod.vfs(tmpdir)
> +        vfs.options = getattr(orig.opener, 'options', None)
> +
> +        dest = revlog.revlog(vfs,
> +                             indexfile=indexname,
> +                             datafile=dataname)
> +        if dest._inline:
> +            raise error.Abort('not supporting inline revlog (yet)')
> +        # make sure internals are initialized
> +        dest.revision(len(dest) - 1)
> +        yield dest
> +        del dest, vfs
> +    finally:
> +        shutil.rmtree(tmpdir, True)
> +
>  @command(b'perfrevlogchunks', revlogopts + formatteropts +
>           [(b'e', b'engines', b'', b'compression engines to use'),
>            (b's', b'startrev', 0, b'revision to start at')],
> diff --git a/tests/test-contrib-perf.t b/tests/test-contrib-perf.t
> --- a/tests/test-contrib-perf.t
> +++ b/tests/test-contrib-perf.t
> @@ -114,6 +114,8 @@ perfstatus
>                   Benchmark obtaining a revlog revision.
>     perfrevlogrevisions
>                   Benchmark reading a series of revisions from a revlog.
> +   perfrevlogwrite
> +                 Benchmark writing a series of revisions to a revlog.
>     perfrevrange  (no help text available)
>     perfrevset    benchmark the execution time of a revset
>     perfstartup   (no help text available)
> @@ -265,4 +267,16 @@ Check perf.py for historical portability
>    contrib/perf.py:\d+: (re)
>     >     from mercurial import (
>     import newer module separately in try clause for early Mercurial
> +  contrib/perf.py:\d+: (re)
> +   >     origindexpath = orig.opener.join(orig.indexfile)
> +   use getvfs()/getsvfs() for early Mercurial
> +  contrib/perf.py:\d+: (re)
> +   >     origdatapath = orig.opener.join(orig.datafile)
> +   use getvfs()/getsvfs() for early Mercurial
> +  contrib/perf.py:\d+: (re)
> +   >         vfs = vfsmod.vfs(tmpdir)
> +   use getvfs()/getsvfs() for early Mercurial
> +  contrib/perf.py:\d+: (re)
> +   >         vfs.options = getattr(orig.opener, 'options', None)
> +   use getvfs()/getsvfs() for early Mercurial
>    [1]
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Yuya Nishihara - Oct. 25, 2018, 11:24 a.m.
On Fri, 19 Oct 2018 14:25:13 +0200, Boris Feld wrote:
> # HG changeset patch
> # User Boris Feld <boris.feld@octobus.net>
> # Date 1538556809 -7200
> #      Wed Oct 03 10:53:29 2018 +0200
> # Node ID 78ffc55c0d6034972ea3d49b8c565c563ef27121
> # Parent  824b687ff6af49622a18dfea760cf41a7abe5aa7
> # EXP-Topic revlog-perf
> # Available At https://bitbucket.org/octobus/mercurial-devel/
> #              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r 78ffc55c0d60
> perf: introduce a perfrevlogwrite command

Looks generally good.

> +@command(b'perfrevlogwrite', revlogopts + formatteropts +
> +         [(b's', b'startrev', 1000, b'revision to start writing at'),
> +          (b'', b'stoprev', -1, b'last revision to write'),
> +          (b'', b'count', 3, b'last revision to write'),
> +         ],
> +         b'-c|-m|FILE')
> +def perfrevlogwrite(ui, repo, file_=None, startrev=0, stoprev=-1, **opts):

Which is the default startrev value? 1000 or 0?

> +def _timeonewrite(ui, orig, startrev, stoprev, runidx=None):
> +    timings = []
> +    tr = type('faketr', (object,) , {'add': lambda s, x, y, z=None:None})()

Nit: I prefer using 'class faketr' syntax.
Boris Feld - Oct. 26, 2018, 12:26 p.m.
On 25/10/2018 13:24, Yuya Nishihara wrote:
> On Fri, 19 Oct 2018 14:25:13 +0200, Boris Feld wrote:
>> # HG changeset patch
>> # User Boris Feld <boris.feld@octobus.net>
>> # Date 1538556809 -7200
>> #      Wed Oct 03 10:53:29 2018 +0200
>> # Node ID 78ffc55c0d6034972ea3d49b8c565c563ef27121
>> # Parent  824b687ff6af49622a18dfea760cf41a7abe5aa7
>> # EXP-Topic revlog-perf
>> # Available At https://bitbucket.org/octobus/mercurial-devel/
>> #              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r 78ffc55c0d60
>> perf: introduce a perfrevlogwrite command
> Looks generally good.
>
>> +@command(b'perfrevlogwrite', revlogopts + formatteropts +
>> +         [(b's', b'startrev', 1000, b'revision to start writing at'),
>> +          (b'', b'stoprev', -1, b'last revision to write'),
>> +          (b'', b'count', 3, b'last revision to write'),
>> +         ],
>> +         b'-c|-m|FILE')
>> +def perfrevlogwrite(ui, repo, file_=None, startrev=0, stoprev=-1, **opts):
> Which is the default startrev value? 1000 or 0?

The default is 1000, we should update the function default value.

We will send a V3, with an explicit class declaration.

Patch

diff --git a/contrib/perf.py b/contrib/perf.py
--- a/contrib/perf.py
+++ b/contrib/perf.py
@@ -24,8 +24,10 @@  import functools
 import gc
 import os
 import random
+import shutil
 import struct
 import sys
+import tempfile
 import threading
 import time
 from mercurial import (
@@ -1565,6 +1567,149 @@  def perfrevlogrevisions(ui, repo, file_=
     timer(d)
     fm.end()
 
+@command(b'perfrevlogwrite', revlogopts + formatteropts +
+         [(b's', b'startrev', 1000, b'revision to start writing at'),
+          (b'', b'stoprev', -1, b'last revision to write'),
+          (b'', b'count', 3, b'last revision to write'),
+         ],
+         b'-c|-m|FILE')
+def perfrevlogwrite(ui, repo, file_=None, startrev=0, stoprev=-1, **opts):
+    """Benchmark writing a series of revisions to a revlog.
+    """
+    opts = _byteskwargs(opts)
+
+    rl = cmdutil.openrevlog(repo, b'perfrevlogwrite', file_, opts)
+    rllen = getlen(ui)(rl)
+    if startrev < 0:
+        startrev = rllen + startrev
+    if stoprev < 0:
+        stoprev = rllen + stoprev
+
+    ### actually gather results
+    count = opts['count']
+    if count <= 0:
+        raise error.Abort('invalide run count: %d' % count)
+    allresults = []
+    for c in range(count):
+        allresults.append(_timeonewrite(ui, rl, startrev, stoprev, c + 1))
+
+    ### consolidate the results in a single list
+    results = []
+    for idx, (rev, t) in enumerate(allresults[0]):
+        ts = [t]
+        for other in allresults[1:]:
+            orev, ot = other[idx]
+            assert orev == rev
+            ts.append(ot)
+        results.append((rev, ts))
+    resultcount = len(results)
+
+    ### Compute and display relevant statistics
+
+    # get a formatter
+    fm = ui.formatter(b'perf', opts)
+    displayall = ui.configbool(b"perf", b"all-timing", False)
+
+    # sorts results by median time
+    results.sort(key=lambda x: sorted(x[1])[len(x[1]) // 2])
+    # list of (name, index) to display)
+    relevants = [
+        ("min", 0),
+        ("10%", resultcount * 10 // 100),
+        ("25%", resultcount * 25 // 100),
+        ("50%", resultcount * 70 // 100),
+        ("75%", resultcount * 75 // 100),
+        ("90%", resultcount * 90 // 100),
+        ("95%", resultcount * 95 // 100),
+        ("99%", resultcount * 99 // 100),
+        ("max", -1),
+    ]
+    for name, idx in relevants:
+        data = results[idx]
+        title = '%s of %d, rev %d' % (name, resultcount, data[0])
+        formatone(fm, data[1], title=title, displayall=displayall)
+
+    # XXX summing that many float will not be very precise, we ignore this fact
+    # for now
+    totaltime = []
+    for item in allresults:
+        totaltime.append((sum(x[1][0] for x in item),
+                          sum(x[1][1] for x in item),
+                          sum(x[1][2] for x in item),)
+        )
+    formatone(fm, totaltime, title="total time (%d revs)" % resultcount,
+              displayall=displayall)
+    fm.end()
+
+def _timeonewrite(ui, orig, startrev, stoprev, runidx=None):
+    timings = []
+    tr = type('faketr', (object,) , {'add': lambda s, x, y, z=None:None})()
+    with _temprevlog(ui, orig, startrev) as dest:
+        revs = list(orig.revs(startrev, stoprev))
+        total = len(revs)
+        topic = 'adding'
+        if runidx is not None:
+            topic += ' (run #%d)' % runidx
+        for idx, rev in enumerate(revs):
+            ui.progress(topic, idx, unit='revs', total=total)
+            text = orig.revision(rev)
+            linkrev = orig.linkrev(rev)
+            node = orig.node(rev)
+            p1, p2 = orig.parents(node)
+            with timeone() as r:
+                dest.addrevision(text, tr, linkrev, p1, p2, node=node)
+            timings.append((rev, r[0]))
+        ui.progress(topic, total, unit='revs', total=total)
+        ui.progress(topic, None, unit='revs', total=total)
+    return timings
+
+@contextlib.contextmanager
+def _temprevlog(ui, orig, truncaterev):
+    from mercurial import vfs as vfsmod
+
+    if orig._inline:
+        raise error.Abort('not supporting inline revlog (yet)')
+
+    origindexpath = orig.opener.join(orig.indexfile)
+    origdatapath = orig.opener.join(orig.datafile)
+    indexname = 'revlog.i'
+    dataname = 'revlog.d'
+
+    tmpdir = tempfile.mkdtemp(prefix='tmp-hgperf-')
+    try:
+        # copy the data file in a temporary directory
+        ui.debug('copying data in %s\n' % tmpdir)
+        destindexpath = os.path.join(tmpdir, 'revlog.i')
+        destdatapath = os.path.join(tmpdir, 'revlog.d')
+        shutil.copyfile(origindexpath, destindexpath)
+        shutil.copyfile(origdatapath, destdatapath)
+
+        # remove the data we want to add again
+        ui.debug('truncating data to be rewritten\n')
+        with open(destindexpath, 'ab') as index:
+            index.seek(0)
+            index.truncate(truncaterev * orig._io.size)
+        with open(destdatapath, 'ab') as data:
+            data.seek(0)
+            data.truncate(orig.start(truncaterev))
+
+        # instantiate a new revlog from the temporary copy
+        ui.debug('truncating adding to be rewritten\n')
+        vfs = vfsmod.vfs(tmpdir)
+        vfs.options = getattr(orig.opener, 'options', None)
+
+        dest = revlog.revlog(vfs,
+                             indexfile=indexname,
+                             datafile=dataname)
+        if dest._inline:
+            raise error.Abort('not supporting inline revlog (yet)')
+        # make sure internals are initialized
+        dest.revision(len(dest) - 1)
+        yield dest
+        del dest, vfs
+    finally:
+        shutil.rmtree(tmpdir, True)
+
 @command(b'perfrevlogchunks', revlogopts + formatteropts +
          [(b'e', b'engines', b'', b'compression engines to use'),
           (b's', b'startrev', 0, b'revision to start at')],
diff --git a/tests/test-contrib-perf.t b/tests/test-contrib-perf.t
--- a/tests/test-contrib-perf.t
+++ b/tests/test-contrib-perf.t
@@ -114,6 +114,8 @@  perfstatus
                  Benchmark obtaining a revlog revision.
    perfrevlogrevisions
                  Benchmark reading a series of revisions from a revlog.
+   perfrevlogwrite
+                 Benchmark writing a series of revisions to a revlog.
    perfrevrange  (no help text available)
    perfrevset    benchmark the execution time of a revset
    perfstartup   (no help text available)
@@ -265,4 +267,16 @@  Check perf.py for historical portability
   contrib/perf.py:\d+: (re)
    >     from mercurial import (
    import newer module separately in try clause for early Mercurial
+  contrib/perf.py:\d+: (re)
+   >     origindexpath = orig.opener.join(orig.indexfile)
+   use getvfs()/getsvfs() for early Mercurial
+  contrib/perf.py:\d+: (re)
+   >     origdatapath = orig.opener.join(orig.datafile)
+   use getvfs()/getsvfs() for early Mercurial
+  contrib/perf.py:\d+: (re)
+   >         vfs = vfsmod.vfs(tmpdir)
+   use getvfs()/getsvfs() for early Mercurial
+  contrib/perf.py:\d+: (re)
+   >         vfs.options = getattr(orig.opener, 'options', None)
+   use getvfs()/getsvfs() for early Mercurial
   [1]