Submitter | Boris Feld |
---|---|
Date | Nov. 12, 2018, 9:55 a.m. |
Message ID | <b77a6b74ef31e1b3706c.1542016542@localhost.localdomain> |
Download | mbox | patch |
Permalink | /patch/36515/ |
State | Accepted |
Headers | show |
Comments
On Mon, 12 Nov 2018 10:55:42 +0100, Boris Feld wrote: > # HG changeset patch > # User Boris Feld <boris.feld@octobus.net> > # Date 1541782717 -3600 > # Fri Nov 09 17:58:37 2018 +0100 > # Node ID b77a6b74ef31e1b3706c1c6127a15eede0334f71 > # Parent ddafb271512fc26de60da5dceffc1509bb023d66 > # EXP-Topic sparse-perf > # Available At https://bitbucket.org/octobus/mercurial-devel/ > # hg pull https://bitbucket.org/octobus/mercurial-devel/ -r b77a6b74ef31 > sparse-revlog: rework the way we enforce chunk size limit > > We move from a O(N) algorithm to a O(log(N)) algorithm. > > The previous algorithm was traversing the whole delta chain, looking for the > exact point where it became too big. This would result in most of the delta > chain to be traversed. > > Instead, we now use a "binary" approach, slicing the chain in two until we > have a chunk of the appropriate size. > > We still keep the previous algorithm for the snapshots part. There are few of > them and they are large bits of data distant from each other. So the previous > algorithm should work well in that case. > > To take a practical example of restoring manifest revision '59547c40bc4c' for > a reference NetBeans repository (using sparse-revlog). The media time of the > step `slice-sparse-chain` of `perfrevlogrevision` improve from 1.109 ms to > 0.660 ms. 
> > diff --git a/mercurial/revlogutils/deltas.py b/mercurial/revlogutils/deltas.py > --- a/mercurial/revlogutils/deltas.py > +++ b/mercurial/revlogutils/deltas.py > @@ -176,18 +176,22 @@ def _slicechunktosize(revlog, revs, targ > [[3], [5]] > """ > assert targetsize is None or 0 <= targetsize > - if targetsize is None or segmentspan(revlog, revs) <= targetsize: > + startdata = revlog.start(revs[0]) > + enddata = revlog.end(revs[-1]) > + fullspan = enddata - startdata > + if targetsize is None or fullspan <= targetsize: > yield revs > return > > startrevidx = 0 > - startdata = revlog.start(revs[0]) > endrevidx = 0 > iterrevs = enumerate(revs) > next(iterrevs) # skip first rev. > + # first step: get snapshots out of the way > for idx, r in iterrevs: > span = revlog.end(r) - startdata > - if span <= targetsize: > + snapshot = revlog.issnapshot(r) This should break the doctest if this module were included in test-doctest.py. > + if span <= targetsize and snapshot: > endrevidx = idx > else: > chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1) > @@ -196,6 +200,29 @@ def _slicechunktosize(revlog, revs, targ > startrevidx = idx > startdata = revlog.start(r) > endrevidx = idx > + if not snapshot: > + break > + > + # for the others, we use binary slicing to quickly converge toward valid > + # chunks (otherwise, we might end up looking for start/end of many > + # revisions) > + nbitem = len(revs) > + while (enddata - startdata) > targetsize: > + endrevidx = nbitem > + if nbitem - startrevidx <= 1: > + break # protect against individual chunk larger than limit > + localenddata = revlog.end(revs[endrevidx - 1]) > + span = localenddata - startdata > + while (localenddata - startdata) > targetsize: > + if endrevidx - startrevidx <= 1: > + break # protect against individual chunk larger than limit > + endrevidx -= (endrevidx - startrevidx) // 2 > + localenddata = revlog.end(revs[endrevidx -1]) > + span = localenddata - startdata The variable 'span' isn't used. 
> + yield _trimchunk(revlog, revs, startrevidx, endrevidx) > + startrevidx = endrevidx > + startdata = revlog.start(revs[startrevidx]) Do we have enough test coverage for this logic? FWIW, I'm a little confused that 'endrevidx' only gets smaller and smaller, since I thought this would binary-search for the split point. That turned out to be wrong. We look for approximate positions to split.
On 13/11/2018 13:47, Yuya Nishihara wrote: > On Mon, 12 Nov 2018 10:55:42 +0100, Boris Feld wrote: >> # HG changeset patch >> # User Boris Feld <boris.feld@octobus.net> >> # Date 1541782717 -3600 >> # Fri Nov 09 17:58:37 2018 +0100 >> # Node ID b77a6b74ef31e1b3706c1c6127a15eede0334f71 >> # Parent ddafb271512fc26de60da5dceffc1509bb023d66 >> # EXP-Topic sparse-perf >> # Available At https://bitbucket.org/octobus/mercurial-devel/ >> # hg pull https://bitbucket.org/octobus/mercurial-devel/ -r b77a6b74ef31 >> sparse-revlog: rework the way we enforce chunk size limit >> We move from a O(N) algorithm to a O(log(N)) algorithm. >> >> The previous algorithm was traversing the whole delta chain, looking for the >> exact point where it became too big. This would result in most of the delta >> chain to be traversed. >> >> Instead, we now use a "binary" approach, slicing the chain in two until we >> have a chunk of the appropriate size. >> >> We still keep the previous algorithm for the snapshots part. There are few of >> them and they are large bits of data distant from each other. So the previous >> algorithm should work well in that case. >> >> To take a practical example of restoring manifest revision '59547c40bc4c' for >> a reference NetBeans repository (using sparse-revlog). The media time of the >> step `slice-sparse-chain` of `perfrevlogrevision` improve from 1.109 ms to >> 0.660 ms. 
>> >> diff --git a/mercurial/revlogutils/deltas.py b/mercurial/revlogutils/deltas.py >> --- a/mercurial/revlogutils/deltas.py >> +++ b/mercurial/revlogutils/deltas.py >> @@ -176,18 +176,22 @@ def _slicechunktosize(revlog, revs, targ >> [[3], [5]] >> """ >> assert targetsize is None or 0 <= targetsize >> - if targetsize is None or segmentspan(revlog, revs) <= targetsize: >> + startdata = revlog.start(revs[0]) >> + enddata = revlog.end(revs[-1]) >> + fullspan = enddata - startdata >> + if targetsize is None or fullspan <= targetsize: >> yield revs >> return >> >> startrevidx = 0 >> - startdata = revlog.start(revs[0]) >> endrevidx = 0 >> iterrevs = enumerate(revs) >> next(iterrevs) # skip first rev. >> + # first step: get snapshots out of the way >> for idx, r in iterrevs: >> span = revlog.end(r) - startdata >> - if span <= targetsize: >> + snapshot = revlog.issnapshot(r) > This should break the doctest if this module were included in test-doctest.py. Good catch, these tests stop being covered when they moved to a submodule. 
> >> + if span <= targetsize and snapshot: >> endrevidx = idx >> else: >> chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1) >> @@ -196,6 +200,29 @@ def _slicechunktosize(revlog, revs, targ >> startrevidx = idx >> startdata = revlog.start(r) >> endrevidx = idx >> + if not snapshot: >> + break >> + >> + # for the others, we use binary slicing to quickly converge toward valid >> + # chunks (otherwise, we might end up looking for start/end of many >> + # revisions) >> + nbitem = len(revs) >> + while (enddata - startdata) > targetsize: >> + endrevidx = nbitem >> + if nbitem - startrevidx <= 1: >> + break # protect against individual chunk larger than limit >> + localenddata = revlog.end(revs[endrevidx - 1]) >> + span = localenddata - startdata >> + while (localenddata - startdata) > targetsize: >> + if endrevidx - startrevidx <= 1: >> + break # protect against individual chunk larger than limit >> + endrevidx -= (endrevidx - startrevidx) // 2 >> + localenddata = revlog.end(revs[endrevidx -1]) >> + span = localenddata - startdata > The variable 'span' isn't used. > >> + yield _trimchunk(revlog, revs, startrevidx, endrevidx) >> + startrevidx = endrevidx >> + startdata = revlog.start(revs[startrevidx]) > Do we have enough test coverage for this logic? > > FWIW, I'm a little confused that 'endrevidx' only gets smaller and smaller, > since I thought this would do binary-search the split point. That turned out > to be wrong. We look for approximate positions to split. Indeed, we are just looking for quickly converging toward a valid chunk. Finding a perfect slicing point requires more work and is rarely worth it. For example if you have 6 consecutive size 2 delta (total size 12) and the chunk limit is 10 there are many valid slicing. 
[2, 2, 2] + [2, 2, 2] is as valid as [2, 2, 2, 2, 2] + [2] > _______________________________________________ > Mercurial-devel mailing list > Mercurial-devel@mercurial-scm.org > https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Patch
diff --git a/mercurial/revlogutils/deltas.py b/mercurial/revlogutils/deltas.py --- a/mercurial/revlogutils/deltas.py +++ b/mercurial/revlogutils/deltas.py @@ -176,18 +176,22 @@ def _slicechunktosize(revlog, revs, targ [[3], [5]] """ assert targetsize is None or 0 <= targetsize - if targetsize is None or segmentspan(revlog, revs) <= targetsize: + startdata = revlog.start(revs[0]) + enddata = revlog.end(revs[-1]) + fullspan = enddata - startdata + if targetsize is None or fullspan <= targetsize: yield revs return startrevidx = 0 - startdata = revlog.start(revs[0]) endrevidx = 0 iterrevs = enumerate(revs) next(iterrevs) # skip first rev. + # first step: get snapshots out of the way for idx, r in iterrevs: span = revlog.end(r) - startdata - if span <= targetsize: + snapshot = revlog.issnapshot(r) + if span <= targetsize and snapshot: endrevidx = idx else: chunk = _trimchunk(revlog, revs, startrevidx, endrevidx + 1) @@ -196,6 +200,29 @@ def _slicechunktosize(revlog, revs, targ startrevidx = idx startdata = revlog.start(r) endrevidx = idx + if not snapshot: + break + + # for the others, we use binary slicing to quickly converge toward valid + # chunks (otherwise, we might end up looking for start/end of many + # revisions) + nbitem = len(revs) + while (enddata - startdata) > targetsize: + endrevidx = nbitem + if nbitem - startrevidx <= 1: + break # protect against individual chunk larger than limit + localenddata = revlog.end(revs[endrevidx - 1]) + span = localenddata - startdata + while (localenddata - startdata) > targetsize: + if endrevidx - startrevidx <= 1: + break # protect against individual chunk larger than limit + endrevidx -= (endrevidx - startrevidx) // 2 + localenddata = revlog.end(revs[endrevidx -1]) + span = localenddata - startdata + yield _trimchunk(revlog, revs, startrevidx, endrevidx) + startrevidx = endrevidx + startdata = revlog.start(revs[startrevidx]) + yield _trimchunk(revlog, revs, startrevidx) def _slicechunktodensity(revlog, revs, 
targetdensity=0.5,