Submitter | Pierre-Yves David |
---|---|
Date | May 20, 2017, 3:30 p.m. |
Message ID | <221be1ef98902fa695a7.1495294215@nodosa.octopoid.net> |
Download | mbox | patch |
Permalink | /patch/20771/ |
State | Deferred |
Headers | show |
Comments
I have got a chance to test this in production with one of our biggest repo and a 18MB obsstore. I tried running "hg id" with a a minimal hgrc (with just lz4revlog and remotefilelog enabled) and without an existing obscache. dualsourcecache.update takes 9 seconds. 7 seconds are used to scan changelog, 0.36 seconds used to scan obsstore. The related profiling data is as below: 9735 | _computeobsoleteset obsolete.py:1554 9421 \ update obsolete.py:1227 1872 \ _upgradeneeded obsolete.py:1311 434 \ revs (many times) changelog.py:319 1179 \ __iter__ obsolete.py:564 1179 | __get__ util.py:798 1177 | _all obsolete.py:707 1138 | _readmarkers obsolete.py:444 1138 | _fm1readmarkers obsolete.py:432 7549 \ _updatefrom obsolete.py:1433 * 7185 \ _updaterevs obsolete.py:1439 766 \ __get__ util.py:798 766 | successors obsolete.py:717 766 | _addsuccessors obsolete.py:506 * 3964 \ node (many times) changelog.py:359 364 \ _updatemarkers obsolete.py:1471 So it is expensive for us to rebuild the cache. It seems the cache may be dropped in some cases (ex. histedit --abort). I wonder if we can make the cache more persistent so it'll be impossible to be nuked entirely, like when doing a strip, just truncate the cache file to the strip point instead of nuking it. Excerpts from Pierre-Yves David's message of 2017-05-20 17:30:15 +0200: > # HG changeset patch > # User Pierre-Yves David <pierre-yves.david@octobus.net> > # Date 1495191830 -7200 > # Fri May 19 13:03:50 2017 +0200 > # Node ID 221be1ef98902fa695a709371f75e63f9b3e950a > # Parent 566cfe9cbbb9b163bb58c8666759a634badacdd7 > # EXP-Topic obscache > # Available At https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ > # hg pull https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 221be1ef9890 > obsstore: add a 'cachekey' method > > Parsing the full obsstore is slow, so cache that depends on obsstore content > wants a way to know if the obsstore changed, and it this change was append only. 
> > For this purpose we introduce an official cachekey for the obsstore. This cache > key work in a way similar to the '(tiprev, tipnode)' pair used for the > changelog. We use the size of the obsstore file and the hash of its tail. That > way, we can check if the obsstore grew and if the content we knew is still > present in the obsstore. > > This will be used in later changeset to cache related to the obsolete property. > > diff --git a/mercurial/obsolete.py b/mercurial/obsolete.py > --- a/mercurial/obsolete.py > +++ b/mercurial/obsolete.py > @@ -70,6 +70,7 @@ comment associated with each format for > from __future__ import absolute_import > > import errno > +import hashlib > import struct > > from .i18n import _ > @@ -547,6 +548,8 @@ class obsstore(object): > # parents: (tuple of nodeid) or None, parents of precursors > # None is used when no data has been recorded > > + _obskeysize = 200 > + > def __init__(self, svfs, defaultformat=_fm1version, readonly=False): > # caches for various obsolescence related cache > self.caches = {} > @@ -574,6 +577,46 @@ class obsstore(object): > > __bool__ = __nonzero__ > > + def cachekey(self, index=None): > + """return (current-length, cachekey) > + > + 'current-length': is the current length of the obsstore storage file, > + 'cachekey' is the hash of the last 200 bytes ending at 'index'. > + > + If 'index' is unspecified, current obsstore length is used. > + Cacheckey will be set to nullid if the obsstore is empty. > + 'current-lenght' is -always- the current obsstore length, regardless of > + the 'index' value. 
> + > + If the index specified is higher than the current obsstore file > + length, cachekey will be set to None.""" > + # default value > + obsstoresize = 0 > + keydata = '' > + # try to get actual data from the obsstore > + try: > + with self.svfs('obsstore') as obsfile: > + obsfile.seek(0, 2) > + obsstoresize = obsfile.tell() > + if index is None: > + index = obsstoresize > + elif obsstoresize < index: > + return obsstoresize, None > + actualsize = min(index, self._obskeysize) > + if actualsize: > + obsfile.seek(index - actualsize, 0) > + keydata = obsfile.read(actualsize) > + except (OSError, IOError) as e: > + if e.errno != errno.ENOENT: > + raise > + if keydata: > + key = hashlib.sha1(keydata).digest() > + else: > + # reusing an existing "empty" value make it easier to define a > + # default cachekey for 'no data'. > + key = node.nullid > + return obsstoresize, key > + > @property > def readonly(self): > """True if marker creation is disabled
On 05/23/2017 04:22 AM, Jun Wu wrote: > I have got a chance to test this in production with one of our biggest repo > and a 18MB obsstore. I tried running "hg id" with a a minimal hgrc (with > just lz4revlog and remotefilelog enabled) and without an existing obscache. > > dualsourcecache.update takes 9 seconds. 7 seconds are used to scan > changelog, 0.36 seconds used to scan obsstore. > > The related profiling data is as below: > > 9735 | _computeobsoleteset obsolete.py:1554 > 9421 \ update obsolete.py:1227 > 1872 \ _upgradeneeded obsolete.py:1311 > 434 \ revs (many times) changelog.py:319 > 1179 \ __iter__ obsolete.py:564 > 1179 | __get__ util.py:798 > 1177 | _all obsolete.py:707 > 1138 | _readmarkers obsolete.py:444 > 1138 | _fm1readmarkers obsolete.py:432 > 7549 \ _updatefrom obsolete.py:1433 > * 7185 \ _updaterevs obsolete.py:1439 > 766 \ __get__ util.py:798 > 766 | successors obsolete.py:717 > 766 | _addsuccessors obsolete.py:506 > * 3964 \ node (many times) changelog.py:359 > 364 \ _updatemarkers obsolete.py:1471 > > So it is expensive for us to rebuild the cache. It seems the cache may be > dropped in some cases (ex. histedit --abort). I wonder if we can make the > cache more persistent so it'll be impossible to be nuked entirely, like when > doing a strip, just truncate the cache file to the strip point instead of > nuking it. The simplest solution I can see here is to backup the cache when histedit/rebase starts and reinstall the cache when the operation is aborted. That way we can incrementally upgrade from that backup point. This approach can be generalized to other caches suffering from strip. Stripping the cache is also an option, but only in the case where only the changelog has been stripped (not the obsstore). (note: there are likely simple optimization lurking in the update function too. For example is len(obsstore) << len(repo), we could focus on iterating on the obsstore only when rebuilding the cache, etc…). 
Do these details address your concerns? Cheers,
Please ignore this email, I've unified it with the reply to patch 5, and then … I sent the wrong one. On 05/23/2017 11:47 AM, Pierre-Yves David wrote: > > > On 05/23/2017 04:22 AM, Jun Wu wrote: >> I have got a chance to test this in production with one of our biggest >> repo >> and a 18MB obsstore. I tried running "hg id" with a a minimal hgrc (with >> just lz4revlog and remotefilelog enabled) and without an existing >> obscache. >> >> dualsourcecache.update takes 9 seconds. 7 seconds are used to scan >> changelog, 0.36 seconds used to scan obsstore. >> >> The related profiling data is as below: >> >> 9735 | _computeobsoleteset obsolete.py:1554 >> 9421 \ update obsolete.py:1227 >> 1872 \ _upgradeneeded obsolete.py:1311 >> 434 \ revs (many times) changelog.py:319 >> 1179 \ __iter__ obsolete.py:564 >> 1179 | __get__ util.py:798 >> 1177 | _all obsolete.py:707 >> 1138 | _readmarkers obsolete.py:444 >> 1138 | _fm1readmarkers obsolete.py:432 >> 7549 \ _updatefrom obsolete.py:1433 >> * 7185 \ _updaterevs obsolete.py:1439 >> 766 \ __get__ util.py:798 >> 766 | successors obsolete.py:717 >> 766 | _addsuccessors obsolete.py:506 >> * 3964 \ node (many times) changelog.py:359 >> 364 \ _updatemarkers obsolete.py:1471 >> >> So it is expensive for us to rebuild the cache. It seems the cache may be >> dropped in some cases (ex. histedit --abort). I wonder if we can make the >> cache more persistent so it'll be impossible to be nuked entirely, >> like when >> doing a strip, just truncate the cache file to the strip point instead of >> nuking it. > > The simplest solution I can see here is to backup the cache when > histedit/rebase starts and reinstall the cache when the operation is > aborted. That way we can incrementally upgrade from that backup point. > This approach can be generalized to other caches suffering from strip. > > Stripping the cache is also an option, but only in the case where only > the changelog has been stripped (not the obsstore). 
> > (note: there are likely simple optimization lurking in the update > function too. For example is len(obsstore) << len(repo), we could focus > on iterating on the obsstore only when rebuilding the cache, etc…). > > > Does these details address your concerns? > > Cheers, >
On 05/20/2017 05:30 PM, Pierre-Yves David wrote: > # HG changeset patch > # User Pierre-Yves David <pierre-yves.david@octobus.net> > # Date 1495191830 -7200 > # Fri May 19 13:03:50 2017 +0200 > # Node ID 221be1ef98902fa695a709371f75e63f9b3e950a > # Parent 566cfe9cbbb9b163bb58c8666759a634badacdd7 > # EXP-Topic obscache > # Available At https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ > # hg pull https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 221be1ef9890 > obsstore: add a 'cachekey' method What is the status of this series. V2 has been on the list for over 10 days and I still see it in patchwork. How can I help having it move forward. There are various other improvements stuck behind this series. Cheers,
Excerpts from Pierre-Yves David's message of 2017-06-01 18:15:17 +0200: > On 05/20/2017 05:30 PM, Pierre-Yves David wrote: > > # HG changeset patch > > # User Pierre-Yves David <pierre-yves.david@octobus.net> > > # Date 1495191830 -7200 > > # Fri May 19 13:03:50 2017 +0200 > > # Node ID 221be1ef98902fa695a709371f75e63f9b3e950a > > # Parent 566cfe9cbbb9b163bb58c8666759a634badacdd7 > > # EXP-Topic obscache > > # Available At https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ > > # hg pull https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 221be1ef9890 > > obsstore: add a 'cachekey' method > > What is the status of this series. V2 has been on the list for over 10 > days and I still see it in patchwork. How can I help having it move > forward. There are various other improvements stuck behind this series. I think V2 radixlink will be fast enough so this series look unnecessary. I think adding revision numbers to obsstore components is what we should avoid in design. Not to say this series has perf issues with huge repos. > Cheers, >
On Thu, Jun 1, 2017 at 9:48 AM, Jun Wu <quark@fb.com> wrote: > Excerpts from Pierre-Yves David's message of 2017-06-01 18:15:17 +0200: >> On 05/20/2017 05:30 PM, Pierre-Yves David wrote: >> > # HG changeset patch >> > # User Pierre-Yves David <pierre-yves.david@octobus.net> >> > # Date 1495191830 -7200 >> > # Fri May 19 13:03:50 2017 +0200 >> > # Node ID 221be1ef98902fa695a709371f75e63f9b3e950a >> > # Parent 566cfe9cbbb9b163bb58c8666759a634badacdd7 >> > # EXP-Topic obscache >> > # Available At https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ >> > # hg pull https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 221be1ef9890 >> > obsstore: add a 'cachekey' method >> >> What is the status of this series. V2 has been on the list for over 10 >> days and I still see it in patchwork. How can I help having it move >> forward. There are various other improvements stuck behind this series. > > I think V2 radixlink will be fast enough so this series look unnecessary. > > I think adding revision numbers to obsstore components is what we should > avoid in design. > > Not to say this series has perf issues with huge repos. It looks like we need someone less biased than you two to decide which (or both) of the series to take :-) I just need to find time to review both of the series and see what I think (and then we'll continue discussing, I guess). Or we could VC all three? > >> Cheers, >> > _______________________________________________ > Mercurial-devel mailing list > Mercurial-devel@mercurial-scm.org > https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
On 06/01/2017 07:05 PM, Martin von Zweigbergk wrote: > On Thu, Jun 1, 2017 at 9:48 AM, Jun Wu <quark@fb.com> wrote: >> Excerpts from Pierre-Yves David's message of 2017-06-01 18:15:17 +0200: >>> On 05/20/2017 05:30 PM, Pierre-Yves David wrote: >>>> # HG changeset patch >>>> # User Pierre-Yves David <pierre-yves.david@octobus.net> >>>> # Date 1495191830 -7200 >>>> # Fri May 19 13:03:50 2017 +0200 >>>> # Node ID 221be1ef98902fa695a709371f75e63f9b3e950a >>>> # Parent 566cfe9cbbb9b163bb58c8666759a634badacdd7 >>>> # EXP-Topic obscache >>>> # Available At https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ >>>> # hg pull https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 221be1ef9890 >>>> obsstore: add a 'cachekey' method >>> >>> What is the status of this series. V2 has been on the list for over 10 >>> days and I still see it in patchwork. How can I help having it move >>> forward. There are various other improvements stuck behind this series. >> >> I think V2 radixlink will be fast enough so this series look unnecessary. >> >> I think adding revision numbers to obsstore components is what we should >> avoid in design. >> >> Not to say this series has perf issues with huge repos. > > It looks like we need someone less biased than you two to decide which > (or both) of the series to take :-) I just need to find time to review > both of the series and see what I think (and then we'll continue > discussing, I guess). Or we could VC all three? Both series are complementary and useful. Obscache is very efficient for its purpose and the other one improve other aspect of evolution related computation. We already have many rev indexed caches so nothing really new here. The potential scaling issue on large repository of the obscache are easy to overcome, I've a series fixing most of them already ready to be sent once that one is in. I would be happy to join a VC to discuss this. Cheers,
Pierre-Yves David <pierre-yves.david@ens-lyon.org> writes: > On 06/01/2017 07:05 PM, Martin von Zweigbergk wrote: >> On Thu, Jun 1, 2017 at 9:48 AM, Jun Wu <quark@fb.com> wrote: >>> Excerpts from Pierre-Yves David's message of 2017-06-01 18:15:17 +0200: >>>> On 05/20/2017 05:30 PM, Pierre-Yves David wrote: >>>>> # HG changeset patch >>>>> # User Pierre-Yves David <pierre-yves.david@octobus.net> >>>>> # Date 1495191830 -7200 >>>>> # Fri May 19 13:03:50 2017 +0200 >>>>> # Node ID 221be1ef98902fa695a709371f75e63f9b3e950a >>>>> # Parent 566cfe9cbbb9b163bb58c8666759a634badacdd7 >>>>> # EXP-Topic obscache >>>>> # Available At https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ >>>>> # hg pull https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 221be1ef9890 >>>>> obsstore: add a 'cachekey' method >>>> >>>> What is the status of this series. V2 has been on the list for over 10 >>>> days and I still see it in patchwork. How can I help having it move >>>> forward. There are various other improvements stuck behind this series. >>> >>> I think V2 radixlink will be fast enough so this series look unnecessary. >>> >>> I think adding revision numbers to obsstore components is what we should >>> avoid in design. >>> >>> Not to say this series has perf issues with huge repos. >> >> It looks like we need someone less biased than you two to decide which >> (or both) of the series to take :-) I just need to find time to review >> both of the series and see what I think (and then we'll continue >> discussing, I guess). Or we could VC all three? > > Both series are complementary and useful. Obscache is very efficient for > its purpose and the other one improve other aspect of evolution related > computation. We already have many rev indexed caches so nothing really > new here. This was my thinking as well. Though, I'm not trying to muddy the waters here. 
> The potential scaling issue on large repository of the obscache are easy > to overcome, I've a series fixing most of them already ready to be sent > once that one is in. Ah, are those online somewhere? > I would be happy to join a VC to discuss this. I might like to join but if the schedule doesn't work out, please move ahead without me.
Excerpts from Sean Farley's message of 2017-06-01 13:27:44 -0700: > > Both series are complementary and useful. Obscache is very efficient for > > its purpose and the other one improve other aspect of evolution related > > computation. We already have many rev indexed caches so nothing really > > new here. > > This was my thinking as well. Though, I'm not trying to muddy the waters > here. There are 4 revsets: obsolete, unstable, bumped, and divergent. My patch speeds up all of them significantly while obscache only speeds up the first one. I admit that obscache is slightly faster than indexing on the "obsolete" revset. But that perf difference would be like just 20ms for hg-committed. I think that 20ms does not justify the complexity of a complete (huge-repo-friendly) implementation obscache and the coupling with revision number in design. And I do think eventually we want the related revset to be lazy so there won't be noticeable perf difference. That said, I think obsstore "cachekey" could be useful to help remove "hash(filteredrevs)" and allow us to remove iterating filteredrevs to be able to finally make related revset lazy. > > The potential scaling issue on large repository of the obscache are easy > > to overcome, I've a series fixing most of them already ready to be sent > > once that one is in. > > Ah, are those online somewhere? > > > I would be happy to join a VC to discuss this. > > I might like to join but if the schedule doesn't work out, please move > ahead without me.
On Thu, 1 Jun 2017 15:28:19 -0700, Jun Wu wrote: > Excerpts from Sean Farley's message of 2017-06-01 13:27:44 -0700: > > > Both series are complementary and useful. Obscache is very efficient for > > > its purpose and the other one improve other aspect of evolution related > > > computation. We already have many rev indexed caches so nothing really > > > new here. > > > > This was my thinking as well. Though, I'm not trying to muddy the waters > > here. > > There are 4 revsets: obsolete, unstable, bumped, and divergent. My patch > speeds up all of them significantly while obscache only speeds up the first > one. > > I admit that obscache is slightly faster than indexing on the "obsolete" > revset. But that perf difference would be like just 20ms for hg-committed. I > think that 20ms does not justify the complexity of a complete > (huge-repo-friendly) implementation obscache and the coupling with revision > number in design. And I do think eventually we want the related revset to be > lazy so there won't be noticeable perf difference. I haven't read these patches carefully, but I like Jun's radixlink idea which seemed clever. If the perf win is just a few tens of milliseconds, I prefer not having two separate cache layers that could complicate things in future.
On 06/01/2017 10:27 PM, Sean Farley wrote: > Pierre-Yves David <pierre-yves.david@ens-lyon.org> writes: > >> On 06/01/2017 07:05 PM, Martin von Zweigbergk wrote: >>> On Thu, Jun 1, 2017 at 9:48 AM, Jun Wu <quark@fb.com> wrote: >>>> Excerpts from Pierre-Yves David's message of 2017-06-01 18:15:17 +0200: >>>>> On 05/20/2017 05:30 PM, Pierre-Yves David wrote: >>>>>> # HG changeset patch >>>>>> # User Pierre-Yves David <pierre-yves.david@octobus.net> >>>>>> # Date 1495191830 -7200 >>>>>> # Fri May 19 13:03:50 2017 +0200 >>>>>> # Node ID 221be1ef98902fa695a709371f75e63f9b3e950a >>>>>> # Parent 566cfe9cbbb9b163bb58c8666759a634badacdd7 >>>>>> # EXP-Topic obscache >>>>>> # Available At https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ >>>>>> # hg pull https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 221be1ef9890 >>>>>> obsstore: add a 'cachekey' method >>>>> >>>>> What is the status of this series. V2 has been on the list for over 10 >>>>> days and I still see it in patchwork. How can I help having it move >>>>> forward. There are various other improvements stuck behind this series. >>>> >>>> I think V2 radixlink will be fast enough so this series look unnecessary. >>>> >>>> I think adding revision numbers to obsstore components is what we should >>>> avoid in design. >>>> >>>> Not to say this series has perf issues with huge repos. >>> >>> It looks like we need someone less biased than you two to decide which >>> (or both) of the series to take :-) I just need to find time to review >>> both of the series and see what I think (and then we'll continue >>> discussing, I guess). Or we could VC all three? >> >> Both series are complementary and useful. Obscache is very efficient for >> its purpose and the other one improve other aspect of evolution related >> computation. We already have many rev indexed caches so nothing really >> new here. > > This was my thinking as well. Though, I'm not trying to muddy the waters > here. 
> >> The potential scaling issue on large repository of the obscache are easy >> to overcome, I've a series fixing most of them already ready to be sent >> once that one is in. > > Ah, are those online somewhere? Yes, you can find them at: https://www.mercurial-scm.org/repo/users/marmoute/mercurial/log?rev=7d2b6e3cec41%3A%3A Cheers,
On 06/02/2017 03:10 PM, Yuya Nishihara wrote: > On Thu, 1 Jun 2017 15:28:19 -0700, Jun Wu wrote: >> Excerpts from Sean Farley's message of 2017-06-01 13:27:44 -0700: >>>> Both series are complementary and useful. Obscache is very efficient for >>>> its purpose and the other one improve other aspect of evolution related >>>> computation. We already have many rev indexed caches so nothing really >>>> new here. >>> >>> This was my thinking as well. Though, I'm not trying to muddy the waters >>> here. >> >> There are 4 revsets: obsolete, unstable, bumped, and divergent. My patch >> speeds up all of them significantly while obscache only speeds up the first >> one. >> >> I admit that obscache is slightly faster than indexing on the "obsolete" >> revset. But that perf difference would be like just 20ms for hg-committed. I >> think that 20ms does not justify the complexity of a complete >> (huge-repo-friendly) implementation obscache and the coupling with revision >> number in design. And I do think eventually we want the related revset to be >> lazy so there won't be noticeable perf difference. > > I haven't read these patches carefully, but I like Jun's radixlink idea which > seemed clever. If the perf win is just a few tens of milliseconds, I prefer not > having two separate caches layers that could complicate things in future. Indexes are definitely a good thing and we will want some at some point. However, the radix tree series on the list an early RFC[1] while this obscache series is a port from the evolve extension of code in the running in many places for over a month. There are various things to clarify and adjust from that RFC series (eg: collaboration with the transaction) before we can consider taking it. So to me the way forward here is to get the existing working solution in now and incorporates the other one later. As a bonus point, they will have fairly similar logic for cache validation and incremental upgrade. 
So work from obscache will benefit the work on the indexes. If the obscache happens to be "obsoleted" by something new and better (eg: radix tree?), we can drop the obscache at that point. This just happens to the "hiddencache" which got recently dropped since hidden computation got much faster. In addition, it is unclear, we'll want to drop the obscache once the the indexes land. Timing wise, the RFC version (using some C), compute the obsolete set is 45.8ms vs 1.2ms (no extra C) for the obscaches[2]. This is a big save on the startup time.In the same repository (no extension but evolve-obscache), hg version take 80ms seconds. 44ms is over 50% of that. For a more practical command `hg diff -r .^` is 170ms, so tens of millisecond will be tens of percent slow down in impact for fast command (and we have a small amount of obsolete revisions: about 6K). The obscache is using revision indexing, this mean no rev to node conversion and a more direct data access. To compute the set of obsolete changeset (part of startup) this meant it will remains significantly faster than an indexes approach. We independently want the indexes approach to solve other issue. Finally, there are other evolve related area were we will want caches, this series install the necessary infrastructure to easily add such caches. For example obsolescence-markers-discovery is an area were we know we'll need caching using this kind of infrastructure to detect and perform update. In addition, various troubles[3] are a bit complex to compute, I expect us to want to keep them in a cache too. Some of that logic can even be reused for the radix tree implementation. Cheers,
Excerpts from Pierre-Yves David's message of 2017-06-03 12:12:55 +0200: > Indexes are definitely a good thing and we will want some at some point. > > However, the radix tree series on the list an early RFC[1] while this > obscache series is a port from the evolve extension of code in the > running in many places for over a month. I think I'm nearly completing it. As a side effect I did many small cleanups in obsolete.py > There are various things to clarify and adjust from that RFC series (eg: > collaboration with the transaction) before we can consider taking it. > > So to me the way forward here is to get the existing working solution in > now and incorporates the other one later. As a bonus point, they will > have fairly similar logic for cache validation and incremental upgrade. > So work from obscache will benefit the work on the indexes. If the > obscache happens to be "obsoleted" by something new and better (eg: > radix tree?), we can drop the obscache at that point. This just happens > to the "hiddencache" which got recently dropped since hidden computation > got much faster. > > In addition, it is unclear, we'll want to drop the obscache once the the > indexes land. Timing wise, the RFC version (using some C), compute the > obsolete set is 45.8ms vs 1.2ms (no extra C) for the obscaches[2]. This That 44ms data is outdated. My current version is even 10ms faster than obscache for "hg id". I didn't expect it to be, though. If the perf difference is within +/-20ms, I think it's fair to say this 500-ish lines obscache implementation seem over complicated. > is a big save on the startup time.In the same repository (no extension > but evolve-obscache), hg version take 80ms seconds. 44ms is over 50% of > that. For a more practical command `hg diff -r .^` is 170ms, so tens of > millisecond will be tens of percent slow down in impact for fast command > (and we have a small amount of obsolete revisions: about 6K). 
> The obscache is using revision indexing, this mean no rev to node > conversion and a more direct data access. To compute the set of obsolete > changeset (part of startup) this meant it will remains significantly > faster than an indexes approach. We independently want the indexes > approach to solve other issue. > > Finally, there are other evolve related area were we will want caches, > this series install the necessary infrastructure to easily add such > caches. For example obsolescence-markers-discovery is an area were we > know we'll need caching using this kind of infrastructure to detect and > perform update. In addition, various troubles[3] are a bit complex to > compute, I expect us to want to keep them in a cache too. Some of that > logic can even be reused for the radix tree implementation. As I said in another thread, I think a better long term direction is to make related revset lazy. People usually run `hg log -r "small-set"`, they won't care about things not being outputted. Today when I run "hg log -r .", hg checks 5k+ commits for obsolete revset, I don't think we want to keep that behavior forever. If we go that "lazy" direction, the cache approach seems unnecessary in general. I'm not sure about obsolescence-markers-discovery. But I do have some ideas in that area too. I guess we have to see the actual logic in that area to judge whether dualsourcecache is worthwhile or not. > > Cheers, >
Excerpts from Pierre-Yves David's message of 2017-06-04 04:50:22 +0200: > [...] > > That get me to think a bit more about that part, trying to pin point the > apparent misunderstanding about the discussion around this series. And I > think I got it :-) > > Overall this series adds: > > 1) a generic cache key for the obsstore, to be used by any caches > > 2) an abstract class using the above key to build any kind of caches > that just need to implement a "update yourself with these new markers" + > storage. (That class is currently combined with a changelog related key, > see below) > > 3) a concrete cache using the above(bytearray to cache a small property) > > 4) all the logic around transaction to have that cache updated and > flushed properly. > > > In the above list, points (1), (2) and (3) are -also- needed by the > indexes cache and would help its implementation. Dropping the obscache I'm not sure how did you get that conclusion. I didn't use any of them naturally. > in favor of the indexes means dropping a small part of the above series > only, the rest can be kept around (or updated in place to access a > different cache with the same API). This highlight that the two series > are not competing, they are complementary. > > The incremental update and race condition prevention logic is quite > important and the version in this series has been battle tested for a month. I think the race condition problem is created by the unnecessary complexity of dual source. The index has a single append-only source and it uses atomictemp so its handling is clean and easy to be verified. > In addition, the version of the abstract class in this series is kept > simple for clarity purposes. However there are many improvement to be > made to it. Some are already implemented in the evolve extension, some > needs works into core (I've seen some of the needed bits done by Jun in > his RFC). 
Such improvement will benefit all obsstore related cache > (including the one in the evolve extension). I've been looking forward > to implement such improvement for a couple of weeks now :-/ The abstraction might be useful for other purposes. But I'm not convinced that they are necessary for now. Maybe I will change my mind when I start thinking about marker exchange. > --- > > About the changelog bit in "dualsourcecache". The abstract class use a > key checking both the changelog (like branchmap, branchrev, tagnode, > etc) and the obsstore. It does so because my current two concrete cache > (obscache and obshashrange-cache) need use these two items. We could > easily split the obsstore part out for cache that rely on the obsstore > content only (eg: the jun radix tree). Having that logic right and > easily reusable it a critical point to make progesss on evolution. If > that help I can follow up making that logic usable outside of > dualsourcecache. That would let me work on improving that abstract base > class and allow indexes to reuse it, making that work easier. I cannot foresee why I want to use "dualsourcecache" in indexing. > Cheers, > > >[…] >
On Sat, May 20, 2017 at 8:30 AM, Pierre-Yves David <pierre-yves.david@ens-lyon.org> wrote: > # HG changeset patch > # User Pierre-Yves David <pierre-yves.david@octobus.net> > # Date 1495191830 -7200 > # Fri May 19 13:03:50 2017 +0200 > # Node ID 221be1ef98902fa695a709371f75e63f9b3e950a > # Parent 566cfe9cbbb9b163bb58c8666759a634badacdd7 > # EXP-Topic obscache > # Available At https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ > # hg pull https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 221be1ef9890 > obsstore: add a 'cachekey' method > > Parsing the full obsstore is slow, so cache that depends on obsstore content > wants a way to know if the obsstore changed, and it this change was append only. > > For this purpose we introduce an official cachekey for the obsstore. This cache > key work in a way similar to the '(tiprev, tipnode)' pair used for the > changelog. We use the size of the obsstore file and the hash of its tail. That > way, we can check if the obsstore grew and if the content we knew is still > present in the obsstore. Jun suggested to me personally that we could use a UUID and the size as a cache key. The UUID would be replaced only when the file was stripped. It could be stored inside a fake obsmarker as the first obsmarker in the file. That sounds like a good idea to me. What do you think? We could do the same for the changelog, but that would require a new format and requirement, so that's a much bigger change.
Excerpts from Martin von Zweigbergk's message of 2017-06-06 10:01:34 -0700: > Here are some timings from a hopefully unbiased third party. These are > best-of-20 times, in seconds. No extensions enabled. I ran > "debugupdatecaches" before each series of commands to populate the > caches (I think both obscache and the radixlink cache are hooked into > that command). > > > | At @ | obscache | radixlink | > id -r @ | 0.70 | 0.38 | 0.42 | > log -r @ | 1.60 | 1.08 | 0.59 | > export @ | 0.71 | 0.33 | 0.59 | > log -G -r 'draft()' | 1.19 | 1.22 | 0.74 | > log -G -r 'draft()' \ | 1.26 | 1.27 | 0.97 | > -T '{node} {troubles}' | | | | Note this is a repo with a small len(changelog) and I think cache rebuilding time matters. Other parts of Mercurial are friendly to large len(changelog), I think it's better to not break that property. > We write Mercurial for users, so I've included commands similar to > what I use somewhat often (plus "hg id" because Pierre-Yves and Jun > like it). > > So radixlink seems to provide a nice speedup across the board. So I > vote for getting that series in good enough shape to include it and > then we can come *consider* adding obscache on top after. I know > Pierre-Yves wanted to get the cachekey and dualsourcecache pieces in > at least to simplify further work in the evolve extension, but I'm > very reluctant to add otherwise unused code (more than just a few > lines) to core for that reason. What do other reviewers think? > [...]
On Tue, Jun 6, 2017 at 10:35 AM, Jun Wu <quark@fb.com> wrote: > Excerpts from Martin von Zweigbergk's message of 2017-06-06 10:01:34 -0700: >> Here are some timings from a hopefully unbiased third party. These are >> best-of-20 times, in seconds. No extensions enabled. I ran >> "debugupdatecaches" before each series of commands to populate the >> caches (I think both obscache and the radixlink cache are hooked into >> that command). >> >> >> | At @ | obscache | radixlink | >> id -r @ | 0.70 | 0.38 | 0.42 | >> log -r @ | 1.60 | 1.08 | 0.59 | >> export @ | 0.71 | 0.33 | 0.59 | >> log -G -r 'draft()' | 1.19 | 1.22 | 0.74 | >> log -G -r 'draft()' \ | 1.26 | 1.27 | 0.97 | >> -T '{node} {troubles}' | | | | > > Note this is a repo with a small len(changelog) and I think cache rebuilding > time matters. Other parts of Mercurial are friendly to large len(changelog), > I think it's better to not break that property. Yes, that's a fair point. I considered timing cache rebuilding as well, but I didn't get to it. > >> We write Mercurial for users, so I've included commands similar to >> what I use somewhat often (plus "hg id" because Pierre-Yves and Jun >> like it). >> >> So radixlink seems to provide a nice speedup across the board. So I >> vote for getting that series in good enough shape to include it and >> then we can come *consider* adding obscache on top after. I know >> Pierre-Yves wanted to get the cachekey and dualsourcecache pieces in >> at least to simplify further work in the evolve extension, but I'm >> very reluctant to add otherwise unused code (more than just a few >> lines) to core for that reason. What do other reviewers think? >> [...]
On 06/06/2017 12:02 AM, Martin von Zweigbergk wrote: > On Sat, May 20, 2017 at 8:30 AM, Pierre-Yves David > <pierre-yves.david@ens-lyon.org> wrote: >> # HG changeset patch >> # User Pierre-Yves David <pierre-yves.david@octobus.net> >> # Date 1495191830 -7200 >> # Fri May 19 13:03:50 2017 +0200 >> # Node ID 221be1ef98902fa695a709371f75e63f9b3e950a >> # Parent 566cfe9cbbb9b163bb58c8666759a634badacdd7 >> # EXP-Topic obscache >> # Available At https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ >> # hg pull https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 221be1ef9890 >> obsstore: add a 'cachekey' method >> >> Parsing the full obsstore is slow, so cache that depends on obsstore content >> wants a way to know if the obsstore changed, and it this change was append only. >> >> For this purpose we introduce an official cachekey for the obsstore. This cache >> key work in a way similar to the '(tiprev, tipnode)' pair used for the >> changelog. We use the size of the obsstore file and the hash of its tail. That >> way, we can check if the obsstore grew and if the content we knew is still >> present in the obsstore. > > Jun suggested to me personally that we could use a UUID and the size > as a cache key. The UUID would be replaced only when the file was > stripped. It could be stored inside a fake obsmarker as the first > obsmarker in the file. That sounds like a good idea to me. What do you > think? We can't guarantee old clients doing strip will update that UID. In addition, this breaks the happen-onlyness of the obsstore. Rollback would also no longer be able to just retract the file size to perform its duty. So this approach will not work :-/ The currently used cache key (size + hash of last bits) is not perfect, but works fine so far. Is there specific concerns about it? > We could do the same for the changelog, but that would require a new > format and requirement, so that's a much bigger change. In short same apply to the obsstore. 
we need a format change to introduce an integrated cachekey. Actually, I'm hoping to take advantage of Greg's next changelog format to sneak a more solid cache key in there. Cheers,
On Tue, Jun 6, 2017 at 11:04 AM, Pierre-Yves David <pierre-yves.david@ens-lyon.org> wrote: > > > On 06/06/2017 12:02 AM, Martin von Zweigbergk wrote: >> >> On Sat, May 20, 2017 at 8:30 AM, Pierre-Yves David >> <pierre-yves.david@ens-lyon.org> wrote: >>> >>> # HG changeset patch >>> # User Pierre-Yves David <pierre-yves.david@octobus.net> >>> # Date 1495191830 -7200 >>> # Fri May 19 13:03:50 2017 +0200 >>> # Node ID 221be1ef98902fa695a709371f75e63f9b3e950a >>> # Parent 566cfe9cbbb9b163bb58c8666759a634badacdd7 >>> # EXP-Topic obscache >>> # Available At >>> https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ >>> # hg pull >>> https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 221be1ef9890 >>> obsstore: add a 'cachekey' method >>> >>> Parsing the full obsstore is slow, so cache that depends on obsstore >>> content >>> wants a way to know if the obsstore changed, and it this change was >>> append only. >>> >>> For this purpose we introduce an official cachekey for the obsstore. This >>> cache >>> key work in a way similar to the '(tiprev, tipnode)' pair used for the >>> changelog. We use the size of the obsstore file and the hash of its tail. >>> That >>> way, we can check if the obsstore grew and if the content we knew is >>> still >>> present in the obsstore. >> >> >> Jun suggested to me personally that we could use a UUID and the size >> as a cache key. The UUID would be replaced only when the file was >> stripped. It could be stored inside a fake obsmarker as the first >> obsmarker in the file. That sounds like a good idea to me. What do you >> think? > > > We can't guarantee old clients doing strip will update that UID. There are old clients that strip the obsstore? I thought that was a new feature. > In addition, this breaks the happen-onlyness of the obsstore. What depends on the append-onlyness? > Rollback would > also no longer be able to just retract the file size to perform its duty. Good point. 
> > So this approach will not work :-/ > > The currently used cache key (size + hash of last bits) is not perfect, but > works fine so far. Is there specific concerns about it? > >> We could do the same for the changelog, but that would require a new >> format and requirement, so that's a much bigger change. > > > In short same apply to the obsstore. we need a format change to introduce an > integrated cachekey. > > Actually, I'm hope to take advantage of greg next changelog format to sneak > a more solid cache key in there. > > Cheers, > > -- > Pierre-Yves David
Excerpts from Martin von Zweigbergk's message of 2017-06-06 11:07:54 -0700: > There are old clients that strip the obsstore? I thought that was a new > feature. > > > In addition, this breaks the happen-onlyness of the obsstore. > > What depends on the append-onlyness? > > > Rollback would also no longer be able to just retract the file size to > > perform its duty. > > Good point. I think the UUID file could be a separate file like "obsstore.uuid". So a strip or rollback will do something like (in this order): 1. change uuid 2. strip obsstore 3. change uuid again This also works for changelog. For older clients that do not support uuid, maybe we can fall back to mtime check.
On 06/06/2017 10:55 PM, Martin von Zweigbergk wrote: > On Tue, Jun 6, 2017 at 10:39 AM, Martin von Zweigbergk > <martinvonz@google.com> wrote: >> On Tue, Jun 6, 2017 at 10:27 AM, Pierre-Yves David >> <pierre-yves.david@ens-lyon.org> wrote: >>> >>> >>> On 06/06/2017 06:01 PM, Martin von Zweigbergk wrote: >>>> >>>> Here are some timings from a hopefully unbiased third party. These are >>>> best-of-20 times, in seconds. No extensions enabled. I ran >>>> "debugupdatecaches" before each series of commands to populate the >>>> caches (I think both obscache and the radixlink cache are hooked into >>>> that command). >>> >>> >>> Can you provides timing for the combination of both. The two series are not >>> meant to compete but to collaborate. I've a merge available at: >>> >>> hg pull -r cf7fd13ead00 >>> https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ >> >> Sure: >> >> | At @ | obscache | radixlink | combined | >> id -r @ | 0.70 | 0.38 | 0.42 | 0.42 | >> log -r @ | 1.60 | 1.08 | 0.59 | 0.81 | >> export @ | 0.71 | 0.33 | 0.59 | 0.40 | >> log -G -r 'draft()' | 1.19 | 1.22 | 0.74 | 0.91 | >> log -G -r 'draft()' \ | 1.26 | 1.27 | 0.97 | 0.97 | >> -T '{node} {troubles}' | | | | | > > Pierre-Yves pointed out that I should run with "--config > experimental.evolution=all" to avoid loading the obsstore just to > print a warning about it. Here are the updated figures: > > | At @ | obscache | radixlink | combined | > log -r @ | 1.14 | 1.05 | 0.33 | 0.63 | > export @ | 0.73 | 0.11 | 0.33 | 0.12 | > log -G -r 'draft()' | 1.58 | 1.22 | 0.76 | 0.50 | > log -G -r 'draft()' \ | 1.27 | 1.16 | 0.46 | 0.70 | > -T '{node} {troubles}' | | | | | > id -r @ | 0.81 | 0.11 | 0.67 | 0.11 | > log -G -r @ \ | 0.70 | 0.10 | 0.29 | 0.10 | > -T '{node} {desc}' | | | | | > > > I have no idea why "log -r @" got slower without radixlink than it was > without "--config experimental.evolution=all". After pointing that out, I started playing with benchmark a bit. 
Since I've them ready I'm adding them to the thread. I used one extra digit to have a bit more meaningful number command | current | obscache | indexes | combined ---------------------------|---------|----------|---------|--------- id -r @ | 0.589 | 0.088 | 0.108 | 0.088 log -r @ | 0.945 | 0.890 | 0.183 | 0.169 diff | 0.068 | 0.067 | 0.068 | 0.066 diff -r .~1 | 0.634 | 0.114 | 0.146 | 0.142 export @ | 0.600 | 0.098 | 0.122 | 0.102 log -G -r draft() | 1.073 | 0.972 | 0.285 | 0.265 log -G -r "draft()"\ | 0.960 | 0.991 | 0.273 | 0.239 -T "{node} {troubles}" | | | | log -G -r tip\ | 0.593 | 0.085 | 0.102 | 0.085 -T "{node} {desc}" | | | | I've attached the script used to get these number if people wants to reproduce some number on their side. Cheers,
On 06/06/2017 07:07 PM, Martin von Zweigbergk wrote: > On Tue, Jun 6, 2017 at 11:04 AM, Pierre-Yves David > <pierre-yves.david@ens-lyon.org> wrote: >> >> >> On 06/06/2017 12:02 AM, Martin von Zweigbergk wrote: >>> >>> On Sat, May 20, 2017 at 8:30 AM, Pierre-Yves David >>> <pierre-yves.david@ens-lyon.org> wrote: >>>> >>>> # HG changeset patch >>>> # User Pierre-Yves David <pierre-yves.david@octobus.net> >>>> # Date 1495191830 -7200 >>>> # Fri May 19 13:03:50 2017 +0200 >>>> # Node ID 221be1ef98902fa695a709371f75e63f9b3e950a >>>> # Parent 566cfe9cbbb9b163bb58c8666759a634badacdd7 >>>> # EXP-Topic obscache >>>> # Available At >>>> https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ >>>> # hg pull >>>> https://www.mercurial-scm.org/repo/users/marmoute/mercurial/ -r 221be1ef9890 >>>> obsstore: add a 'cachekey' method >>>> >>>> Parsing the full obsstore is slow, so cache that depends on obsstore >>>> content >>>> wants a way to know if the obsstore changed, and it this change was >>>> append only. >>>> >>>> For this purpose we introduce an official cachekey for the obsstore. This >>>> cache >>>> key work in a way similar to the '(tiprev, tipnode)' pair used for the >>>> changelog. We use the size of the obsstore file and the hash of its tail. >>>> That >>>> way, we can check if the obsstore grew and if the content we knew is >>>> still >>>> present in the obsstore. >>> >>> >>> Jun suggested to me personally that we could use a UUID and the size >>> as a cache key. The UUID would be replaced only when the file was >>> stripped. It could be stored inside a fake obsmarker as the first >>> obsmarker in the file. That sounds like a good idea to me. What do you >>> think? >> >> >> We can't guarantee old clients doing strip will update that UID. > > There are old clients that strip the obsstore? I thought that was a new feature. There have been a couple more arcane way to do it for some times (debugobsolete). Also: rollback. 
>> In addition, this breaks the happen-onlyness of the obsstore. > > What depends on the append-onlyness? Rollback for one. Also the current format is defined as append only and used by such by a couple of things (eg: the evolve extension). We can introduces a new format (and that will likely happens in the coming years) but it is not clear that dropping the append only property is good idea. It make things simpler, more scalable and more cache friendly. The current cache is purely append only (size + hash of N last bytes). A way I can see to make it flawless in the new format is to have small "slot" at least every N bytes that does not contains actual data but a small hash of the previous N bytes. That keeps the nice append only property while ensuring the full content is reflected in the hash. Cheers
On 06/06/2017 06:01 PM, Martin von Zweigbergk wrote: > Here are some timings from a hopefully unbiased third party. These are > best-of-20 times, in seconds. No extensions enabled. I ran > "debugupdatecaches" before each series of commands to populate the > caches (I think both obscache and the radixlink cache are hooked into > that command). > > > | At @ | obscache | radixlink | > id -r @ | 0.70 | 0.38 | 0.42 | > log -r @ | 1.60 | 1.08 | 0.59 | > export @ | 0.71 | 0.33 | 0.59 | > log -G -r 'draft()' | 1.19 | 1.22 | 0.74 | > log -G -r 'draft()' \ | 1.26 | 1.27 | 0.97 | > -T '{node} {troubles}' | | | | > > We write Mercurial for users, so I've included commands similar to > what I use somewhat often (plus "hg id" because Pierre-Yves and Jun > like it). > > So radixlink seems to provide a nice speedup across the board. So I > vote for getting that series in good enough shape to include it and > then we can come *consider* adding obscache on top after. I know > Pierre-Yves wanted to get the cachekey and dualsourcecache pieces in > at least to simplify further work in the evolve extension, but I'm > very reluctant to add otherwise unused code (more than just a few > lines) to core for that reason. What do other reviewers think? I'm not advocating for introducing dead code. The obsstore indexes(radix-link) are a cache layer to the obsstore. As a cache, the indexes need a cache-key and associated logic for incremental update. What I need here is to have this logic "standard" and reusable by other caches. I can split the obsstore specific part from the changelog part and make it available, either used by the obscache or by the indexes note: current indexes cache key is fragile (file length check only) and would benefit from the stronger cachekey+logic) available in this series. Cheers,
Excerpts from Pierre-Yves David's message of 2017-06-07 15:27:41 +0100: > I'm not advocating for introducing dead code. The obsstore > indexes(radix-link) are a cache layer to the obsstore. As a cache, the > indexes need a cache-key and associated logic for incremental update. The problem I see for your cache-key approach is it has to have length information to be able to do incremental update. So it's not a typical cache key. I'd prefer the UUID + length approach. We have "length" implemented today. > What I need here is to have this logic "standard" and reusable by other > caches. I can split the obsstore specific part from the changelog part > and make it available, either used by the obscache or by the indexes > > note: current indexes cache key is fragile (file length check only) and > would benefit from the stronger cachekey+logic) available in this series. I'm not sure checking 200 bytes (in your current approach) is not fragile. > Cheers, >
On 06/07/2017 03:57 PM, Jun Wu wrote: > Excerpts from Pierre-Yves David's message of 2017-06-07 15:27:41 +0100: >> I'm not advocating for introducing dead code. The obsstore >> indexes(radix-link) are a cache layer to the obsstore. As a cache, the >> indexes need a cache-key and associated logic for incremental update. > > The problem I see for your cache-key approach is it has to have length > information to be able to do incremental update. So it's not a typical cache > key. I'm not sure what you mean here. The cache key used to validate the cache and detect changes is only: <size-of-obsstore, hash-from-content> This is based on the same things as the cache key you uses for indexes (size of obsstore); with an additional content checks (hash) in my proposal. As of this series was written, directly obtaining markers from an (old, new) offset of the obsstore is not possible. The use of "number of obsmarkers" by the abstract cache is here to work around this. One of the goal of having the code in core was to follow up with a rework of the obsmarkers reading API to allow such partial read and drop the "number of markers" hack. There a comment point this out in the code I sent. I see that you have done in your indexes series all the refactoring that I needed (Mostly patches 5,6 and 12). Thanks you for that. (Note: evolve grown a hacky version of this some weeks back and is not longer using the "number markers" in practice: https://www.mercurial-scm.org/repo/evolve/file/85cabd631a1b/hgext3rd/evolve/obscache.py#l130 ) In addition, I'm not sure what you mean by "not typical". Most cache in Mercurial have "tiprev" as part of the cachedkey and uses to compute their incremental update. > I'd prefer the UUID + length approach. We have "length" implemented today. The UUID approach requires breaking append-onlyness or using multiple file. It is less friendly with to (including wide transaction rollback) and races are gets annoying to avoid. 
The append-only based approach do not suffer from these drawback making it more appealing. >> What I need here is to have this logic "standard" and reusable by other >> caches. I can split the obsstore specific part from the changelog part >> and make it available, either used by the obscache or by the indexes >> >> note: current indexes cache key is fragile (file length check only) and >> would benefit from the stronger cachekey+logic) available in this series. > > I'm not sure checking 200 bytes (in your current approach) is not fragile. It is less fragile. Obsmarkers created in the same repo has a high odd to have the same size, the bytes hashing is catching content change. We can increase the number of bytes we read a bit to reinforce it if this turn out to be too low. See my other email for idea on how to make it more complete: https://www.mercurial-scm.org/pipermail/mercurial-devel/2017-June/099317.html Cheers,
Excerpts from Pierre-Yves David's message of 2017-06-07 18:04:22 +0100: > > On 06/07/2017 03:57 PM, Jun Wu wrote: > > Excerpts from Pierre-Yves David's message of 2017-06-07 15:27:41 +0100: > >> I'm not advocating for introducing dead code. The obsstore > >> indexes(radix-link) are a cache layer to the obsstore. As a cache, the > >> indexes need a cache-key and associated logic for incremental update. > > > > The problem I see for your cache-key approach is it has to have length > > information to be able to do incremental update. So it's not a typical cache > > key. > > I'm not sure what you mean here. The cache key used to validate the > cache and detect changes is only: > > <size-of-obsstore, hash-from-content> > > This is based on the same things as the cache key you uses for indexes > (size of obsstore); with an additional content checks (hash) in my proposal. I will not call that a "cache key". I think the common understanding of a "cache key" is, it only supports Eq operation and does not support PartialEq. The UUID (or whatever ID) will be a cleaner concept - if the ID changes, invalidate the whole cache. > As of this series was written, directly obtaining markers from an (old, > new) offset of the obsstore is not possible. The use of "number of > obsmarkers" by the abstract cache is here to work around this. One of > the goal of having the code in core was to follow up with a rework of > the obsmarkers reading API to allow such partial read and drop the > "number of markers" hack. There a comment point this out in the code I sent. > > I see that you have done in your indexes series all the refactoring that > I needed (Mostly patches 5,6 and 12). Thanks you for that. > > (Note: evolve grown a hacky version of this some weeks back and is not > longer using the "number markers" in practice: > > https://www.mercurial-scm.org/repo/evolve/file/85cabd631a1b/hgext3rd/evolve/obscache.py#l130 > ) > > In addition, I'm not sure what you mean by "not typical". 
Most cache in > Mercurial have "tiprev" as part of the cachedkey and uses to compute > their incremental update. > > > I'd prefer the UUID + length approach. We have "length" implemented today. > > The UUID approach requires breaking append-onlyness or using multiple > file. It is less friendly with to (including wide transaction rollback) > and races are gets annoying to avoid. Could you give an example of how race could happen and why is it not append-only if the id is stored in a separate file? I think the only real problematic case is old clients. > The append-only based approach do not suffer from these drawback making > it more appealing. > > >> What I need here is to have this logic "standard" and reusable by other > >> caches. I can split the obsstore specific part from the changelog part > >> and make it available, either used by the obscache or by the indexes > >> > >> note: current indexes cache key is fragile (file length check only) and > >> would benefit from the stronger cachekey+logic) available in this series. > > > > I'm not sure checking 200 bytes (in your current approach) is not fragile. > > It is less fragile. Obsmarkers created in the same repo has a high odd > to have the same size, the bytes hashing is catching content change. > We can increase the number of bytes we read a bit to reinforce it if > this turn out to be too low. But it is still problematic. Do you agree? > See my other email for idea on how to make it more complete: > > https://www.mercurial-scm.org/pipermail/mercurial-devel/2017-June/099317.html > > Cheers, >
On Wed, Jun 7, 2017 at 10:27 AM, Jun Wu <quark@fb.com> wrote: > Excerpts from Pierre-Yves David's message of 2017-06-07 18:04:22 +0100: > > > > On 06/07/2017 03:57 PM, Jun Wu wrote: > > > Excerpts from Pierre-Yves David's message of 2017-06-07 15:27:41 +0100: > > >> I'm not advocating for introducing dead code. The obsstore > > >> indexes(radix-link) are a cache layer to the obsstore. As a cache, the > > >> indexes need a cache-key and associated logic for incremental update. > > > > > > The problem I see for your cache-key approach is it has to have length > > > information to be able to do incremental update. So it's not a typical > cache > > > key. > > > > I'm not sure what you mean here. The cache key used to validate the > > cache and detect changes is only: > > > > <size-of-obsstore, hash-from-content> > > > > This is based on the same things as the cache key you uses for indexes > > (size of obsstore); with an additional content checks (hash) in my > proposal. > > I will not call that a "cache key". I think the common understanding of a > "cache key" is, it only supports Eq operation and does not support > PartialEq. > > The UUID (or whatever ID) will be a cleaner concept - if the ID changes, > invalidate the whole cache. > > > As of this series was written, directly obtaining markers from an (old, > > new) offset of the obsstore is not possible. The use of "number of > > obsmarkers" by the abstract cache is here to work around this. One of > > the goal of having the code in core was to follow up with a rework of > > the obsmarkers reading API to allow such partial read and drop the > > "number of markers" hack. There a comment point this out in the code I > sent. > > > > I see that you have done in your indexes series all the refactoring that > > I needed (Mostly patches 5,6 and 12). Thanks you for that. 
> > > > (Note: evolve grown a hacky version of this some weeks back and is not > > longer using the "number markers" in practice: > > > > https://www.mercurial-scm.org/repo/evolve/file/85cabd631a1b/ > hgext3rd/evolve/obscache.py#l130 > > ) > > > > In addition, I'm not sure what you mean by "not typical". Most cache in > > Mercurial have "tiprev" as part of the cachedkey and uses to compute > > their incremental update. > > > > > I'd prefer the UUID + length approach. We have "length" implemented > today. > > > > The UUID approach requires breaking append-onlyness or using multiple > > file. It is less friendly with to (including wide transaction rollback) > > and races are gets annoying to avoid. > > Could you give an example of how race could happen and why is it not > append-only if the id is stored in a separate file? > > I think the only real problematic case is old clients. > > > The append-only based approach do not suffer from these drawback making > > it more appealing. > > > > >> What I need here is to have this logic "standard" and reusable by > other > > >> caches. I can split the obsstore specific part from the changelog part > > >> and make it available, either used by the obscache or by the indexes > > >> > > >> note: current indexes cache key is fragile (file length check only) > and > > >> would benefit from the stronger cachekey+logic) available in this > series. > > > > > > I'm not sure checking 200 bytes (in your current approach) is not > fragile. > > > > It is less fragile. Obsmarkers created in the same repo has a high odd > > to have the same size, the bytes hashing is catching content change. > > We can increase the number of bytes we read a bit to reinforce it if > > this turn out to be too low. > > But it is still problematic. Do you agree? 
> > > See my other email for idea on how to make it more complete: > > > > https://www.mercurial-scm.org/pipermail/mercurial-devel/2017 > -June/099317.html This thread and the larger conversation around obsolescence perf improvements is massive. While it appears that radixlink and obscache are complementary proposals, they both aim to achieve similar end results. It isn't clear to me that both of them will be needed. Given: a) the radixlink work has use cases outside of obsolescence b) there are reports from Facebook that obscache isn't huge repo friendly and therefore not as ready as radixlink c) it appears that the obscache feature can be implemented out of core (presumably in the evolve extension) more easily than radixlink can d) obscache currently only seems to provide incremental performance wins over radixlink for non-trivial complexity and at least 1 reviewer is already expressed hesitations ( https://www.mercurial-scm.org/pipermail/mercurial-devel/2017-June/099040.html ) I favor tabling this obscache series and focusing on landing radixlink. Best case is we do that, obsolescence scales well enough, we're done, and we can use radixlink for other things. Worst case is it doesn't pan out, we rip it out of obsolscence once something better comes along. Realistically, it lands a perf win today, frees up Jun :), unblocks other radixlink work, and Pierre-Yves later demonstrates how things could be even better, we listen to him, and accept his patches, possibly before the 4.3 release. I hate saying "no" (really: "just put this on hold for a week or two") to someone and playing kingmaker. But I feel the debate is causing too much "stop energy" and is demotivating everyone. I prefer we fall forward and make some progress, even if it isn't perfect. And to maximize the chances of forward progress, I think we need to focus on one solution to obsolescence performance at a time. 
I perceive the implementation with the most value to be radixlink because it can be used outside obsolescence and has already demonstrated merit over obscache on huge repos.
Patch
diff --git a/mercurial/obsolete.py b/mercurial/obsolete.py --- a/mercurial/obsolete.py +++ b/mercurial/obsolete.py @@ -70,6 +70,7 @@ comment associated with each format for from __future__ import absolute_import import errno +import hashlib import struct from .i18n import _ @@ -547,6 +548,8 @@ class obsstore(object): # parents: (tuple of nodeid) or None, parents of precursors # None is used when no data has been recorded + _obskeysize = 200 + def __init__(self, svfs, defaultformat=_fm1version, readonly=False): # caches for various obsolescence related cache self.caches = {} @@ -574,6 +577,46 @@ class obsstore(object): __bool__ = __nonzero__ + def cachekey(self, index=None): + """return (current-length, cachekey) + + 'current-length' is the current length of the obsstore storage file, + 'cachekey' is the hash of the last 200 bytes ending at 'index'. + + If 'index' is unspecified, current obsstore length is used. + Cachekey will be set to nullid if the obsstore is empty. + 'current-length' is -always- the current obsstore length, regardless of + the 'index' value. + + If the index specified is higher than the current obsstore file + length, cachekey will be set to None.""" + # default value + obsstoresize = 0 + keydata = '' + # try to get actual data from the obsstore + try: + with self.svfs('obsstore') as obsfile: + obsfile.seek(0, 2) + obsstoresize = obsfile.tell() + if index is None: + index = obsstoresize + elif obsstoresize < index: + return obsstoresize, None + actualsize = min(index, self._obskeysize) + if actualsize: + obsfile.seek(index - actualsize, 0) + keydata = obsfile.read(actualsize) + except (OSError, IOError) as e: + if e.errno != errno.ENOENT: + raise + if keydata: + key = hashlib.sha1(keydata).digest() + else: + # reusing an existing "empty" value makes it easier to define a + # default cachekey for 'no data'. + key = node.nullid + return obsstoresize, key + @property def readonly(self): """True if marker creation is disabled