Submitter | Mads Kiilerich |
---|---|
Date | Oct. 15, 2014, 6:14 p.m. |
Message ID | <bf7a0169677c0545a63e.1413396862@ssl.google-analytics.com> |
Download | mbox | patch |
Permalink | /patch/6278/ |
State | Superseded |
Headers | show |
Comments
On Wed, Oct 15, 2014 at 08:14:22PM +0200, Mads Kiilerich wrote: > # HG changeset patch > # User Mads Kiilerich <madski@unity3d.com> > # Date 1413396806 -7200 > # Wed Oct 15 20:13:26 2014 +0200 > # Node ID bf7a0169677c0545a63e64690b0e49e50b376703 > # Parent 48c0b101a9de1fdbd638daa858da845cd05a6be7 > localrepo: persistent caching of branch names > > It is expensive to retrieve the branch name. Very expensive when creating a > changectx and calling .branch() - slightly less when using > changelog.branchinfo(). > > Now, to really speed things up, cache the results on disk. For each repo > revision store the node hash and a reference to the branch name. To avoid using > too much space, each branch name is only stored once. To make it 100% stable > against repository mutations, always check the node hash before using the > cache content. So we only store one entry for each branch? Maybe we could do something more LRU-ish to have bounded storage but still get more performance benefit? > > This change promise to speed some operations up 4-6 times when it actually is > used. Can you include some before/after on some operations in the log message? > > A simpler approach that didn't store and validate node hashes for every > revision had a 20 x speedup but could be tricked when modifying history. It > would usually reset the cache, but when trying very hard it could be tricked > into not noticing changes. Can you elaborate on this? I'm confused how that would happen (maybe an old hg doing history editing?) > > diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py > --- a/mercurial/localrepo.py > +++ b/mercurial/localrepo.py > @@ -6,7 +6,7 @@ > # GNU General Public License version 2 or any later version. > from node import hex, nullid, short > from i18n import _ > -import urllib > +import urllib, struct, array > import peer, changegroup, subrepo, pushkey, obsolete, repoview > import changelog, dirstate, filelog, manifest, context, bookmarks, phases > import lock as lockmod > @@ -21,6 +21,14 @@ import branchmap, pathutil > propertycache = util.propertycache > filecache = scmutil.filecache > > +# branch name caching > +bcfilename = 'cache/branchnames' > +bcversion = 2345164374 > +bcheadfmt = '>LLL' > +bcheadsize = struct.calcsize(bcheadfmt) > +bcrecfmt = '>20sH' > +bcrecsize = struct.calcsize(bcrecfmt) > + > class repofilecache(filecache): > """All filecache usage on repo are done for logic that should be unfiltered > """ > @@ -179,6 +187,7 @@ class localrepository(object): > openerreqs = set(('revlogv1', 'generaldelta')) > requirements = ['revlogv1'] > filtername = None > + _branchcachedirty = None > > # a list of (ui, featureset) functions. > # only functions defined in module of enabled extensions are invoked > @@ -298,7 +307,7 @@ class localrepository(object): > self.filteredrevcache = {} > > def close(self): > - pass > + self._branchcachesave() > > def _restrictcapabilities(self, caps): > # bundle2 is not ready for prime time, drop it unless explicitly > @@ -723,6 +732,74 @@ class localrepository(object): > repo = (remote and remote.local()) and remote or self > return repo[key].branch() > > + def _branchcacheload(self): > + """Load cached branch values.""" > + try: > + data = self.vfs.open(bcfilename).read() > + except IOError: > + data = '' > + > + self._branches = [] > + self._branchrecs = array.array('c') # bytes of struct type bcrecfmt > + self.__dict__['_branchcachedirty'] = True Why are you poking __dict__ instead of just assigning to the property? That at least merits a comment, but if I had to guess it more merits a cleanup of some logic. > + reporecslen = len(self) * bcrecsize > + if len(data) >= bcheadsize: > + v, recsstart, recslen = struct.unpack_from(bcheadfmt, data) > + if v == bcversion and len(data) == recsstart + recslen: > + if recsstart: > + self._branches = \ > + data[bcheadsize:recsstart].split('\0') > + self._branchrecs.fromstring( > + buffer(data, recsstart, min(recslen, reporecslen))) > + self.__dict__['_branchcachedirty'] = recslen > reporecslen > + else: > + self.ui.debug('branch cache file was invalid\n') > + > + if len(self._branchrecs) < reporecslen: > + self._branchrecs.extend( > + '\xff' * (reporecslen - len(self._branchrecs))) > + > + self._branchnamesindex = dict((b, r) > + for r, b in enumerate(self._branches)) > + > + def branch(self, rev): > + """return branch name of rev, using and updating persistent cache.""" > + if self._branchcachedirty is None: I can't say I'm a fan of the None/False/True three-state boolean thing going on here. Can we do better somehow? Perhaps by moving the branchcache into its own object rather than wedging it into repofilecache? > + self._branchcacheload() > + > + node = self.changelog.node(rev) > + cachenode, branchidx = struct.unpack_from(bcrecfmt, self._branchrecs, > + rev * bcrecsize) > + if cachenode == node and branchidx < len(self._branches): > + return self._branches[branchidx] > + b, _close = self.changelog.branchinfo(rev) > + if b in self._branchnamesindex: > + branchidx = self._branchnamesindex[b] > + else: > + branchidx = len(self._branches) > + self._branches.append(b) > + self._branchnamesindex[b] = branchidx > + struct.pack_into(bcrecfmt, self._branchrecs, rev * bcrecsize, > + node, branchidx) > + self.__dict__['_branchcachedirty'] = True > + return b > + > + def _branchcachesave(self): > + """save branch cache if it is dirty""" > + if self._branchcachedirty: > + self.ui.debug('writing branch cache file\n') > + try: > + f = self.vfs.open(bcfilename, 'w', atomictemp=True) > + s = '\0'.join(self._branches) > + f.write(struct.pack(bcheadfmt, bcversion, > + bcheadsize + len(s), len(self._branchrecs))) > + f.write(s) > + f.write(self._branchrecs) > + f.close() > + except IOError: > + pass > + self.__dict__['_branchcachedirty'] = False > + > def known(self, nodes): > nm = self.changelog.nodemap > pc = self._phasecache > _______________________________________________ > Mercurial-devel mailing list > Mercurial-devel@selenic.com > http://selenic.com/mailman/listinfo/mercurial-devel
On Wed, Oct 15, 2014 at 08:14:22PM +0200, Mads Kiilerich wrote: > # HG changeset patch > # User Mads Kiilerich <madski@unity3d.com> > # Date 1413396806 -7200 > # Wed Oct 15 20:13:26 2014 +0200 > # Node ID bf7a0169677c0545a63e64690b0e49e50b376703 > # Parent 48c0b101a9de1fdbd638daa858da845cd05a6be7 > localrepo: persistent caching of branch names > > It is expensive to retrieve the branch name. Very expensive when creating a > changectx and calling .branch() - slightly less when using > changelog.branchinfo(). > > Now, to really speed things up, cache the results on disk. For each repo > revision store the node hash and a reference to the branch name. To avoid using > too much space, each branch name is only stored once. To make it 100% stable > against repository mutations, always check the node hash before using the > cache content. > > This change promise to speed some operations up 4-6 times when it actually is > used. > > A simpler approach that didn't store and validate node hashes for every > revision had a 20 x speedup but could be tricked when modifying history. It > would usually reset the cache, but when trying very hard it could be tricked > into not noticing changes. > > diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py > --- a/mercurial/localrepo.py > +++ b/mercurial/localrepo.py > @@ -6,7 +6,7 @@ > # GNU General Public License version 2 or any later version. > from node import hex, nullid, short > from i18n import _ > -import urllib > +import urllib, struct, array > import peer, changegroup, subrepo, pushkey, obsolete, repoview > import changelog, dirstate, filelog, manifest, context, bookmarks, phases > import lock as lockmod > @@ -21,6 +21,14 @@ import branchmap, pathutil > propertycache = util.propertycache > filecache = scmutil.filecache > > +# branch name caching > +bcfilename = 'cache/branchnames' > +bcversion = 2345164374 > +bcheadfmt = '>LLL' > +bcheadsize = struct.calcsize(bcheadfmt) > +bcrecfmt = '>20sH' > +bcrecsize = struct.calcsize(bcrecfmt) > + > class repofilecache(filecache): > """All filecache usage on repo are done for logic that should be unfiltered > """ > @@ -179,6 +187,7 @@ class localrepository(object): > openerreqs = set(('revlogv1', 'generaldelta')) > requirements = ['revlogv1'] > filtername = None > + _branchcachedirty = None > > # a list of (ui, featureset) functions. > # only functions defined in module of enabled extensions are invoked > @@ -298,7 +307,7 @@ class localrepository(object): Oh, and I'm not enthusiastic about stuffing new behavior into localrepo. I'm open to suggestions on how we can better factor this. > self.filteredrevcache = {} > > def close(self): > - pass > + self._branchcachesave() > > def _restrictcapabilities(self, caps): > # bundle2 is not ready for prime time, drop it unless explicitly > @@ -723,6 +732,74 @@ class localrepository(object): > repo = (remote and remote.local()) and remote or self > return repo[key].branch() > > + def _branchcacheload(self): > + """Load cached branch values.""" > + try: > + data = self.vfs.open(bcfilename).read() > + except IOError: > + data = '' > + > + self._branches = [] > + self._branchrecs = array.array('c') # bytes of struct type bcrecfmt > + self.__dict__['_branchcachedirty'] = True > + reporecslen = len(self) * bcrecsize > + if len(data) >= bcheadsize: > + v, recsstart, recslen = struct.unpack_from(bcheadfmt, data) > + if v == bcversion and len(data) == recsstart + recslen: > + if recsstart: > + self._branches = \ > + data[bcheadsize:recsstart].split('\0') > + self._branchrecs.fromstring( > + buffer(data, recsstart, min(recslen, reporecslen))) > + self.__dict__['_branchcachedirty'] = recslen > reporecslen > + else: > + self.ui.debug('branch cache file was invalid\n') > + > + if len(self._branchrecs) < reporecslen: > + self._branchrecs.extend( > + '\xff' * (reporecslen - len(self._branchrecs))) > + > + self._branchnamesindex = dict((b, r) > + for r, b in enumerate(self._branches)) > + > + def branch(self, rev): > + """return branch name of rev, using and updating persistent cache.""" > + if self._branchcachedirty is None: > + self._branchcacheload() > + > + node = self.changelog.node(rev) > + cachenode, branchidx = struct.unpack_from(bcrecfmt, self._branchrecs, > + rev * bcrecsize) > + if cachenode == node and branchidx < len(self._branches): > + return self._branches[branchidx] > + b, _close = self.changelog.branchinfo(rev) > + if b in self._branchnamesindex: > + branchidx = self._branchnamesindex[b] > + else: > + branchidx = len(self._branches) > + self._branches.append(b) > + self._branchnamesindex[b] = branchidx > + struct.pack_into(bcrecfmt, self._branchrecs, rev * bcrecsize, > + node, branchidx) > + self.__dict__['_branchcachedirty'] = True > + return b > + > + def _branchcachesave(self): > + """save branch cache if it is dirty""" > + if self._branchcachedirty: > + self.ui.debug('writing branch cache file\n') > + try: > + f = self.vfs.open(bcfilename, 'w', atomictemp=True) > + s = '\0'.join(self._branches) > + f.write(struct.pack(bcheadfmt, bcversion, > + bcheadsize + len(s), len(self._branchrecs))) > + f.write(s) > + f.write(self._branchrecs) > + f.close() > + except IOError: > + pass > + self.__dict__['_branchcachedirty'] = False > + > def known(self, nodes): > nm = self.changelog.nodemap > pc = self._phasecache > _______________________________________________ > Mercurial-devel mailing list > Mercurial-devel@selenic.com > http://selenic.com/mailman/listinfo/mercurial-devel
On 10/15/2014 08:32 PM, Augie Fackler wrote: > On Wed, Oct 15, 2014 at 08:14:22PM +0200, Mads Kiilerich wrote: >> # HG changeset patch >> # User Mads Kiilerich <madski@unity3d.com> >> # Date 1413396806 -7200 >> # Wed Oct 15 20:13:26 2014 +0200 >> # Node ID bf7a0169677c0545a63e64690b0e49e50b376703 >> # Parent 48c0b101a9de1fdbd638daa858da845cd05a6be7 >> localrepo: persistent caching of branch names >> >> It is expensive to retrieve the branch name. Very expensive when creating a >> changectx and calling .branch() - slightly less when using >> changelog.branchinfo(). >> >> Now, to really speed things up, cache the results on disk. For each repo >> revision store the node hash and a reference to the branch name. To avoid using >> too much space, each branch name is only stored once. To make it 100% stable >> against repository mutations, always check the node hash before using the >> cache content. > So we only store one entry for each branch? Maybe we could do > something more LRU-ish to have bounded storage but still get more > performance benefit? Eh. We only store the name of each branch once in a list. For each revision we store the index of the corresponding branch name. A hg log -r 'branch(stable)' has to retrieve the branch name of every single revision in the repository. (That could be avoided by having clever indices that are closely integrated with the revset query plans ... but that sounds like a project for another year. This cache can help until we get there.) I don't think anything LRU-ish can help. >> This change promise to speed some operations up 4-6 times when it actually is >> used. > Can you include some before/after on some operations in the log message? The use of caches do that measurements doesn't tell much. It is not just an optimization but a different complexity / cost profile. But I could add some. > >> A simpler approach that didn't store and validate node hashes for every >> revision had a 20 x speedup but could be tricked when modifying history. It >> would usually reset the cache, but when trying very hard it could be tricked >> into not noticing changes. > Can you elaborate on this? I'm confused how that would happen (maybe > an old hg doing history editing?) That is a description of the straw man proposal in v1 of this patch. It did not verify all hashes, only the tip most - like branch head caches do. The worst case could happen if someone is stripping the tipmost revision to uncover a non-parent revision, amending the new tip to use another branch, and pull the stripped tip revision back. That will fool the branch head cache and v1 of this patch. We can't guarantee nothing changed unless we check everything ... or rely on someone telling us. > >> diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py >> --- a/mercurial/localrepo.py >> +++ b/mercurial/localrepo.py >> @@ -6,7 +6,7 @@ >> # GNU General Public License version 2 or any later version. >> from node import hex, nullid, short >> from i18n import _ >> -import urllib >> +import urllib, struct, array >> import peer, changegroup, subrepo, pushkey, obsolete, repoview >> import changelog, dirstate, filelog, manifest, context, bookmarks, phases >> import lock as lockmod >> @@ -21,6 +21,14 @@ import branchmap, pathutil >> propertycache = util.propertycache >> filecache = scmutil.filecache >> >> +# branch name caching >> +bcfilename = 'cache/branchnames' >> +bcversion = 2345164374 >> +bcheadfmt = '>LLL' >> +bcheadsize = struct.calcsize(bcheadfmt) >> +bcrecfmt = '>20sH' >> +bcrecsize = struct.calcsize(bcrecfmt) >> + >> class repofilecache(filecache): >> """All filecache usage on repo are done for logic that should be unfiltered >> """ >> @@ -179,6 +187,7 @@ class localrepository(object): >> openerreqs = set(('revlogv1', 'generaldelta')) >> requirements = ['revlogv1'] >> filtername = None >> + _branchcachedirty = None >> >> # a list of (ui, featureset) functions. >> # only functions defined in module of enabled extensions are invoked >> @@ -298,7 +307,7 @@ class localrepository(object): >> self.filteredrevcache = {} >> >> def close(self): >> - pass >> + self._branchcachesave() >> >> def _restrictcapabilities(self, caps): >> # bundle2 is not ready for prime time, drop it unless explicitly >> @@ -723,6 +732,74 @@ class localrepository(object): >> repo = (remote and remote.local()) and remote or self >> return repo[key].branch() >> >> + def _branchcacheload(self): >> + """Load cached branch values.""" >> + try: >> + data = self.vfs.open(bcfilename).read() >> + except IOError: >> + data = '' >> + >> + self._branches = [] >> + self._branchrecs = array.array('c') # bytes of struct type bcrecfmt >> + self.__dict__['_branchcachedirty'] = True > Why are you poking __dict__ instead of just assigning to the property? > That at least merits a comment, but if I had to guess it more merits a > cleanup of some logic. I wish I knew. It is related to the filtering crazyness. There is a comment elsewhere: if '_tagscache' in vars(self): # can't use delattr on proxy del self.__dict__['_tagscache'] Doing it the obvious way doesn't work. It is possible that some combination of .filtered and .unfiltered could work. I could add a comment like that every time I have to use that hack, but I guess it is or should be common knowledge that localrepo is special. > >> + reporecslen = len(self) * bcrecsize >> + if len(data) >= bcheadsize: >> + v, recsstart, recslen = struct.unpack_from(bcheadfmt, data) >> + if v == bcversion and len(data) == recsstart + recslen: >> + if recsstart: >> + self._branches = \ >> + data[bcheadsize:recsstart].split('\0') >> + self._branchrecs.fromstring( >> + buffer(data, recsstart, min(recslen, reporecslen))) >> + self.__dict__['_branchcachedirty'] = recslen > reporecslen >> + else: >> + self.ui.debug('branch cache file was invalid\n') >> + >> + if len(self._branchrecs) < reporecslen: >> + self._branchrecs.extend( >> + '\xff' * (reporecslen - len(self._branchrecs))) >> + >> + self._branchnamesindex = dict((b, r) >> + for r, b in enumerate(self._branches)) >> + >> + def branch(self, rev): >> + """return branch name of rev, using and updating persistent cache.""" >> + if self._branchcachedirty is None: > I can't say I'm a fan of the None/False/True three-state boolean thing > going on here. Can we do better somehow? Perhaps by moving the > branchcache into its own object rather than wedging it into > repofilecache? Doing it that way would add an extra indirection and we do not like classes. It would have to be non-intrusive when constructing localrepos but efficient when using them. I can try to wrap it around and come up with something. /Mads > >> + self._branchcacheload() >> + >> + node = self.changelog.node(rev) >> + cachenode, branchidx = struct.unpack_from(bcrecfmt, self._branchrecs, >> + rev * bcrecsize) >> + if cachenode == node and branchidx < len(self._branches): >> + return self._branches[branchidx] >> + b, _close = self.changelog.branchinfo(rev) >> + if b in self._branchnamesindex: >> + branchidx = self._branchnamesindex[b] >> + else: >> + branchidx = len(self._branches) >> + self._branches.append(b) >> + self._branchnamesindex[b] = branchidx >> + struct.pack_into(bcrecfmt, self._branchrecs, rev * bcrecsize, >> + node, branchidx) >> + self.__dict__['_branchcachedirty'] = True >> + return b >> + >> + def _branchcachesave(self): >> + """save branch cache if it is dirty""" >> + if self._branchcachedirty: >> + self.ui.debug('writing branch cache file\n') >> + try: >> + f = self.vfs.open(bcfilename, 'w', atomictemp=True) >> + s = '\0'.join(self._branches) >> + f.write(struct.pack(bcheadfmt, bcversion, >> + bcheadsize + len(s), len(self._branchrecs))) >> + f.write(s) >> + f.write(self._branchrecs) >> + f.close() >> + except IOError: >> + pass >> + self.__dict__['_branchcachedirty'] = False >> + >> def known(self, nodes): >> nm = self.changelog.nodemap >> pc = self._phasecache >> _______________________________________________ >> Mercurial-devel mailing list >> Mercurial-devel@selenic.com >> http://selenic.com/mailman/listinfo/mercurial-devel
On Wed, Oct 15, 2014 at 2:57 PM, Mads Kiilerich <mads@kiilerich.com> wrote: > On 10/15/2014 08:32 PM, Augie Fackler wrote: >> >> On Wed, Oct 15, 2014 at 08:14:22PM +0200, Mads Kiilerich wrote: >>> >>> # HG changeset patch >>> # User Mads Kiilerich <madski@unity3d.com> >>> # Date 1413396806 -7200 >>> # Wed Oct 15 20:13:26 2014 +0200 >>> # Node ID bf7a0169677c0545a63e64690b0e49e50b376703 >>> # Parent 48c0b101a9de1fdbd638daa858da845cd05a6be7 >>> localrepo: persistent caching of branch names >>> >>> It is expensive to retrieve the branch name. Very expensive when creating >>> a >>> changectx and calling .branch() - slightly less when using >>> changelog.branchinfo(). >>> >>> Now, to really speed things up, cache the results on disk. For each repo >>> revision store the node hash and a reference to the branch name. To avoid >>> using >>> too much space, each branch name is only stored once. To make it 100% >>> stable >>> against repository mutations, always check the node hash before using the >>> cache content. >> >> So we only store one entry for each branch? Maybe we could do >> something more LRU-ish to have bounded storage but still get more >> performance benefit? > > > Eh. We only store the name of each branch once in a list. For each revision > we store the index of the corresponding branch name. Oh, I see. I think the format description in the changlog message could use some help then, because I *wildly* misunderstood what you are doing until this email. > > A hg log -r 'branch(stable)' has to retrieve the branch name of every single > revision in the repository. (That could be avoided by having clever indices > that are closely integrated with the revset query plans ... but that sounds > like a project for another year. This cache can help until we get there.) > > I don't think anything LRU-ish can help. > [snip] >>> + >>> + self._branches = [] >>> + self._branchrecs = array.array('c') # bytes of struct type >>> bcrecfmt >>> + self.__dict__['_branchcachedirty'] = True >> >> Why are you poking __dict__ instead of just assigning to the property? >> That at least merits a comment, but if I had to guess it more merits a >> cleanup of some logic. > > > I wish I knew. It is related to the filtering crazyness. There is a comment > elsewhere: > > if '_tagscache' in vars(self): > # can't use delattr on proxy > del self.__dict__['_tagscache'] > > Doing it the obvious way doesn't work. It is possible that some combination > of .filtered and .unfiltered could work. > > I could add a comment like that every time I have to use that hack, but I > guess it is or should be common knowledge that localrepo is special. Urgh. That's awful. > > >> >>> + reporecslen = len(self) * bcrecsize >>> + if len(data) >= bcheadsize: >>> + v, recsstart, recslen = struct.unpack_from(bcheadfmt, data) >>> + if v == bcversion and len(data) == recsstart + recslen: >>> + if recsstart: >>> + self._branches = \ >>> + data[bcheadsize:recsstart].split('\0') >>> + self._branchrecs.fromstring( >>> + buffer(data, recsstart, min(recslen, reporecslen))) >>> + self.__dict__['_branchcachedirty'] = recslen > >>> reporecslen >>> + else: >>> + self.ui.debug('branch cache file was invalid\n') >>> + >>> + if len(self._branchrecs) < reporecslen: >>> + self._branchrecs.extend( >>> + '\xff' * (reporecslen - len(self._branchrecs))) >>> + >>> + self._branchnamesindex = dict((b, r) >>> + for r, b in >>> enumerate(self._branches)) >>> + >>> + def branch(self, rev): >>> + """return branch name of rev, using and updating persistent >>> cache.""" >>> + if self._branchcachedirty is None: >> >> I can't say I'm a fan of the None/False/True three-state boolean thing >> going on here. Can we do better somehow? Perhaps by moving the >> branchcache into its own object rather than wedging it into >> repofilecache? > > > Doing it that way would add an extra indirection and we do not like classes. Says who? We've been growing more of them as it makes sense, and I think this is one of those cases where it makes a /ton/ of sense. > It would have to be non-intrusive when constructing localrepos but efficient > when using them. I can try to wrap it around and come up with something. something like class branchcache(object): def __init__(self, path): self._loaded = False self._path = path self._dirty = False def save(): # write and clear self._dirty def load(): # read and clear self._dirty def branch(node): if not self._loaded: elf._load() etc? > > /Mads > > >> >>> + self._branchcacheload() >>> + >>> + node = self.changelog.node(rev) >>> + cachenode, branchidx = struct.unpack_from(bcrecfmt, >>> self._branchrecs, >>> + rev * bcrecsize) >>> + if cachenode == node and branchidx < len(self._branches): >>> + return self._branches[branchidx] >>> + b, _close = self.changelog.branchinfo(rev) >>> + if b in self._branchnamesindex: >>> + branchidx = self._branchnamesindex[b] >>> + else: >>> + branchidx = len(self._branches) >>> + self._branches.append(b) >>> + self._branchnamesindex[b] = branchidx >>> + struct.pack_into(bcrecfmt, self._branchrecs, rev * bcrecsize, >>> + node, branchidx) >>> + self.__dict__['_branchcachedirty'] = True >>> + return b >>> + >>> + def _branchcachesave(self): >>> + """save branch cache if it is dirty""" >>> + if self._branchcachedirty: >>> + self.ui.debug('writing branch cache file\n') >>> + try: >>> + f = self.vfs.open(bcfilename, 'w', atomictemp=True) >>> + s = '\0'.join(self._branches) >>> + f.write(struct.pack(bcheadfmt, bcversion, >>> + bcheadsize + len(s), >>> len(self._branchrecs))) >>> + f.write(s) >>> + f.write(self._branchrecs) >>> + f.close() >>> + except IOError: >>> + pass >>> + self.__dict__['_branchcachedirty'] = False >>> + >>> def known(self, nodes): >>> nm = self.changelog.nodemap >>> pc = self._phasecache >>> _______________________________________________ >>> Mercurial-devel mailing list >>> Mercurial-devel@selenic.com >>> http://selenic.com/mailman/listinfo/mercurial-devel > >
On 10/15/2014 09:02 PM, Augie Fackler wrote: > On Wed, Oct 15, 2014 at 2:57 PM, Mads Kiilerich <mads@kiilerich.com> wrote: >> On 10/15/2014 08:32 PM, Augie Fackler wrote: >>> On Wed, Oct 15, 2014 at 08:14:22PM +0200, Mads Kiilerich wrote: >>>> I can't say I'm a fan of the None/False/True three-state boolean thing >>>> going on here. Can we do better somehow? Perhaps by moving the >>>> branchcache into its own object rather than wedging it into >>>> repofilecache? >> Doing it that way would add an extra indirection and we do not like classes. > Says who? We've been growing more of them as it makes sense, and I > think this is one of those cases where it makes a /ton/ of sense. http://mercurial.selenic.com/wiki/CodingStyle#Classes /Mads
Patch
diff --git a/mercurial/localrepo.py b/mercurial/localrepo.py --- a/mercurial/localrepo.py +++ b/mercurial/localrepo.py @@ -6,7 +6,7 @@ # GNU General Public License version 2 or any later version. from node import hex, nullid, short from i18n import _ -import urllib +import urllib, struct, array import peer, changegroup, subrepo, pushkey, obsolete, repoview import changelog, dirstate, filelog, manifest, context, bookmarks, phases import lock as lockmod @@ -21,6 +21,14 @@ import branchmap, pathutil propertycache = util.propertycache filecache = scmutil.filecache +# branch name caching +bcfilename = 'cache/branchnames' +bcversion = 2345164374 +bcheadfmt = '>LLL' +bcheadsize = struct.calcsize(bcheadfmt) +bcrecfmt = '>20sH' +bcrecsize = struct.calcsize(bcrecfmt) + class repofilecache(filecache): """All filecache usage on repo are done for logic that should be unfiltered """ @@ -179,6 +187,7 @@ class localrepository(object): openerreqs = set(('revlogv1', 'generaldelta')) requirements = ['revlogv1'] filtername = None + _branchcachedirty = None # a list of (ui, featureset) functions. # only functions defined in module of enabled extensions are invoked @@ -298,7 +307,7 @@ class localrepository(object): self.filteredrevcache = {} def close(self): - pass + self._branchcachesave() def _restrictcapabilities(self, caps): # bundle2 is not ready for prime time, drop it unless explicitly @@ -723,6 +732,74 @@ class localrepository(object): repo = (remote and remote.local()) and remote or self return repo[key].branch() + def _branchcacheload(self): + """Load cached branch values.""" + try: + data = self.vfs.open(bcfilename).read() + except IOError: + data = '' + + self._branches = [] + self._branchrecs = array.array('c') # bytes of struct type bcrecfmt + self.__dict__['_branchcachedirty'] = True + reporecslen = len(self) * bcrecsize + if len(data) >= bcheadsize: + v, recsstart, recslen = struct.unpack_from(bcheadfmt, data) + if v == bcversion and len(data) == recsstart + recslen: + if recsstart: + self._branches = \ + data[bcheadsize:recsstart].split('\0') + self._branchrecs.fromstring( + buffer(data, recsstart, min(recslen, reporecslen))) + self.__dict__['_branchcachedirty'] = recslen > reporecslen + else: + self.ui.debug('branch cache file was invalid\n') + + if len(self._branchrecs) < reporecslen: + self._branchrecs.extend( + '\xff' * (reporecslen - len(self._branchrecs))) + + self._branchnamesindex = dict((b, r) + for r, b in enumerate(self._branches)) + + def branch(self, rev): + """return branch name of rev, using and updating persistent cache.""" + if self._branchcachedirty is None: + self._branchcacheload() + + node = self.changelog.node(rev) + cachenode, branchidx = struct.unpack_from(bcrecfmt, self._branchrecs, + rev * bcrecsize) + if cachenode == node and branchidx < len(self._branches): + return self._branches[branchidx] + b, _close = self.changelog.branchinfo(rev) + if b in self._branchnamesindex: + branchidx = self._branchnamesindex[b] + else: + branchidx = len(self._branches) + self._branches.append(b) + self._branchnamesindex[b] = branchidx + struct.pack_into(bcrecfmt, self._branchrecs, rev * bcrecsize, + node, branchidx) + self.__dict__['_branchcachedirty'] = True + return b + + def _branchcachesave(self): + """save branch cache if it is dirty""" + if self._branchcachedirty: + self.ui.debug('writing branch cache file\n') + try: + f = self.vfs.open(bcfilename, 'w', atomictemp=True) + s = '\0'.join(self._branches) + f.write(struct.pack(bcheadfmt, bcversion, + bcheadsize + len(s), len(self._branchrecs))) + f.write(s) + f.write(self._branchrecs) + f.close() + except IOError: + pass + self.__dict__['_branchcachedirty'] = False + def known(self, nodes): nm = self.changelog.nodemap pc = self._phasecache