Submitter | Boris Feld |
---|---|
Date | July 9, 2017, 5:55 p.m. |
Message ID | <7c41d1484f53c53d5161.1499622917@FB> |
Download | mbox | patch |
Permalink | /patch/22166/ |
State | Changes Requested |
Headers | show |
Comments
On Sun, Jul 09, 2017 at 07:55:17PM +0200, Boris Feld wrote: > # HG changeset patch > # User Boris Feld <boris.feld@octobus.net> > # Date 1497674383 -7200 > # Sat Jun 17 06:39:43 2017 +0200 > # Node ID 7c41d1484f53c53d516153d19a90c331c3f87c16 > # Parent e73d330f6fdb0cf1635920f4302b61b321ec58e2 > # EXP-Topic obs-cache > obsstore: add a method to incrementally retrieve obsmarkers This one and its follow-up both look interesting, but I don't know if we have any imminent callers? > > Parsing the full obsstore is slow, so cache that depends on obsstore content > wants a way to know if the obsstore changed, and it this change was append > only. > > For this purpose we introduce an official cache-key for the obsstore. This > cache-key work in a way similar to the '(tiprev, tipnode)' pair used for the > changelog. We use the size of the obsstore file and the hash of its tail. That > way, we can check if the obsstore grew and if the content we knew is still > present in the obsstore. > > The cachekey comes with a method that only returns the obsmarkers that changed > between the last seen "cache-key" value and the current state of the > repository. That method is race free and should be used by caches. See > documentation for details. > > note: we are reading the full obsstore file from disk as part of the key > validation process. This could be avoided but we keep the implementation > simple for now. Once it is in place and running we can iterate to make it > better. > > diff -r e73d330f6fdb -r 7c41d1484f53 mercurial/obsolete.py > --- a/mercurial/obsolete.py Sun Jun 04 10:02:09 2017 -0700 > +++ b/mercurial/obsolete.py Sat Jun 17 06:39:43 2017 +0200 > @@ -70,6 +70,7 @@ > from __future__ import absolute_import > > import errno > +import hashlib > import struct > > from .i18n import _ > @@ -513,6 +514,10 @@ > # parents: (tuple of nodeid) or None, parents of precursors > # None is used when no data has been recorded > > + # how much data to read at the end of file, > + # 1024 byte should be about 10 markers in average > + _obskeyspan = 1024 > + > def __init__(self, svfs, defaultformat=_fm1version, readonly=False): > # caches for various obsolescence related cache > self.caches = {} > @@ -540,6 +545,72 @@ > > __bool__ = __nonzero__ > > + def getmarkerssince(self, previouscontentkey): > + """retrieve all new markers (since "contentkey") + updated content key > + > + This function is to be used by cache depending on obsstore content. It > + make sure cache can incrementally update themselves without fear > + obsstore stripping or race condition. If the content key is invalid > + (some obsstore content got stripped), all markers in the obsstore will > + be returned. > + > + return: (reset, obsmarkers, contentkey) > + > + :reset: boolean, True if previouscontentkey was invalided. Previously > + stored data are invalid and should be discarded. Full markers > + content is return in this case. > + > + :obsmarkers: the list of new obsmarkers. > + > + :contentkey: a key matching the content of 'previouscontentkey' + > + 'obsmarkers'. > + > + The content key is a pair of: > + > + (obsstore size, hash of last N bytes of the obsstore) > + > + It must be kept around by cache and provided again for the next > + incremental read of new obsmarkers. > + """ > + # XXX This function could avoid loading the whole data from disk (and > + # only read new markers). It currently use 'self._data' to keep the > + # code simpler. > + keysize, keyhash = previouscontentkey > + fulldata = self._data > + > + # check the existing cache key > + reset = False > + if len(fulldata) < keysize: # less data than expected, this is a strip > + reset = True > + else: > + if keysize == 0: # no obsstore > + actualhash = node.nullid > + else: > + first = max(0, keysize - self._obskeyspan) > + keydata = fulldata[first:keysize] > + actualhash = hashlib.sha1(keydata).digest() > + reset = actualhash != keyhash > + newsize = len(fulldata) > + > + # read the new data > + if reset: > + keysize = None # read all data > + elif keysize == newsize: > + # no update since last change, exist early > + return False, [], previouscontentkey > + if newsize: > + start = max(0, newsize - self._obskeyspan) > + newhash = hashlib.sha1(fulldata[start:newsize]).digest() > + __, obsmarkers = _readmarkers(fulldata, keysize, newsize) > + else: > + # obsstore is empty, use a generic hash and skip reading markers. > + newhash == node.nullid > + obsmarkers = [] > + > + # for now and for the sake of simplicity make sure obsmarkers is a list > + obsmarkers = list(obsmarkers) > + return reset, obsmarkers, (newsize, newhash) > + > @property > def readonly(self): > """True if marker creation is disabled > _______________________________________________ > Mercurial-devel mailing list > Mercurial-devel@mercurial-scm.org > https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Patch
diff -r e73d330f6fdb -r 7c41d1484f53 mercurial/obsolete.py --- a/mercurial/obsolete.py Sun Jun 04 10:02:09 2017 -0700 +++ b/mercurial/obsolete.py Sat Jun 17 06:39:43 2017 +0200 @@ -70,6 +70,7 @@ from __future__ import absolute_import import errno +import hashlib import struct from .i18n import _ @@ -513,6 +514,10 @@ # parents: (tuple of nodeid) or None, parents of precursors # None is used when no data has been recorded + # how much data to read at the end of file, + # 1024 byte should be about 10 markers in average + _obskeyspan = 1024 + def __init__(self, svfs, defaultformat=_fm1version, readonly=False): # caches for various obsolescence related cache self.caches = {} @@ -540,6 +545,72 @@ __bool__ = __nonzero__ + def getmarkerssince(self, previouscontentkey): + """retrieve all new markers (since "contentkey") + updated content key + + This function is to be used by cache depending on obsstore content. It + make sure cache can incrementally update themselves without fear + obsstore stripping or race condition. If the content key is invalid + (some obsstore content got stripped), all markers in the obsstore will + be returned. + + return: (reset, obsmarkers, contentkey) + + :reset: boolean, True if previouscontentkey was invalided. Previously + stored data are invalid and should be discarded. Full markers + content is return in this case. + + :obsmarkers: the list of new obsmarkers. + + :contentkey: a key matching the content of 'previouscontentkey' + + 'obsmarkers'. + + The content key is a pair of: + + (obsstore size, hash of last N bytes of the obsstore) + + It must be kept around by cache and provided again for the next + incremental read of new obsmarkers. + """ + # XXX This function could avoid loading the whole data from disk (and + # only read new markers). It currently use 'self._data' to keep the + # code simpler. + keysize, keyhash = previouscontentkey + fulldata = self._data + + # check the existing cache key + reset = False + if len(fulldata) < keysize: # less data than expected, this is a strip + reset = True + else: + if keysize == 0: # no obsstore + actualhash = node.nullid + else: + first = max(0, keysize - self._obskeyspan) + keydata = fulldata[first:keysize] + actualhash = hashlib.sha1(keydata).digest() + reset = actualhash != keyhash + newsize = len(fulldata) + + # read the new data + if reset: + keysize = None # read all data + elif keysize == newsize: + # no update since last change, exist early + return False, [], previouscontentkey + if newsize: + start = max(0, newsize - self._obskeyspan) + newhash = hashlib.sha1(fulldata[start:newsize]).digest() + __, obsmarkers = _readmarkers(fulldata, keysize, newsize) + else: + # obsstore is empty, use a generic hash and skip reading markers. + newhash == node.nullid + obsmarkers = [] + + # for now and for the sake of simplicity make sure obsmarkers is a list + obsmarkers = list(obsmarkers) + return reset, obsmarkers, (newsize, newhash) + @property def readonly(self): """True if marker creation is disabled