Submitter | Boris Feld |
---|---|
Date | Aug. 18, 2018, 9:27 a.m. |
Message ID | <9c7224a86df8f316dbfb.1534584440@FB-lair> |
Download | mbox | patch |
Permalink | /patch/33870/ |
State | Accepted |
Headers | show |
Comments
On Sat, Aug 18, 2018 at 2:27 AM Boris Feld <boris.feld@octobus.net> wrote: > # HG changeset patch > # User Boris Feld <boris.feld@octobus.net> > # Date 1534380638 -7200 > # Thu Aug 16 02:50:38 2018 +0200 > # Node ID 9c7224a86df8f316dbfb006ba8e81fc3ce831303 > # Parent 07af069da30de0c6ae0ce8229a0ddd7c22c89313 > # EXP-Topic sparse-snapshot > # Available At https://bitbucket.org/octobus/mercurial-devel/ > # hg pull https://bitbucket.org/octobus/mercurial-devel/ -r > 9c7224a86df8 > revlog: move the good delta heuristic in revlogutils.deltas > > This is logic about building delta chains. We move it with related logic. > The > only user of this logic moved in this module already. > I'm all for sending the code move patches piecemeal to make things easier to review. But I think we should squash patches 3-5 on landing so code history is preserved. > > diff --git a/mercurial/revlog.py b/mercurial/revlog.py > --- a/mercurial/revlog.py > +++ b/mercurial/revlog.py > @@ -37,7 +37,6 @@ from .i18n import _ > from .revlogutils.constants import ( > FLAG_GENERALDELTA, > FLAG_INLINE_DATA, > - LIMIT_DELTA2TEXT, > REVIDX_DEFAULT_FLAGS, > REVIDX_ELLIPSIS, > REVIDX_EXTSTORED, > @@ -1897,96 +1896,6 @@ class revlog(object): > > return compressor.decompress(data) > > - def _isgooddeltainfo(self, deltainfo, revinfo): > - """Returns True if the given delta is good. Good means that it is > within > - the disk span, disk size, and chain length bounds that we know to > be > - performant.""" > - if deltainfo is None: > - return False > - > - # - 'deltainfo.distance' is the distance from the base revision -- > - # bounding it limits the amount of I/O we need to do. > - # - 'deltainfo.compresseddeltalen' is the sum of the total size of > - # deltas we need to apply -- bounding it limits the amount of > CPU > - # we consume. > - > - if self._sparserevlog: > - # As sparse-read will be used, we can consider that the > distance, > - # instead of being the span of the whole chunk, > - # is the span of the largest read chunk > - base = deltainfo.base > - > - if base != nullrev: > - deltachain = self._deltachain(base)[0] > - else: > - deltachain = [] > - > - # search for the first non-snapshot revision > - for idx, r in enumerate(deltachain): > - if not self.issnapshot(r): > - break > - deltachain = deltachain[idx:] > - chunks = deltautil.slicechunk(self, deltachain, deltainfo) > - all_span = [deltautil.segmentspan(self, revs, deltainfo) > - for revs in chunks] > - distance = max(all_span) > - else: > - distance = deltainfo.distance > - > - textlen = revinfo.textlen > - defaultmax = textlen * 4 > - maxdist = self._maxdeltachainspan > - if not maxdist: > - maxdist = distance # ensure the conditional pass > - maxdist = max(maxdist, defaultmax) > - if self._sparserevlog and maxdist < self._srmingapsize: > - # In multiple place, we are ignoring irrelevant data range > below a > - # certain size. Be also apply this tradeoff here and relax > span > - # constraint for small enought content. > - maxdist = self._srmingapsize > - > - # Bad delta from read span: > - # > - # If the span of data read is larger than the maximum allowed. > - if maxdist < distance: > - return False > - > - # Bad delta from new delta size: > - # > - # If the delta size is larger than the target text, storing the > - # delta will be inefficient. > - if textlen < deltainfo.deltalen: > - return False > - > - # Bad delta from cumulated payload size: > - # > - # If the sum of delta get larger than K * target text length. > - if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen: > - return False > - > - # Bad delta from chain length: > - # > - # If the number of delta in the chain gets too high. > - if self._maxchainlen and self._maxchainlen < deltainfo.chainlen: > - return False > - > - # bad delta from intermediate snapshot size limit > - # > - # If an intermediate snapshot size is higher than the limit. > The > - # limit exist to prevent endless chain of intermediate delta to > be > - # created. > - if (deltainfo.snapshotdepth is not None and > - (textlen >> deltainfo.snapshotdepth) < > deltainfo.deltalen): > - return False > - > - # bad delta if new intermediate snapshot is larger than the > previous > - # snapshot > - if (deltainfo.snapshotdepth > - and self.length(deltainfo.base) < deltainfo.deltalen): > - return False > - > - return True > - > def _addrevision(self, node, rawtext, transaction, link, p1, p2, > flags, > cachedelta, ifh, dfh, alwayscache=False, > deltacomputer=None): > diff --git a/mercurial/revlogutils/constants.py > b/mercurial/revlogutils/constants.py > --- a/mercurial/revlogutils/constants.py > +++ b/mercurial/revlogutils/constants.py > @@ -41,6 +41,3 @@ REVIDX_FLAGS_ORDER = [ > REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER) > # bitmark for flags that could cause rawdata content change > REVIDX_RAWTEXT_CHANGING_FLAGS = REVIDX_ISCENSORED | REVIDX_EXTSTORED > - > -# maximum <delta-chain-data>/<revision-text-length> ratio > -LIMIT_DELTA2TEXT = 2 > diff --git a/mercurial/revlogutils/deltas.py > b/mercurial/revlogutils/deltas.py > --- a/mercurial/revlogutils/deltas.py > +++ b/mercurial/revlogutils/deltas.py > @@ -19,7 +19,6 @@ from ..node import ( > from ..i18n import _ > > from .constants import ( > - LIMIT_DELTA2TEXT, > REVIDX_ISCENSORED, > REVIDX_RAWTEXT_CHANGING_FLAGS, > ) > @@ -36,6 +35,9 @@ from .. import ( > RevlogError = error.RevlogError > CensoredNodeError = error.CensoredNodeError > > +# maximum <delta-chain-data>/<revision-text-length> ratio > +LIMIT_DELTA2TEXT = 2 > + > class _testrevlog(object): > """minimalist fake revlog to use in doctests""" > > @@ -447,6 +449,97 @@ class _deltainfo(object): > compresseddeltalen = attr.ib() > snapshotdepth = attr.ib() > > +def isgooddeltainfo(revlog, deltainfo, revinfo): > + """Returns True if the given delta is good. Good means that it is > within > + the disk span, disk size, and chain length bounds that we know to be > + performant.""" > + if deltainfo is None: > + return False > + > + # - 'deltainfo.distance' is the distance from the base revision -- > + # bounding it limits the amount of I/O we need to do. > + # - 'deltainfo.compresseddeltalen' is the sum of the total size of > + # deltas we need to apply -- bounding it limits the amount of CPU > + # we consume. > + > + if revlog._sparserevlog: > + # As sparse-read will be used, we can consider that the distance, > + # instead of being the span of the whole chunk, > + # is the span of the largest read chunk > + base = deltainfo.base > + > + if base != nullrev: > + deltachain = revlog._deltachain(base)[0] > + else: > + deltachain = [] > + > + # search for the first non-snapshot revision > + for idx, r in enumerate(deltachain): > + if not revlog.issnapshot(r): > + break > + deltachain = deltachain[idx:] > + chunks = slicechunk(revlog, deltachain, deltainfo) > + all_span = [segmentspan(revlog, revs, deltainfo) > + for revs in chunks] > + distance = max(all_span) > + else: > + distance = deltainfo.distance > + > + textlen = revinfo.textlen > + defaultmax = textlen * 4 > + maxdist = revlog._maxdeltachainspan > + if not maxdist: > + maxdist = distance # ensure the conditional pass > + maxdist = max(maxdist, defaultmax) > + if revlog._sparserevlog and maxdist < revlog._srmingapsize: > + # In multiple place, we are ignoring irrelevant data range below a > + # certain size. Be also apply this tradeoff here and relax span > + # constraint for small enought content. > + maxdist = revlog._srmingapsize > + > + # Bad delta from read span: > + # > + # If the span of data read is larger than the maximum allowed. > + if maxdist < distance: > + return False > + > + # Bad delta from new delta size: > + # > + # If the delta size is larger than the target text, storing the > + # delta will be inefficient. > + if textlen < deltainfo.deltalen: > + return False > + > + # Bad delta from cumulated payload size: > + # > + # If the sum of delta get larger than K * target text length. > + if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen: > + return False > + > + # Bad delta from chain length: > + # > + # If the number of delta in the chain gets too high. > + if (revlog._maxchainlen > + and revlog._maxchainlen < deltainfo.chainlen): > + return False > + > + # bad delta from intermediate snapshot size limit > + # > + # If an intermediate snapshot size is higher than the limit. The > + # limit exist to prevent endless chain of intermediate delta to be > + # created. > + if (deltainfo.snapshotdepth is not None and > + (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen): > + return False > + > + # bad delta if new intermediate snapshot is larger than the previous > + # snapshot > + if (deltainfo.snapshotdepth > + and revlog.length(deltainfo.base) < deltainfo.deltalen): > + return False > + > + return True > + > class deltacomputer(object): > def __init__(self, revlog): > self.revlog = revlog > @@ -632,7 +725,7 @@ class deltacomputer(object): > if revlog.flags(candidaterev) & > REVIDX_RAWTEXT_CHANGING_FLAGS: > continue > candidatedelta = self._builddeltainfo(revinfo, > candidaterev, fh) > - if revlog._isgooddeltainfo(candidatedelta, revinfo): > + if isgooddeltainfo(self.revlog, candidatedelta, revinfo): > nominateddeltas.append(candidatedelta) > if nominateddeltas: > deltainfo = min(nominateddeltas, key=lambda x: x.deltalen) >
Patch
diff --git a/mercurial/revlog.py b/mercurial/revlog.py --- a/mercurial/revlog.py +++ b/mercurial/revlog.py @@ -37,7 +37,6 @@ from .i18n import _ from .revlogutils.constants import ( FLAG_GENERALDELTA, FLAG_INLINE_DATA, - LIMIT_DELTA2TEXT, REVIDX_DEFAULT_FLAGS, REVIDX_ELLIPSIS, REVIDX_EXTSTORED, @@ -1897,96 +1896,6 @@ class revlog(object): return compressor.decompress(data) - def _isgooddeltainfo(self, deltainfo, revinfo): - """Returns True if the given delta is good. Good means that it is within - the disk span, disk size, and chain length bounds that we know to be - performant.""" - if deltainfo is None: - return False - - # - 'deltainfo.distance' is the distance from the base revision -- - # bounding it limits the amount of I/O we need to do. - # - 'deltainfo.compresseddeltalen' is the sum of the total size of - # deltas we need to apply -- bounding it limits the amount of CPU - # we consume. - - if self._sparserevlog: - # As sparse-read will be used, we can consider that the distance, - # instead of being the span of the whole chunk, - # is the span of the largest read chunk - base = deltainfo.base - - if base != nullrev: - deltachain = self._deltachain(base)[0] - else: - deltachain = [] - - # search for the first non-snapshot revision - for idx, r in enumerate(deltachain): - if not self.issnapshot(r): - break - deltachain = deltachain[idx:] - chunks = deltautil.slicechunk(self, deltachain, deltainfo) - all_span = [deltautil.segmentspan(self, revs, deltainfo) - for revs in chunks] - distance = max(all_span) - else: - distance = deltainfo.distance - - textlen = revinfo.textlen - defaultmax = textlen * 4 - maxdist = self._maxdeltachainspan - if not maxdist: - maxdist = distance # ensure the conditional pass - maxdist = max(maxdist, defaultmax) - if self._sparserevlog and maxdist < self._srmingapsize: - # In multiple place, we are ignoring irrelevant data range below a - # certain size. Be also apply this tradeoff here and relax span - # constraint for small enought content. - maxdist = self._srmingapsize - - # Bad delta from read span: - # - # If the span of data read is larger than the maximum allowed. - if maxdist < distance: - return False - - # Bad delta from new delta size: - # - # If the delta size is larger than the target text, storing the - # delta will be inefficient. - if textlen < deltainfo.deltalen: - return False - - # Bad delta from cumulated payload size: - # - # If the sum of delta get larger than K * target text length. - if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen: - return False - - # Bad delta from chain length: - # - # If the number of delta in the chain gets too high. - if self._maxchainlen and self._maxchainlen < deltainfo.chainlen: - return False - - # bad delta from intermediate snapshot size limit - # - # If an intermediate snapshot size is higher than the limit. The - # limit exist to prevent endless chain of intermediate delta to be - # created. - if (deltainfo.snapshotdepth is not None and - (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen): - return False - - # bad delta if new intermediate snapshot is larger than the previous - # snapshot - if (deltainfo.snapshotdepth - and self.length(deltainfo.base) < deltainfo.deltalen): - return False - - return True - def _addrevision(self, node, rawtext, transaction, link, p1, p2, flags, cachedelta, ifh, dfh, alwayscache=False, deltacomputer=None): diff --git a/mercurial/revlogutils/constants.py b/mercurial/revlogutils/constants.py --- a/mercurial/revlogutils/constants.py +++ b/mercurial/revlogutils/constants.py @@ -41,6 +41,3 @@ REVIDX_FLAGS_ORDER = [ REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER) # bitmark for flags that could cause rawdata content change REVIDX_RAWTEXT_CHANGING_FLAGS = REVIDX_ISCENSORED | REVIDX_EXTSTORED - -# maximum <delta-chain-data>/<revision-text-length> ratio -LIMIT_DELTA2TEXT = 2 diff --git a/mercurial/revlogutils/deltas.py b/mercurial/revlogutils/deltas.py --- a/mercurial/revlogutils/deltas.py +++ b/mercurial/revlogutils/deltas.py @@ -19,7 +19,6 @@ from ..node import ( from ..i18n import _ from .constants import ( - LIMIT_DELTA2TEXT, REVIDX_ISCENSORED, REVIDX_RAWTEXT_CHANGING_FLAGS, ) @@ -36,6 +35,9 @@ from .. import ( RevlogError = error.RevlogError CensoredNodeError = error.CensoredNodeError +# maximum <delta-chain-data>/<revision-text-length> ratio +LIMIT_DELTA2TEXT = 2 + class _testrevlog(object): """minimalist fake revlog to use in doctests""" @@ -447,6 +449,97 @@ class _deltainfo(object): compresseddeltalen = attr.ib() snapshotdepth = attr.ib() +def isgooddeltainfo(revlog, deltainfo, revinfo): + """Returns True if the given delta is good. Good means that it is within + the disk span, disk size, and chain length bounds that we know to be + performant.""" + if deltainfo is None: + return False + + # - 'deltainfo.distance' is the distance from the base revision -- + # bounding it limits the amount of I/O we need to do. + # - 'deltainfo.compresseddeltalen' is the sum of the total size of + # deltas we need to apply -- bounding it limits the amount of CPU + # we consume. + + if revlog._sparserevlog: + # As sparse-read will be used, we can consider that the distance, + # instead of being the span of the whole chunk, + # is the span of the largest read chunk + base = deltainfo.base + + if base != nullrev: + deltachain = revlog._deltachain(base)[0] + else: + deltachain = [] + + # search for the first non-snapshot revision + for idx, r in enumerate(deltachain): + if not revlog.issnapshot(r): + break + deltachain = deltachain[idx:] + chunks = slicechunk(revlog, deltachain, deltainfo) + all_span = [segmentspan(revlog, revs, deltainfo) + for revs in chunks] + distance = max(all_span) + else: + distance = deltainfo.distance + + textlen = revinfo.textlen + defaultmax = textlen * 4 + maxdist = revlog._maxdeltachainspan + if not maxdist: + maxdist = distance # ensure the conditional pass + maxdist = max(maxdist, defaultmax) + if revlog._sparserevlog and maxdist < revlog._srmingapsize: + # In multiple place, we are ignoring irrelevant data range below a + # certain size. Be also apply this tradeoff here and relax span + # constraint for small enought content. + maxdist = revlog._srmingapsize + + # Bad delta from read span: + # + # If the span of data read is larger than the maximum allowed. + if maxdist < distance: + return False + + # Bad delta from new delta size: + # + # If the delta size is larger than the target text, storing the + # delta will be inefficient. + if textlen < deltainfo.deltalen: + return False + + # Bad delta from cumulated payload size: + # + # If the sum of delta get larger than K * target text length. + if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen: + return False + + # Bad delta from chain length: + # + # If the number of delta in the chain gets too high. + if (revlog._maxchainlen + and revlog._maxchainlen < deltainfo.chainlen): + return False + + # bad delta from intermediate snapshot size limit + # + # If an intermediate snapshot size is higher than the limit. The + # limit exist to prevent endless chain of intermediate delta to be + # created. + if (deltainfo.snapshotdepth is not None and + (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen): + return False + + # bad delta if new intermediate snapshot is larger than the previous + # snapshot + if (deltainfo.snapshotdepth + and revlog.length(deltainfo.base) < deltainfo.deltalen): + return False + + return True + class deltacomputer(object): def __init__(self, revlog): self.revlog = revlog @@ -632,7 +725,7 @@ class deltacomputer(object): if revlog.flags(candidaterev) & REVIDX_RAWTEXT_CHANGING_FLAGS: continue candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh) - if revlog._isgooddeltainfo(candidatedelta, revinfo): + if isgooddeltainfo(self.revlog, candidatedelta, revinfo): nominateddeltas.append(candidatedelta) if nominateddeltas: deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)