Patchwork [05,of,12] revlog: move the good delta heuristic in revlogutils.deltas

login
register
mail settings
Submitter Boris Feld
Date Aug. 18, 2018, 9:27 a.m.
Message ID <9c7224a86df8f316dbfb.1534584440@FB-lair>
Download mbox | patch
Permalink /patch/33870/
State Accepted
Headers show

Comments

Boris Feld - Aug. 18, 2018, 9:27 a.m.
# HG changeset patch
# User Boris Feld <boris.feld@octobus.net>
# Date 1534380638 -7200
#      Thu Aug 16 02:50:38 2018 +0200
# Node ID 9c7224a86df8f316dbfb006ba8e81fc3ce831303
# Parent  07af069da30de0c6ae0ce8229a0ddd7c22c89313
# EXP-Topic sparse-snapshot
# Available At https://bitbucket.org/octobus/mercurial-devel/
#              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r 9c7224a86df8
revlog: move the good delta heuristic in revlogutils.deltas

This is logic about building delta chains. We move it with related logic. The
only user of this logic moved in this module already.
Gregory Szorc - Aug. 24, 2018, 5:55 p.m.
On Sat, Aug 18, 2018 at 2:27 AM Boris Feld <boris.feld@octobus.net> wrote:

> # HG changeset patch
> # User Boris Feld <boris.feld@octobus.net>
> # Date 1534380638 -7200
> #      Thu Aug 16 02:50:38 2018 +0200
> # Node ID 9c7224a86df8f316dbfb006ba8e81fc3ce831303
> # Parent  07af069da30de0c6ae0ce8229a0ddd7c22c89313
> # EXP-Topic sparse-snapshot
> # Available At https://bitbucket.org/octobus/mercurial-devel/
> #              hg pull https://bitbucket.org/octobus/mercurial-devel/ -r
> 9c7224a86df8
> revlog: move the good delta heuristic in revlogutils.deltas
>
> This is logic about building delta chains. We move it with related logic.
> The
> only user of this logic moved in this module already.
>

I'm all for sending the code move patches piecemeal to make things easier
to review. But I think we should squash patches 3-5 on landing so code
history is preserved.


>
> diff --git a/mercurial/revlog.py b/mercurial/revlog.py
> --- a/mercurial/revlog.py
> +++ b/mercurial/revlog.py
> @@ -37,7 +37,6 @@ from .i18n import _
>  from .revlogutils.constants import (
>      FLAG_GENERALDELTA,
>      FLAG_INLINE_DATA,
> -    LIMIT_DELTA2TEXT,
>      REVIDX_DEFAULT_FLAGS,
>      REVIDX_ELLIPSIS,
>      REVIDX_EXTSTORED,
> @@ -1897,96 +1896,6 @@ class revlog(object):
>
>          return compressor.decompress(data)
>
> -    def _isgooddeltainfo(self, deltainfo, revinfo):
> -        """Returns True if the given delta is good. Good means that it is
> within
> -        the disk span, disk size, and chain length bounds that we know to
> be
> -        performant."""
> -        if deltainfo is None:
> -            return False
> -
> -        # - 'deltainfo.distance' is the distance from the base revision --
> -        #   bounding it limits the amount of I/O we need to do.
> -        # - 'deltainfo.compresseddeltalen' is the sum of the total size of
> -        #   deltas we need to apply -- bounding it limits the amount of
> CPU
> -        #   we consume.
> -
> -        if self._sparserevlog:
> -            # As sparse-read will be used, we can consider that the
> distance,
> -            # instead of being the span of the whole chunk,
> -            # is the span of the largest read chunk
> -            base = deltainfo.base
> -
> -            if base != nullrev:
> -                deltachain = self._deltachain(base)[0]
> -            else:
> -                deltachain = []
> -
> -            # search for the first non-snapshot revision
> -            for idx, r in enumerate(deltachain):
> -                if not self.issnapshot(r):
> -                    break
> -            deltachain = deltachain[idx:]
> -            chunks = deltautil.slicechunk(self, deltachain, deltainfo)
> -            all_span = [deltautil.segmentspan(self, revs, deltainfo)
> -                        for revs in chunks]
> -            distance = max(all_span)
> -        else:
> -            distance = deltainfo.distance
> -
> -        textlen = revinfo.textlen
> -        defaultmax = textlen * 4
> -        maxdist = self._maxdeltachainspan
> -        if not maxdist:
> -            maxdist = distance # ensure the conditional pass
> -        maxdist = max(maxdist, defaultmax)
> -        if self._sparserevlog and maxdist < self._srmingapsize:
> -            # In multiple place, we are ignoring irrelevant data range
> below a
> -            # certain size. Be also apply this tradeoff here and relax
> span
> -            # constraint for small enought content.
> -            maxdist = self._srmingapsize
> -
> -        # Bad delta from read span:
> -        #
> -        #   If the span of data read is larger than the maximum allowed.
> -        if maxdist < distance:
> -            return False
> -
> -        # Bad delta from new delta size:
> -        #
> -        #   If the delta size is larger than the target text, storing the
> -        #   delta will be inefficient.
> -        if textlen < deltainfo.deltalen:
> -            return False
> -
> -        # Bad delta from cumulated payload size:
> -        #
> -        #   If the sum of delta get larger than K * target text length.
> -        if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
> -            return False
> -
> -        # Bad delta from chain length:
> -        #
> -        #   If the number of delta in the chain gets too high.
> -        if self._maxchainlen and  self._maxchainlen < deltainfo.chainlen:
> -            return False
> -
> -        # bad delta from intermediate snapshot size limit
> -        #
> -        #   If an intermediate snapshot size is higher than the limit.
> The
> -        #   limit exist to prevent endless chain of intermediate delta to
> be
> -        #   created.
> -        if (deltainfo.snapshotdepth is not None and
> -                (textlen >> deltainfo.snapshotdepth) <
> deltainfo.deltalen):
> -            return False
> -
> -        # bad delta if new intermediate snapshot is larger than the
> previous
> -        # snapshot
> -        if (deltainfo.snapshotdepth
> -                and self.length(deltainfo.base) < deltainfo.deltalen):
> -            return False
> -
> -        return True
> -
>      def _addrevision(self, node, rawtext, transaction, link, p1, p2,
> flags,
>                       cachedelta, ifh, dfh, alwayscache=False,
>                       deltacomputer=None):
> diff --git a/mercurial/revlogutils/constants.py
> b/mercurial/revlogutils/constants.py
> --- a/mercurial/revlogutils/constants.py
> +++ b/mercurial/revlogutils/constants.py
> @@ -41,6 +41,3 @@ REVIDX_FLAGS_ORDER = [
>  REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER)
>  # bitmark for flags that could cause rawdata content change
>  REVIDX_RAWTEXT_CHANGING_FLAGS = REVIDX_ISCENSORED | REVIDX_EXTSTORED
> -
> -# maximum <delta-chain-data>/<revision-text-length> ratio
> -LIMIT_DELTA2TEXT = 2
> diff --git a/mercurial/revlogutils/deltas.py
> b/mercurial/revlogutils/deltas.py
> --- a/mercurial/revlogutils/deltas.py
> +++ b/mercurial/revlogutils/deltas.py
> @@ -19,7 +19,6 @@ from ..node import (
>  from ..i18n import _
>
>  from .constants import (
> -    LIMIT_DELTA2TEXT,
>      REVIDX_ISCENSORED,
>      REVIDX_RAWTEXT_CHANGING_FLAGS,
>  )
> @@ -36,6 +35,9 @@ from .. import (
>  RevlogError = error.RevlogError
>  CensoredNodeError = error.CensoredNodeError
>
> +# maximum <delta-chain-data>/<revision-text-length> ratio
> +LIMIT_DELTA2TEXT = 2
> +
>  class _testrevlog(object):
>      """minimalist fake revlog to use in doctests"""
>
> @@ -447,6 +449,97 @@ class _deltainfo(object):
>      compresseddeltalen = attr.ib()
>      snapshotdepth = attr.ib()
>
> +def isgooddeltainfo(revlog, deltainfo, revinfo):
> +    """Returns True if the given delta is good. Good means that it is
> within
> +    the disk span, disk size, and chain length bounds that we know to be
> +    performant."""
> +    if deltainfo is None:
> +        return False
> +
> +    # - 'deltainfo.distance' is the distance from the base revision --
> +    #   bounding it limits the amount of I/O we need to do.
> +    # - 'deltainfo.compresseddeltalen' is the sum of the total size of
> +    #   deltas we need to apply -- bounding it limits the amount of CPU
> +    #   we consume.
> +
> +    if revlog._sparserevlog:
> +        # As sparse-read will be used, we can consider that the distance,
> +        # instead of being the span of the whole chunk,
> +        # is the span of the largest read chunk
> +        base = deltainfo.base
> +
> +        if base != nullrev:
> +            deltachain = revlog._deltachain(base)[0]
> +        else:
> +            deltachain = []
> +
> +        # search for the first non-snapshot revision
> +        for idx, r in enumerate(deltachain):
> +            if not revlog.issnapshot(r):
> +                break
> +        deltachain = deltachain[idx:]
> +        chunks = slicechunk(revlog, deltachain, deltainfo)
> +        all_span = [segmentspan(revlog, revs, deltainfo)
> +                    for revs in chunks]
> +        distance = max(all_span)
> +    else:
> +        distance = deltainfo.distance
> +
> +    textlen = revinfo.textlen
> +    defaultmax = textlen * 4
> +    maxdist = revlog._maxdeltachainspan
> +    if not maxdist:
> +        maxdist = distance # ensure the conditional pass
> +    maxdist = max(maxdist, defaultmax)
> +    if revlog._sparserevlog and maxdist < revlog._srmingapsize:
> +        # In multiple place, we are ignoring irrelevant data range below a
> +        # certain size. Be also apply this tradeoff here and relax span
> +        # constraint for small enought content.
> +        maxdist = revlog._srmingapsize
> +
> +    # Bad delta from read span:
> +    #
> +    #   If the span of data read is larger than the maximum allowed.
> +    if maxdist < distance:
> +        return False
> +
> +    # Bad delta from new delta size:
> +    #
> +    #   If the delta size is larger than the target text, storing the
> +    #   delta will be inefficient.
> +    if textlen < deltainfo.deltalen:
> +        return False
> +
> +    # Bad delta from cumulated payload size:
> +    #
> +    #   If the sum of delta get larger than K * target text length.
> +    if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
> +        return False
> +
> +    # Bad delta from chain length:
> +    #
> +    #   If the number of delta in the chain gets too high.
> +    if (revlog._maxchainlen
> +            and revlog._maxchainlen < deltainfo.chainlen):
> +        return False
> +
> +    # bad delta from intermediate snapshot size limit
> +    #
> +    #   If an intermediate snapshot size is higher than the limit.  The
> +    #   limit exist to prevent endless chain of intermediate delta to be
> +    #   created.
> +    if (deltainfo.snapshotdepth is not None and
> +            (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
> +        return False
> +
> +    # bad delta if new intermediate snapshot is larger than the previous
> +    # snapshot
> +    if (deltainfo.snapshotdepth
> +            and revlog.length(deltainfo.base) < deltainfo.deltalen):
> +        return False
> +
> +    return True
> +
>  class deltacomputer(object):
>      def __init__(self, revlog):
>          self.revlog = revlog
> @@ -632,7 +725,7 @@ class deltacomputer(object):
>                  if revlog.flags(candidaterev) &
> REVIDX_RAWTEXT_CHANGING_FLAGS:
>                      continue
>                  candidatedelta = self._builddeltainfo(revinfo,
> candidaterev, fh)
> -                if revlog._isgooddeltainfo(candidatedelta, revinfo):
> +                if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
>                      nominateddeltas.append(candidatedelta)
>              if nominateddeltas:
>                  deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)
>

Patch

diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -37,7 +37,6 @@  from .i18n import _
 from .revlogutils.constants import (
     FLAG_GENERALDELTA,
     FLAG_INLINE_DATA,
-    LIMIT_DELTA2TEXT,
     REVIDX_DEFAULT_FLAGS,
     REVIDX_ELLIPSIS,
     REVIDX_EXTSTORED,
@@ -1897,96 +1896,6 @@  class revlog(object):
 
         return compressor.decompress(data)
 
-    def _isgooddeltainfo(self, deltainfo, revinfo):
-        """Returns True if the given delta is good. Good means that it is within
-        the disk span, disk size, and chain length bounds that we know to be
-        performant."""
-        if deltainfo is None:
-            return False
-
-        # - 'deltainfo.distance' is the distance from the base revision --
-        #   bounding it limits the amount of I/O we need to do.
-        # - 'deltainfo.compresseddeltalen' is the sum of the total size of
-        #   deltas we need to apply -- bounding it limits the amount of CPU
-        #   we consume.
-
-        if self._sparserevlog:
-            # As sparse-read will be used, we can consider that the distance,
-            # instead of being the span of the whole chunk,
-            # is the span of the largest read chunk
-            base = deltainfo.base
-
-            if base != nullrev:
-                deltachain = self._deltachain(base)[0]
-            else:
-                deltachain = []
-
-            # search for the first non-snapshot revision
-            for idx, r in enumerate(deltachain):
-                if not self.issnapshot(r):
-                    break
-            deltachain = deltachain[idx:]
-            chunks = deltautil.slicechunk(self, deltachain, deltainfo)
-            all_span = [deltautil.segmentspan(self, revs, deltainfo)
-                        for revs in chunks]
-            distance = max(all_span)
-        else:
-            distance = deltainfo.distance
-
-        textlen = revinfo.textlen
-        defaultmax = textlen * 4
-        maxdist = self._maxdeltachainspan
-        if not maxdist:
-            maxdist = distance # ensure the conditional pass
-        maxdist = max(maxdist, defaultmax)
-        if self._sparserevlog and maxdist < self._srmingapsize:
-            # In multiple place, we are ignoring irrelevant data range below a
-            # certain size. Be also apply this tradeoff here and relax span
-            # constraint for small enought content.
-            maxdist = self._srmingapsize
-
-        # Bad delta from read span:
-        #
-        #   If the span of data read is larger than the maximum allowed.
-        if maxdist < distance:
-            return False
-
-        # Bad delta from new delta size:
-        #
-        #   If the delta size is larger than the target text, storing the
-        #   delta will be inefficient.
-        if textlen < deltainfo.deltalen:
-            return False
-
-        # Bad delta from cumulated payload size:
-        #
-        #   If the sum of delta get larger than K * target text length.
-        if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
-            return False
-
-        # Bad delta from chain length:
-        #
-        #   If the number of delta in the chain gets too high.
-        if self._maxchainlen and  self._maxchainlen < deltainfo.chainlen:
-            return False
-
-        # bad delta from intermediate snapshot size limit
-        #
-        #   If an intermediate snapshot size is higher than the limit.  The
-        #   limit exist to prevent endless chain of intermediate delta to be
-        #   created.
-        if (deltainfo.snapshotdepth is not None and
-                (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
-            return False
-
-        # bad delta if new intermediate snapshot is larger than the previous
-        # snapshot
-        if (deltainfo.snapshotdepth
-                and self.length(deltainfo.base) < deltainfo.deltalen):
-            return False
-
-        return True
-
     def _addrevision(self, node, rawtext, transaction, link, p1, p2, flags,
                      cachedelta, ifh, dfh, alwayscache=False,
                      deltacomputer=None):
diff --git a/mercurial/revlogutils/constants.py b/mercurial/revlogutils/constants.py
--- a/mercurial/revlogutils/constants.py
+++ b/mercurial/revlogutils/constants.py
@@ -41,6 +41,3 @@  REVIDX_FLAGS_ORDER = [
 REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER)
 # bitmark for flags that could cause rawdata content change
 REVIDX_RAWTEXT_CHANGING_FLAGS = REVIDX_ISCENSORED | REVIDX_EXTSTORED
-
-# maximum <delta-chain-data>/<revision-text-length> ratio
-LIMIT_DELTA2TEXT = 2
diff --git a/mercurial/revlogutils/deltas.py b/mercurial/revlogutils/deltas.py
--- a/mercurial/revlogutils/deltas.py
+++ b/mercurial/revlogutils/deltas.py
@@ -19,7 +19,6 @@  from ..node import (
 from ..i18n import _
 
 from .constants import (
-    LIMIT_DELTA2TEXT,
     REVIDX_ISCENSORED,
     REVIDX_RAWTEXT_CHANGING_FLAGS,
 )
@@ -36,6 +35,9 @@  from .. import (
 RevlogError = error.RevlogError
 CensoredNodeError = error.CensoredNodeError
 
+# maximum <delta-chain-data>/<revision-text-length> ratio
+LIMIT_DELTA2TEXT = 2
+
 class _testrevlog(object):
     """minimalist fake revlog to use in doctests"""
 
@@ -447,6 +449,97 @@  class _deltainfo(object):
     compresseddeltalen = attr.ib()
     snapshotdepth = attr.ib()
 
+def isgooddeltainfo(revlog, deltainfo, revinfo):
+    """Returns True if the given delta is good. Good means that it is within
+    the disk span, disk size, and chain length bounds that we know to be
+    performant."""
+    if deltainfo is None:
+        return False
+
+    # - 'deltainfo.distance' is the distance from the base revision --
+    #   bounding it limits the amount of I/O we need to do.
+    # - 'deltainfo.compresseddeltalen' is the sum of the total size of
+    #   deltas we need to apply -- bounding it limits the amount of CPU
+    #   we consume.
+
+    if revlog._sparserevlog:
+        # As sparse-read will be used, we can consider that the distance,
+        # instead of being the span of the whole chunk,
+        # is the span of the largest read chunk
+        base = deltainfo.base
+
+        if base != nullrev:
+            deltachain = revlog._deltachain(base)[0]
+        else:
+            deltachain = []
+
+        # search for the first non-snapshot revision
+        for idx, r in enumerate(deltachain):
+            if not revlog.issnapshot(r):
+                break
+        deltachain = deltachain[idx:]
+        chunks = slicechunk(revlog, deltachain, deltainfo)
+        all_span = [segmentspan(revlog, revs, deltainfo)
+                    for revs in chunks]
+        distance = max(all_span)
+    else:
+        distance = deltainfo.distance
+
+    textlen = revinfo.textlen
+    defaultmax = textlen * 4
+    maxdist = revlog._maxdeltachainspan
+    if not maxdist:
+        maxdist = distance # ensure the conditional pass
+    maxdist = max(maxdist, defaultmax)
+    if revlog._sparserevlog and maxdist < revlog._srmingapsize:
+        # In multiple place, we are ignoring irrelevant data range below a
+        # certain size. Be also apply this tradeoff here and relax span
+        # constraint for small enought content.
+        maxdist = revlog._srmingapsize
+
+    # Bad delta from read span:
+    #
+    #   If the span of data read is larger than the maximum allowed.
+    if maxdist < distance:
+        return False
+
+    # Bad delta from new delta size:
+    #
+    #   If the delta size is larger than the target text, storing the
+    #   delta will be inefficient.
+    if textlen < deltainfo.deltalen:
+        return False
+
+    # Bad delta from cumulated payload size:
+    #
+    #   If the sum of delta get larger than K * target text length.
+    if textlen * LIMIT_DELTA2TEXT < deltainfo.compresseddeltalen:
+        return False
+
+    # Bad delta from chain length:
+    #
+    #   If the number of delta in the chain gets too high.
+    if (revlog._maxchainlen
+            and revlog._maxchainlen < deltainfo.chainlen):
+        return False
+
+    # bad delta from intermediate snapshot size limit
+    #
+    #   If an intermediate snapshot size is higher than the limit.  The
+    #   limit exist to prevent endless chain of intermediate delta to be
+    #   created.
+    if (deltainfo.snapshotdepth is not None and
+            (textlen >> deltainfo.snapshotdepth) < deltainfo.deltalen):
+        return False
+
+    # bad delta if new intermediate snapshot is larger than the previous
+    # snapshot
+    if (deltainfo.snapshotdepth
+            and revlog.length(deltainfo.base) < deltainfo.deltalen):
+        return False
+
+    return True
+
 class deltacomputer(object):
     def __init__(self, revlog):
         self.revlog = revlog
@@ -632,7 +725,7 @@  class deltacomputer(object):
                 if revlog.flags(candidaterev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
                     continue
                 candidatedelta = self._builddeltainfo(revinfo, candidaterev, fh)
-                if revlog._isgooddeltainfo(candidatedelta, revinfo):
+                if isgooddeltainfo(self.revlog, candidatedelta, revinfo):
                     nominateddeltas.append(candidatedelta)
             if nominateddeltas:
                 deltainfo = min(nominateddeltas, key=lambda x: x.deltalen)