Patchwork D10624: revlogv2: introduce a very basic docket file

login
register
mail settings
Submitter phabricator
Date May 3, 2021, 12:08 p.m.
Message ID <differential-rev-PHID-DREV-5yatf6vxn5cma7kfuvm4-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/48945/
State Superseded
Headers show

Comments

phabricator - May 3, 2021, 12:08 p.m.
marmoute created this revision.
Herald added a reviewer: indygreg.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  This is the first step toward using a docket file in revlogv2. Right now the
  docket is very basic and only stores the version number (which is -also-
  stored in the index file…) and the other files have fixed names. This new
  implementation breaks transactionality… but there are no tests checking
  transactionality for revlogv2… So I take this as an opportunity to start
  small. There is no usage of revlogv2 outside of tests anyway.
  
  The docket keeps the `.i` naming used by the index of previous versions to
  preserve a unique entry point. We could decide to use a different name and
  look it up first, or to fully rework this in a future "store" version.
  However, that does not seem necessary right now.
  
  We will re-introduce transactionality (and associated testing…) in later
  changesets.
  
  A long list of TODOs has been added to the relevant comment.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D10624

AFFECTED FILES
  mercurial/changelog.py
  mercurial/configitems.py
  mercurial/revlog.py
  mercurial/revlogutils/constants.py
  mercurial/revlogutils/docket.py
  mercurial/store.py

CHANGE DETAILS




To: marmoute, indygreg, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/mercurial/store.py b/mercurial/store.py
--- a/mercurial/store.py
+++ b/mercurial/store.py
@@ -389,7 +389,7 @@ 
 ]
 
 REVLOG_FILES_MAIN_EXT = (b'.i', b'i.tmpcensored')
-REVLOG_FILES_OTHER_EXT = (b'.d', b'.n', b'.nd', b'd.tmpcensored')
+REVLOG_FILES_OTHER_EXT = (b'.idx', b'.d', b'.n', b'.nd', b'd.tmpcensored')
 # files that are "volatile" and might change between listing and streaming
 #
 # note: the ".nd" file are nodemap data and won't "change" but they might be
@@ -397,7 +397,7 @@ 
 REVLOG_FILES_VOLATILE_EXT = (b'.n', b'.nd')
 
 # some exception to the above matching
-EXCLUDED = re.compile(b'.*undo\.[^/]+\.nd?$')
+EXCLUDED = re.compile(b'.*undo\.[^/]+\.(nd?|i)$')
 
 
 def is_revlog(f, kind, st):
@@ -407,7 +407,7 @@ 
 
 
 def revlog_type(f):
-    if f.endswith(REVLOG_FILES_MAIN_EXT):
+    if f.endswith(REVLOG_FILES_MAIN_EXT) and EXCLUDED.match(f) is None:
         return FILEFLAGS_REVLOG_MAIN
     elif f.endswith(REVLOG_FILES_OTHER_EXT) and EXCLUDED.match(f) is None:
         t = FILETYPE_FILELOG_OTHER
diff --git a/mercurial/revlogutils/docket.py b/mercurial/revlogutils/docket.py
new file mode 100644
--- /dev/null
+++ b/mercurial/revlogutils/docket.py
@@ -0,0 +1,80 @@ 
+# docket - code related to revlog "docket"
+#
+# Copyright 2021 Pierre-Yves David <pierre-yves.david@octobus.net>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+### Revlog docket file
+#
+# The revlog is stored on disk using multiple files:
+#
+# * a small docket file, containing metadata and pointers,
+#
+# * an index file, containing fixed width information about revisions,
+#
+# * a data file, containing variable width data for these revisions,
+
+from __future__ import absolute_import
+
+import struct
+
+from . import (
+    constants,
+)
+
+# Docket format
+#
+# * 4 bytes: revlog version
+#          |   This is mandatory as docket must be compatible with the previous
+#          |   revlog index header.
+S_HEADER = struct.Struct(constants.INDEX_HEADER.format)
+
+
+class RevlogDocket(object):
+    """metadata associated with revlog"""
+
+    def __init__(self, revlog, version_header=None):
+        self._version_header = version_header
+        self._dirty = False
+        self._radix = revlog.radix
+        self._path = revlog._docket_file
+        self._opener = revlog.opener
+
+    def index_filepath(self):
+        """file path to the current index file associated to this docket"""
+        # very simplistic version at first
+        return b"%s.idx" % self._radix
+
+    def write(self, transaction):
+        """write the modification of disk if any
+
+        This make the new content visible to all process"""
+        if self._dirty:
+            transaction.addbackup(self._path, location=b'store')
+            with self._opener(self._path, mode=b'w', atomictemp=True) as f:
+                f.write(self._serialize())
+            self._dirty = False
+
+    def _serialize(self):
+        return S_HEADER.pack(self._version_header)
+
+
+def default_docket(revlog, version_header):
+    """given a revlog version a new docket object for the given revlog"""
+    if (version_header & 0xFFFF) != constants.REVLOGV2:
+        return None
+    docket = RevlogDocket(revlog, version_header=version_header)
+    docket._dirty = True
+    return docket
+
+
+def parse_docket(revlog, data):
+    """given some docket data return a docket object for the given revlog"""
+    header = S_HEADER.unpack(data[: S_HEADER.size])
+    (version_header,) = header
+    docket = RevlogDocket(
+        revlog,
+        version_header=version_header,
+    )
+    return docket
diff --git a/mercurial/revlogutils/constants.py b/mercurial/revlogutils/constants.py
--- a/mercurial/revlogutils/constants.py
+++ b/mercurial/revlogutils/constants.py
@@ -133,20 +133,22 @@ 
         b'inline': _no,
         b'generaldelta': _no,
         b'sidedata': False,
+        b'docket': False,
     },
     REVLOGV1: {
         b'inline': _from_flag(FLAG_INLINE_DATA),
         b'generaldelta': _from_flag(FLAG_GENERALDELTA),
         b'sidedata': False,
+        b'docket': False,
     },
     REVLOGV2: {
-        # There is a bug in the transaction handling when going from an
-        # inline revlog to a separate index and data file. Turn it off until
-        # it's fixed, since v2 revlogs sometimes get rewritten on exchange.
-        # See issue6485
+        # The point of inline-revlog is to reduce the number of files used in
+        # the store. Using a docket defeat this purpose. So we needs other
+        # means to reduce the number of files for revlogv2.
         b'inline': _no,
         b'generaldelta': _yes,
         b'sidedata': True,
+        b'docket': True,
     },
 }
 
diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -75,6 +75,7 @@ 
 )
 from .revlogutils import (
     deltas as deltautil,
+    docket as docketutil,
     flagutil,
     nodemap as nodemaputil,
     revlogv0,
@@ -317,6 +318,7 @@ 
 
         self.radix = radix
 
+        self._docket_file = None
         self._indexfile = None
         self._datafile = None
         self._nodemap_file = None
@@ -344,6 +346,7 @@ 
         self._maxchainlen = None
         self._deltabothparents = True
         self.index = None
+        self._docket = None
         self._nodemap_docket = None
         # Mapping of partial identifiers to full nodes.
         self._pcache = {}
@@ -505,8 +508,23 @@ 
         self._generaldelta = features[b'generaldelta'](self._format_flags)
         self.hassidedata = features[b'sidedata']
 
-        index_data = entry_data
-        self._indexfile = entry_point
+        if not features[b'docket']:
+            self._indexfile = entry_point
+            index_data = entry_data
+        else:
+            self._docket_file = entry_point
+            if self._initempty:
+                self._docket = docketutil.default_docket(self, header)
+            else:
+                self._docket = docketutil.parse_docket(self, entry_data)
+            self._indexfile = self._docket.index_filepath()
+            index_data = self._get_data(self._indexfile, mmapindexthreshold)
+            self._inline = False
+            # generaldelta implied by version 2 revlogs.
+            self._generaldelta = True
+            # the logic for persistent nodemap will be dealt with within the
+            # main docket, so disable it for now.
+            self._nodemap_file = None
 
         if self.postfix is None or self.postfix == b'a':
             self._datafile = b'%s.d' % self.radix
@@ -2053,6 +2071,8 @@ 
                     self._writinghandles = (ifh, dfh)
                     try:
                         yield
+                        if self._docket is not None:
+                            self._docket.write(transaction)
                     finally:
                         self._writinghandles = None
                 finally:
@@ -3126,9 +3146,7 @@ 
     def rewrite_sidedata(self, transaction, helpers, startrev, endrev):
         if not self.hassidedata:
             return
-        # inline are not yet supported because they suffer from an issue when
-        # rewriting them (since it's not an append-only operation).
-        # See issue6485.
+        # revlog formats with sidedata support does not support inline
         assert not self._inline
         if not helpers[1] and not helpers[2]:
             # Nothing to generate or remove
diff --git a/mercurial/configitems.py b/mercurial/configitems.py
--- a/mercurial/configitems.py
+++ b/mercurial/configitems.py
@@ -1145,6 +1145,14 @@ 
 )
 # "out of experimental" todo list.
 #
+# * stop storing version information in the index (it is already in the docket)
+# * properly hide uncommitted content to other process
+# * expose transaction content hooks during pre-commit validation
+# * include management of a persistent nodemap in the main docket
+# * enforce a "no-truncate" policy for mmap safety
+#      - for censoring operation
+#      - for stripping operation
+#      - for rollback operation
 # * to grow a docket file to at least store the last offset of the data
 #   file when rewriting sidedata.
 # * need a way of dealing with garbage data if we allow rewriting
@@ -1153,6 +1161,7 @@ 
 #   keeping references to the affected revlogs, especially memory-wise when
 #   rewriting sidedata.
 # * Also... compress the sidedata? (this should be coming very soon)
+# * introduce a proper solution to reduce the number of filelog related files.
 coreconfigitem(
     b'experimental',
     b'revlogv2',
diff --git a/mercurial/changelog.py b/mercurial/changelog.py
--- a/mercurial/changelog.py
+++ b/mercurial/changelog.py
@@ -445,6 +445,8 @@ 
 
     def delayupdate(self, tr):
         """delay visibility of index updates to other readers"""
+        if self._docket is not None:
+            return
 
         if not self._delayed:
             if len(self) == 0: