Patchwork D10632: revlogv2: also keep track of the size of the "data" file

login
register
mail settings
Submitter phabricator
Date May 3, 2021, 12:08 p.m.
Message ID <differential-rev-PHID-DREV-dgr3v35opj55ts45csqe-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/48951/
State New
Headers show

Comments

phabricator - May 3, 2021, 12:08 p.m.
marmoute created this revision.
Herald added a reviewer: indygreg.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  This is useful to make sure we always start writing at the right location,
  without effort.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D10632

AFFECTED FILES
  mercurial/configitems.py
  mercurial/revlog.py
  mercurial/revlogutils/docket.py

CHANGE DETAILS




To: marmoute, indygreg, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/mercurial/revlogutils/docket.py b/mercurial/revlogutils/docket.py
--- a/mercurial/revlogutils/docket.py
+++ b/mercurial/revlogutils/docket.py
@@ -32,9 +32,11 @@ 
 # * 4 bytes: revlog version
 #          |   This is mandatory as docket must be compatible with the previous
 #          |   revlog index header.
-# * 8 bytes: size of index data
-# * 8 bytes: pending size of index data
-S_HEADER = struct.Struct(constants.INDEX_HEADER.format + 'LL')
+# * 8 bytes: size of index-data
+# * 8 bytes: pending size of index-data
+# * 8 bytes: size of data
+# * 8 bytes: pending size of data
+S_HEADER = struct.Struct(constants.INDEX_HEADER.format + 'LLLL')
 
 
 class RevlogDocket(object):
@@ -47,6 +49,8 @@ 
         version_header=None,
         index_end=0,
         pending_index_end=0,
+        data_end=0,
+        pending_data_end=0,
     ):
         self._version_header = version_header
         self._read_only = bool(use_pending)
@@ -54,14 +58,19 @@ 
         self._radix = revlog.radix
         self._path = revlog._docket_file
         self._opener = revlog.opener
-        # this assert should be True as long as we have a single index filename
+        # these asserts should hold as long as we have a single index filename
         assert index_end <= pending_index_end
+        assert data_end <= pending_data_end
         self._initial_index_end = index_end
         self._pending_index_end = pending_index_end
+        self._initial_data_end = data_end
+        self._pending_data_end = pending_data_end
         if use_pending:
             self._index_end = self._pending_index_end
+            self._data_end = self._pending_data_end
         else:
             self._index_end = self._initial_index_end
+            self._data_end = self._initial_data_end
 
     def index_filepath(self):
         """file path to the current index file associated to this docket"""
@@ -78,6 +87,16 @@ 
             self._index_end = new_size
             self._dirty = True
 
+    @property
+    def data_end(self):
+        return self._data_end
+
+    @data_end.setter
+    def data_end(self, new_size):
+        if new_size != self._data_end:
+            self._data_end = new_size
+            self._dirty = True
+
     def write(self, transaction, pending=False, stripping=False):
         """write the modification of disk if any
 
@@ -102,15 +121,19 @@ 
     def _serialize(self, pending=False):
         if pending:
             official_index_end = self._initial_index_end
+            official_data_end = self._initial_data_end
         else:
             official_index_end = self._index_end
+            official_data_end = self._data_end
 
         # this assert should be True as long as we have a single index filename
-        assert official_index_end <= self._index_end
+        assert official_data_end <= self._data_end
         data = (
             self._version_header,
             official_index_end,
             self._index_end,
+            official_data_end,
+            self._data_end,
         )
         return S_HEADER.pack(*data)
 
@@ -127,12 +150,18 @@ 
 def parse_docket(revlog, data, use_pending=False):
     """given some docket data return a docket object for the given revlog"""
     header = S_HEADER.unpack(data[: S_HEADER.size])
-    version_header, index_size, pending_index_size = header
+    version_header = header[0]
+    index_size = header[1]
+    pending_index_size = header[2]
+    data_size = header[3]
+    pending_data_size = header[4]
     docket = RevlogDocket(
         revlog,
         use_pending=use_pending,
         version_header=version_header,
         index_end=index_size,
         pending_index_end=pending_index_size,
+        data_end=data_size,
+        pending_data_end=pending_data_size,
     )
     return docket
diff --git a/mercurial/revlog.py b/mercurial/revlog.py
--- a/mercurial/revlog.py
+++ b/mercurial/revlog.py
@@ -2088,7 +2088,10 @@ 
             if not self._inline:
                 try:
                     dfh = self._datafp(b"r+")
-                    dfh.seek(0, os.SEEK_END)
+                    if self._docket is None:
+                        dfh.seek(0, os.SEEK_END)
+                    else:
+                        dfh.seek(self._docket.data_end, os.SEEK_SET)
                 except IOError as inst:
                     if inst.errno != errno.ENOENT:
                         raise
@@ -2455,16 +2458,10 @@ 
         to `n - 1`'s sidedata being written after `n`'s data.
 
         TODO cache this in a docket file before getting out of experimental."""
-        if self._format_version != REVLOGV2:
+        if self._docket is None:
             return self.end(prev)
-
-        offset = 0
-        for rev, entry in enumerate(self.index):
-            sidedata_end = entry[8] + entry[9]
-            # Sidedata for a previous rev has potentially been written after
-            # this rev's end, so take the max.
-            offset = max(self.end(rev), offset, sidedata_end)
-        return offset
+        else:
+            return self._docket.data_end
 
     def _writeentry(self, transaction, entry, data, link, offset, sidedata):
         # Files opened in a+ mode have inconsistent behavior on various
@@ -2488,7 +2485,10 @@ 
         else:
             ifh.seek(self._docket.index_end, os.SEEK_SET)
         if dfh:
-            dfh.seek(0, os.SEEK_END)
+            if self._docket is None:
+                dfh.seek(0, os.SEEK_END)
+            else:
+                dfh.seek(self._docket.data_end, os.SEEK_SET)
 
         curr = len(self) - 1
         if not self._inline:
@@ -2511,6 +2511,7 @@ 
             self._enforceinlinesize(transaction)
         if self._docket is not None:
             self._docket.index_end = self._writinghandles[0].tell()
+            self._docket.data_end = self._writinghandles[1].tell()
 
         nodemaputil.setup_persistent_nodemap(transaction, self)
 
@@ -2673,18 +2674,19 @@ 
             return
 
         # first truncate the files on disk
-        end = self.start(rev)
+        data_end = self.start(rev)
         if not self._inline:
-            transaction.add(self._datafile, end)
+            transaction.add(self._datafile, data_end)
             end = rev * self.index.entry_size
         else:
-            end += rev * self.index.entry_size
+            end = data_end + (rev * self.index.entry_size)
 
         transaction.add(self._indexfile, end)
         if self._docket is not None:
             # XXX we could, leverage the docket while stripping. However it is
             # not powerfull enough at the time of this comment
             self._docket.index_end = end
+            self._docket.data_end = data_end
             self._docket.write(transaction, stripping=True)
 
         # then reset internal state in memory to forget those revisions
@@ -3210,7 +3212,11 @@ 
         # append the new sidedata
         with self._writing(transaction):
             ifh, dfh = self._writinghandles
-            dfh.seek(0, os.SEEK_END)
+            if self._docket is not None:
+                dfh.seek(self._docket.data_end, os.SEEK_SET)
+            else:
+                dfh.seek(0, os.SEEK_END)
+
             current_offset = dfh.tell()
             for rev in range(startrev, endrev + 1):
                 entry = self.index[rev]
@@ -3242,6 +3248,8 @@ 
                 dfh.write(serialized_sidedata)
                 new_entries.append(entry)
                 current_offset += len(serialized_sidedata)
+                if self._docket is not None:
+                    self._docket.data_end = dfh.tell()
 
             # rewrite the new index entries
             ifh.seek(startrev * self.index.entry_size)
diff --git a/mercurial/configitems.py b/mercurial/configitems.py
--- a/mercurial/configitems.py
+++ b/mercurial/configitems.py
@@ -1151,8 +1151,6 @@ 
 #      - for stripping operation
 #      - for rollback operation
 # * proper streaming (race free) of the docker file
-# * to grow a docket file to at least store the last offset of the data
-#   file when rewriting sidedata.
 # * need a way of dealing with garbage data if we allow rewriting
 #   *existing* sidedata.
 # * Exchange-wise, we will also need to do something more efficient than