Patchwork D11094: dirstate-v2: Support appending to the same data file

login
register
mail settings
Submitter phabricator
Date July 15, 2021, 3:29 p.m.
Message ID <differential-rev-PHID-DREV-xxcjutujmgvzwqlc26hg-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/49411/
State Superseded
Headers show

Comments

phabricator - July 15, 2021, 3:29 p.m.
SimonSapin created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  For now we’re still writing the entire data every time, so appending is not
  useful yet. Later we’ll have new nodes pointing to some existing data for
  nodes and paths that haven’t changed.
  
  The decision whether to append is pseudo-random in order to make tests exercise
  both code paths. This will be replaced by a heuristic based on the amount
  of unused existing data.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D11094

AFFECTED FILES
  mercurial/dirstatemap.py
  rust/hg-core/src/dirstate_tree/dirstate_map.rs
  rust/hg-core/src/dirstate_tree/dispatch.rs
  rust/hg-core/src/dirstate_tree/on_disk.rs
  rust/hg-cpython/src/dirstate/dirstate_map.rs
  rust/hg-cpython/src/dirstate/dispatch.rs

CHANGE DETAILS




To: SimonSapin, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/rust/hg-cpython/src/dirstate/dispatch.rs b/rust/hg-cpython/src/dirstate/dispatch.rs
--- a/rust/hg-cpython/src/dirstate/dispatch.rs
+++ b/rust/hg-cpython/src/dirstate/dispatch.rs
@@ -124,8 +124,12 @@ 
         self.get_mut().pack_v1(parents, now)
     }
 
-    fn pack_v2(&mut self, now: Timestamp) -> Result<Vec<u8>, DirstateError> {
-        self.get_mut().pack_v2(now)
+    fn pack_v2(
+        &mut self,
+        now: Timestamp,
+        can_append: bool,
+    ) -> Result<(Vec<u8>, bool), DirstateError> {
+        self.get_mut().pack_v2(now, can_append)
     }
 
     fn status<'a>(
diff --git a/rust/hg-cpython/src/dirstate/dirstate_map.rs b/rust/hg-cpython/src/dirstate/dirstate_map.rs
--- a/rust/hg-cpython/src/dirstate/dirstate_map.rs
+++ b/rust/hg-cpython/src/dirstate/dirstate_map.rs
@@ -340,16 +340,23 @@ 
         }
     }
 
+    /// Returns new data together with whether that data should be appended to
+    /// the existing data file whose content is at `self.on_disk` (True),
+    /// instead of written to a new data file (False).
     def write_v2(
         &self,
-        now: PyObject
-    ) -> PyResult<PyBytes> {
+        now: PyObject,
+        can_append: bool,
+    ) -> PyResult<PyObject> {
         let now = Timestamp(now.extract(py)?);
 
         let mut inner = self.inner(py).borrow_mut();
-        let result = inner.pack_v2(now);
+        let result = inner.pack_v2(now, can_append);
         match result {
-            Ok(packed) => Ok(PyBytes::new(py, &packed)),
+            Ok((packed, append)) => {
+                let packed = PyBytes::new(py, &packed);
+                Ok((packed, append).to_py_object(py).into_object())
+            },
             Err(_) => Err(PyErr::new::<exc::OSError, _>(
                 py,
                 "Dirstate error".to_string(),
diff --git a/rust/hg-core/src/dirstate_tree/on_disk.rs b/rust/hg-core/src/dirstate_tree/on_disk.rs
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs
@@ -544,20 +544,28 @@ 
     recur(on_disk, root.root_nodes, &mut f)
 }
 
+/// Returns new data together with whether that data should be appended to the
+/// existing data file whose content is at `dirstate_map.on_disk` (true),
+/// instead of written to a new data file (false).
 pub(super) fn write(
     dirstate_map: &mut DirstateMap,
-) -> Result<Vec<u8>, DirstateError> {
-    let root_len = std::mem::size_of::<Root>();
+    can_append: bool,
+) -> Result<(Vec<u8>, bool), DirstateError> {
+    let append = can_append && dirstate_map.write_should_append();
 
     // This ignores the space for paths, and for nodes without an entry.
     // TODO: better estimate? Skip the `Vec` and write to a file directly?
-    let size_guess = root_len
+    let size_guess = std::mem::size_of::<Root>()
         + std::mem::size_of::<Node>()
             * dirstate_map.nodes_with_entry_count as usize;
-    let mut out = Vec::with_capacity(size_guess);
 
-    let root_nodes =
-        write_nodes(dirstate_map, dirstate_map.root.as_ref(), &mut out)?;
+    let mut writer = Writer {
+        dirstate_map,
+        append,
+        out: Vec::with_capacity(size_guess),
+    };
+
+    let root_nodes = writer.write_nodes(dirstate_map.root.as_ref())?;
 
     let root = Root {
         root_nodes,
@@ -567,112 +575,121 @@ 
             .into(),
         ignore_patterns_hash: dirstate_map.ignore_patterns_hash,
     };
-    out.extend(root.as_bytes());
-    Ok(out)
+    writer.out.extend(root.as_bytes());
+    Ok((writer.out, append))
+}
+
+struct Writer<'dmap, 'on_disk> {
+    dirstate_map: &'dmap DirstateMap<'on_disk>,
+    append: bool,
+    out: Vec<u8>,
 }
 
-fn write_nodes(
-    dirstate_map: &DirstateMap,
-    nodes: dirstate_map::ChildNodesRef,
-    out: &mut Vec<u8>,
-) -> Result<ChildNodes, DirstateError> {
-    // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration
-    // order. Sort to enable binary search in the written file.
-    let nodes = nodes.sorted();
-    let nodes_len = nodes.len();
+impl Writer<'_, '_> {
+    fn write_nodes(
+        &mut self,
+        nodes: dirstate_map::ChildNodesRef,
+    ) -> Result<ChildNodes, DirstateError> {
+        // `dirstate_map::ChildNodes` is a `HashMap` with undefined iteration
+        // order. Sort to enable binary search in the written file.
+        let nodes = nodes.sorted();
+        let nodes_len = nodes.len();
 
-    // First accumulate serialized nodes in a `Vec`
-    let mut on_disk_nodes = Vec::with_capacity(nodes_len);
-    for node in nodes {
-        let children = write_nodes(
-            dirstate_map,
-            node.children(dirstate_map.on_disk)?,
-            out,
-        )?;
-        let full_path = node.full_path(dirstate_map.on_disk)?;
-        let full_path = write_path(full_path.as_bytes(), out);
-        let copy_source =
-            if let Some(source) = node.copy_source(dirstate_map.on_disk)? {
-                write_path(source.as_bytes(), out)
+        // First accumulate serialized nodes in a `Vec`
+        let mut on_disk_nodes = Vec::with_capacity(nodes_len);
+        for node in nodes {
+            let children =
+                self.write_nodes(node.children(self.dirstate_map.on_disk)?)?;
+            let full_path = node.full_path(self.dirstate_map.on_disk)?;
+            let full_path = self.write_path(full_path.as_bytes());
+            let copy_source = if let Some(source) =
+                node.copy_source(self.dirstate_map.on_disk)?
+            {
+                self.write_path(source.as_bytes())
             } else {
                 PathSlice {
                     start: 0.into(),
                     len: 0.into(),
                 }
             };
-        on_disk_nodes.push(match node {
-            NodeRef::InMemory(path, node) => {
-                let (state, data) = match &node.data {
-                    dirstate_map::NodeData::Entry(entry) => (
-                        entry.state.into(),
-                        Entry {
-                            mode: entry.mode.into(),
-                            mtime: entry.mtime.into(),
-                            size: entry.size.into(),
-                        },
-                    ),
-                    dirstate_map::NodeData::CachedDirectory { mtime } => {
-                        (b'd', Entry::from_timestamp(*mtime))
+            on_disk_nodes.push(match node {
+                NodeRef::InMemory(path, node) => {
+                    let (state, data) = match &node.data {
+                        dirstate_map::NodeData::Entry(entry) => (
+                            entry.state.into(),
+                            Entry {
+                                mode: entry.mode.into(),
+                                mtime: entry.mtime.into(),
+                                size: entry.size.into(),
+                            },
+                        ),
+                        dirstate_map::NodeData::CachedDirectory { mtime } => {
+                            (b'd', Entry::from_timestamp(*mtime))
+                        }
+                        dirstate_map::NodeData::None => (
+                            b'\0',
+                            Entry {
+                                mode: 0.into(),
+                                mtime: 0.into(),
+                                size: 0.into(),
+                            },
+                        ),
+                    };
+                    Node {
+                        children,
+                        copy_source,
+                        full_path,
+                        base_name_start: u16::try_from(path.base_name_start())
+                            // Could only panic for paths over 64 KiB
+                            .expect("dirstate-v2 path length overflow")
+                            .into(),
+                        descendants_with_entry_count: node
+                            .descendants_with_entry_count
+                            .into(),
+                        tracked_descendants_count: node
+                            .tracked_descendants_count
+                            .into(),
+                        state,
+                        data,
                     }
-                    dirstate_map::NodeData::None => (
-                        b'\0',
-                        Entry {
-                            mode: 0.into(),
-                            mtime: 0.into(),
-                            size: 0.into(),
-                        },
-                    ),
-                };
-                Node {
+                }
+                NodeRef::OnDisk(node) => Node {
                     children,
                     copy_source,
                     full_path,
-                    base_name_start: u16::try_from(path.base_name_start())
-                        // Could only panic for paths over 64 KiB
-                        .expect("dirstate-v2 path length overflow")
-                        .into(),
-                    descendants_with_entry_count: node
-                        .descendants_with_entry_count
-                        .into(),
-                    tracked_descendants_count: node
-                        .tracked_descendants_count
-                        .into(),
-                    state,
-                    data,
-                }
-            }
-            NodeRef::OnDisk(node) => Node {
-                children,
-                copy_source,
-                full_path,
-                ..*node
-            },
-        })
+                    ..*node
+                },
+            })
+        }
+        // … so we can write them contiguously, after writing everything else
+        // they refer to.
+        let start = self.curent_offset();
+        let len = u32::try_from(nodes_len)
+            // Could only panic with over 4 billion nodes
+            .expect("dirstate-v2 path length overflow")
+            .into();
+        self.out.extend(on_disk_nodes.as_bytes());
+        Ok(ChildNodes { start, len })
     }
-    // … so we can write them contiguously, after writing everything else they
-    // refer to.
-    let start = curent_offset(out);
-    let len = u32::try_from(nodes_len)
-        // Could only panic with over 4 billion nodes
-        .expect("dirstate-v2 path length overflow")
-        .into();
-    out.extend(on_disk_nodes.as_bytes());
-    Ok(ChildNodes { start, len })
-}
 
-fn curent_offset(out: &Vec<u8>) -> Offset {
-    u32::try_from(out.len())
-        // Could only panic for a dirstate file larger than 4 GiB
-        .expect("dirstate-v2 offset overflow")
-        .into()
-}
+    fn curent_offset(&mut self) -> Offset {
+        let mut offset = self.out.len();
+        if self.append {
+            offset += self.dirstate_map.on_disk.len()
+        }
+        u32::try_from(offset)
+            // Could only panic for a dirstate file larger than 4 GiB
+            .expect("dirstate-v2 offset overflow")
+            .into()
+    }
 
-fn write_path(slice: &[u8], out: &mut Vec<u8>) -> PathSlice {
-    let start = curent_offset(out);
-    let len = u16::try_from(slice.len())
-        // Could only panic for paths over 64 KiB
-        .expect("dirstate-v2 path length overflow")
-        .into();
-    out.extend(slice.as_bytes());
-    PathSlice { start, len }
+    fn write_path(&mut self, slice: &[u8]) -> PathSlice {
+        let start = self.curent_offset();
+        let len = u16::try_from(slice.len())
+            // Could only panic for paths over 64 KiB
+            .expect("dirstate-v2 path length overflow")
+            .into();
+        self.out.extend(slice.as_bytes());
+        PathSlice { start, len }
+    }
 }
diff --git a/rust/hg-core/src/dirstate_tree/dispatch.rs b/rust/hg-core/src/dirstate_tree/dispatch.rs
--- a/rust/hg-core/src/dirstate_tree/dispatch.rs
+++ b/rust/hg-core/src/dirstate_tree/dispatch.rs
@@ -179,11 +179,19 @@ 
 
     /// Clear mtimes that are ambigous with `now` (similar to
     /// `clear_ambiguous_times` but for all files in the dirstate map), and
-    /// serialize bytes to write the `.hg/dirstate` file to disk in dirstate-v2
+    /// serialize bytes to write a dirstate data file to disk in dirstate-v2
     /// format.
     ///
+    /// Returns new data together with whether that data should be appended to
+    /// the existing data file whose content is at `self.on_disk` (true),
+    /// instead of written to a new data file (false).
+    ///
     /// Note: this is only supported by the tree dirstate map.
-    fn pack_v2(&mut self, now: Timestamp) -> Result<Vec<u8>, DirstateError>;
+    fn pack_v2(
+        &mut self,
+        now: Timestamp,
+        can_append: bool,
+    ) -> Result<(Vec<u8>, bool), DirstateError>;
 
     /// Run the status algorithm.
     ///
@@ -383,7 +391,11 @@ 
         self.pack(parents, now)
     }
 
-    fn pack_v2(&mut self, _now: Timestamp) -> Result<Vec<u8>, DirstateError> {
+    fn pack_v2(
+        &mut self,
+        _now: Timestamp,
+        _can_append: bool,
+    ) -> Result<(Vec<u8>, bool), DirstateError> {
         panic!(
             "should have used dirstate_tree::DirstateMap to use the v2 format"
         )
diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
@@ -468,6 +468,24 @@ 
         Ok((map, parents))
     }
 
+    /// Assuming dirstate-v2 format, returns whether the next write should
+    /// append to the existing data file that contains `self.on_disk` (true),
+    /// or create a new data file from scratch (false).
+    pub(super) fn write_should_append(&self) -> bool {
+        // Soon this will be a heuristic based on the amount of unreachable
+        // data. For now it’s pseudo-random in order to make tests exercise
+        // both code paths.
+
+        fn bad_rng() -> u32 {
+            std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .unwrap()
+                .subsec_millis()
+        }
+
+        bad_rng() % 2 == 0
+    }
+
     fn get_node<'tree>(
         &'tree self,
         path: &HgPath,
@@ -1043,8 +1061,15 @@ 
         Ok(packed)
     }
 
+    /// Returns new data together with whether that data should be appended to
+    /// the existing data file whose content is at `self.on_disk` (true),
+    /// instead of written to a new data file (false).
     #[timed]
-    fn pack_v2(&mut self, now: Timestamp) -> Result<Vec<u8>, DirstateError> {
+    fn pack_v2(
+        &mut self,
+        now: Timestamp,
+        can_append: bool,
+    ) -> Result<(Vec<u8>, bool), DirstateError> {
         // TODO: how do we want to handle this in 2038?
         let now: i32 = now.0.try_into().expect("time overflow");
         let mut paths = Vec::new();
@@ -1063,7 +1088,7 @@ 
 
         self.clear_known_ambiguous_mtimes(&paths)?;
 
-        on_disk::write(self)
+        on_disk::write(self, can_append)
     }
 
     fn status<'a>(
diff --git a/mercurial/dirstatemap.py b/mercurial/dirstatemap.py
--- a/mercurial/dirstatemap.py
+++ b/mercurial/dirstatemap.py
@@ -657,8 +657,30 @@ 
             return self._rustmap
 
         def write(self, tr, st, now):
-            if self._use_dirstate_v2:
-                packed = self._rustmap.write_v2(now)
+            if not self._use_dirstate_v2:
+                p1, p2 = self.parents()
+                packed = self._rustmap.write_v1(p1, p2, now)
+                st.write(packed)
+                st.close()
+                return
+
+            # We can only append to an existing data file if there is one
+            can_append = self.docket.uuid is not None
+            packed, append = self._rustmap.write_v2(now, can_append)
+            if append:
+                docket = self.docket
+                with self._opener(docket.data_filename(), b'ab') as fp:
+                    assert fp.tell() == docket.data_size, (
+                        fp.tell(),
+                        docket.data_size,
+                    )
+                    written = fp.write(packed)
+                    assert written == len(packed)
+                docket.data_size += len(packed)
+                docket.parents = self.parents()
+                st.write(docket.serialize())
+                st.close()
+            else:
                 old_docket = self.docket
                 new_docket = docketmod.DirstateDocket.with_new_uuid(
                     self.parents(), len(packed)
@@ -674,11 +696,8 @@ 
                 if old_docket.uuid:
                     self._opener.unlink(old_docket.data_filename())
                 self._docket = new_docket
-            else:
-                p1, p2 = self.parents()
-                packed = self._rustmap.write_v1(p1, p2, now)
-                st.write(packed)
-                st.close()
+            # Reload from the newly-written file
+            util.clearcachedproperty(self, b"_rustmap")
             self._dirtyparents = False
 
         @propertycache