Patchwork D11097: dirstate-v2: Add heuristic for when to a new data file

login
register
mail settings
Submitter phabricator
Date July 15, 2021, 3:29 p.m.
Message ID <differential-rev-PHID-DREV-jqmbff6ujl6femx6nr5z-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/49410/
State Superseded
Headers show

Comments

phabricator - July 15, 2021, 3:29 p.m.
SimonSapin created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  … instead of appending to the existing one. This is based on keeping track
  of how much of the existing data is not used anymore.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D11097

AFFECTED FILES
  rust/hg-core/src/dirstate_tree/dirstate_map.rs
  rust/hg-core/src/dirstate_tree/on_disk.rs

CHANGE DETAILS




To: SimonSapin, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/rust/hg-core/src/dirstate_tree/on_disk.rs b/rust/hg-core/src/dirstate_tree/on_disk.rs
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs
@@ -283,6 +283,8 @@ 
         return Ok(DirstateMap::empty(on_disk));
     }
     let root = read_root(on_disk)?;
+    // Each append writes a new `Root`, so it’s never reused
+    let unreachable_bytes = std::mem::size_of::<Root>();
     let dirstate_map = DirstateMap {
         on_disk,
         root: dirstate_map::ChildNodes::OnDisk(read_nodes(
@@ -292,6 +294,7 @@ 
         nodes_with_entry_count: root.nodes_with_entry_count.get(),
         nodes_with_copy_source_count: root.nodes_with_copy_source_count.get(),
         ignore_patterns_hash: root.ignore_patterns_hash,
+        unreachable_bytes,
     };
     Ok(dirstate_map)
 }
diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
@@ -44,6 +44,9 @@ 
 
     /// See on_disk::Header
     pub(super) ignore_patterns_hash: on_disk::IgnorePatternsHash,
+
+    /// How many bytes of `on_disk` are not used anymore
+    pub(super) unreachable_bytes: usize,
 }
 
 /// Using a plain `HgPathBuf` of the full path from the repository root as a
@@ -118,9 +121,10 @@ 
         }
     }
 
-    pub(super) fn make_mut(
+    fn make_mut(
         &mut self,
         on_disk: &'on_disk [u8],
+        unreachable_bytes: &mut usize,
     ) -> Result<
         &mut FastHashMap<NodeKey<'on_disk>, Node<'on_disk>>,
         DirstateV2ParseError,
@@ -128,6 +132,8 @@ 
         match self {
             ChildNodes::InMemory(nodes) => Ok(nodes),
             ChildNodes::OnDisk(nodes) => {
+                *unreachable_bytes +=
+                    std::mem::size_of_val::<[on_disk::Node]>(nodes);
                 let nodes = nodes
                     .iter()
                     .map(|node| {
@@ -406,6 +412,7 @@ 
             nodes_with_entry_count: 0,
             nodes_with_copy_source_count: 0,
             ignore_patterns_hash: [0; on_disk::IGNORE_PATTERNS_HASH_LEN],
+            unreachable_bytes: 0,
         }
     }
 
@@ -436,6 +443,7 @@ 
                 let tracked = entry.state.is_tracked();
                 let node = Self::get_or_insert_node(
                     map.on_disk,
+                    &mut map.unreachable_bytes,
                     &mut map.root,
                     path,
                     WithBasename::to_cow_borrowed,
@@ -472,18 +480,8 @@ 
     /// append to the existing data file that contains `self.on_disk` (true),
     /// or create a new data file from scratch (false).
     pub(super) fn write_should_append(&self) -> bool {
-        // Soon this will be a heuristic based on the amount of unreachable
-        // data. For now it’s pseudo-random in order to make tests exercise
-        // both code paths.
-
-        fn bad_rng() -> u32 {
-            std::time::SystemTime::now()
-                .duration_since(std::time::UNIX_EPOCH)
-                .unwrap()
-                .subsec_millis()
-        }
-
-        bad_rng() % 2 == 0
+        // Append if more than half of the existing data is still relevant
+        self.unreachable_bytes <= (self.on_disk.len() / 2)
     }
 
     fn get_node<'tree>(
@@ -514,6 +512,7 @@ 
     /// other fields while the returned borrow is still valid
     fn get_node_mut<'tree>(
         on_disk: &'on_disk [u8],
+        unreachable_bytes: &mut usize,
         root: &'tree mut ChildNodes<'on_disk>,
         path: &HgPath,
     ) -> Result<Option<&'tree mut Node<'on_disk>>, DirstateV2ParseError> {
@@ -522,7 +521,9 @@ 
         let mut component =
             components.next().expect("expected at least one components");
         loop {
-            if let Some(child) = children.make_mut(on_disk)?.get_mut(component)
+            if let Some(child) = children
+                .make_mut(on_disk, unreachable_bytes)?
+                .get_mut(component)
             {
                 if let Some(next_component) = components.next() {
                     component = next_component;
@@ -542,6 +543,7 @@ 
     ) -> Result<&'tree mut Node<'on_disk>, DirstateV2ParseError> {
         Self::get_or_insert_node(
             self.on_disk,
+            &mut self.unreachable_bytes,
             &mut self.root,
             path,
             WithBasename::to_cow_owned,
@@ -549,8 +551,9 @@ 
         )
     }
 
-    pub(super) fn get_or_insert_node<'tree, 'path>(
+    fn get_or_insert_node<'tree, 'path>(
         on_disk: &'on_disk [u8],
+        unreachable_bytes: &mut usize,
         root: &'tree mut ChildNodes<'on_disk>,
         path: &'path HgPath,
         to_cow: impl Fn(
@@ -569,7 +572,7 @@ 
             // map already contains that key, without introducing double
             // lookup?
             let child_node = child_nodes
-                .make_mut(on_disk)?
+                .make_mut(on_disk, unreachable_bytes)?
                 .entry(to_cow(ancestor_path))
                 .or_default();
             if let Some(next) = inclusive_ancestor_paths.next() {
@@ -598,6 +601,7 @@ 
 
         let node = Self::get_or_insert_node(
             self.on_disk,
+            &mut self.unreachable_bytes,
             &mut self.root,
             path,
             WithBasename::to_cow_owned,
@@ -681,6 +685,7 @@ 
         for path in paths {
             if let Some(node) = Self::get_node_mut(
                 self.on_disk,
+                &mut self.unreachable_bytes,
                 &mut self.root,
                 path.as_ref(),
             )? {
@@ -712,6 +717,12 @@ 
             Ok(None)
         })
     }
+
+    fn count_dropped_path(unreachable_bytes: &mut usize, path: &Cow<HgPath>) {
+        if let Cow::Borrowed(path) = path {
+            *unreachable_bytes += path.len()
+        }
+    }
 }
 
 /// Like `Iterator::filter_map`, but over a fallible iterator of `Result`s.
@@ -844,13 +855,14 @@ 
         ///   removed a node in `nodes`.
         fn recur<'on_disk>(
             on_disk: &'on_disk [u8],
+            unreachable_bytes: &mut usize,
             nodes: &mut ChildNodes<'on_disk>,
             path: &HgPath,
         ) -> Result<Option<(Dropped, bool)>, DirstateV2ParseError> {
             let (first_path_component, rest_of_path) =
                 path.split_first_component();
-            let node = if let Some(node) =
-                nodes.make_mut(on_disk)?.get_mut(first_path_component)
+            let nodes = nodes.make_mut(on_disk, unreachable_bytes)?;
+            let node = if let Some(node) = nodes.get_mut(first_path_component)
             {
                 node
             } else {
@@ -858,9 +870,12 @@ 
             };
             let dropped;
             if let Some(rest) = rest_of_path {
-                if let Some((d, removed)) =
-                    recur(on_disk, &mut node.children, rest)?
-                {
+                if let Some((d, removed)) = recur(
+                    on_disk,
+                    unreachable_bytes,
+                    &mut node.children,
+                    rest,
+                )? {
                     dropped = d;
                     if dropped.had_entry {
                         node.descendants_with_entry_count -= 1;
@@ -884,6 +899,9 @@ 
                 if had_entry {
                     node.data = NodeData::None
                 }
+                if let Some(source) = &node.copy_source {
+                    DirstateMap::count_dropped_path(unreachable_bytes, source)
+                }
                 dropped = Dropped {
                     was_tracked: node
                         .data
@@ -899,14 +917,22 @@ 
                 && node.copy_source.is_none()
                 && node.children.is_empty();
             if remove {
-                nodes.make_mut(on_disk)?.remove(first_path_component);
+                let (key, _) =
+                    nodes.remove_entry(first_path_component).unwrap();
+                DirstateMap::count_dropped_path(
+                    unreachable_bytes,
+                    key.full_path(),
+                )
             }
             Ok(Some((dropped, remove)))
         }
 
-        if let Some((dropped, _removed)) =
-            recur(self.on_disk, &mut self.root, filename)?
-        {
+        if let Some((dropped, _removed)) = recur(
+            self.on_disk,
+            &mut self.unreachable_bytes,
+            &mut self.root,
+            filename,
+        )? {
             if dropped.had_entry {
                 self.nodes_with_entry_count -= 1
             }
@@ -926,9 +952,12 @@ 
         now: i32,
     ) -> Result<(), DirstateV2ParseError> {
         for filename in filenames {
-            if let Some(node) =
-                Self::get_node_mut(self.on_disk, &mut self.root, &filename)?
-            {
+            if let Some(node) = Self::get_node_mut(
+                self.on_disk,
+                &mut self.unreachable_bytes,
+                &mut self.root,
+                &filename,
+            )? {
                 if let NodeData::Entry(entry) = &mut node.data {
                     entry.clear_ambiguous_mtime(now);
                 }
@@ -1144,16 +1173,20 @@ 
         key: &HgPath,
     ) -> Result<Option<HgPathBuf>, DirstateV2ParseError> {
         let count = &mut self.nodes_with_copy_source_count;
-        Ok(
-            Self::get_node_mut(self.on_disk, &mut self.root, key)?.and_then(
-                |node| {
-                    if node.copy_source.is_some() {
-                        *count -= 1
-                    }
-                    node.copy_source.take().map(Cow::into_owned)
-                },
-            ),
-        )
+        let unreachable_bytes = &mut self.unreachable_bytes;
+        Ok(Self::get_node_mut(
+            self.on_disk,
+            unreachable_bytes,
+            &mut self.root,
+            key,
+        )?
+        .and_then(|node| {
+            if let Some(source) = &node.copy_source {
+                *count -= 1;
+                Self::count_dropped_path(unreachable_bytes, source);
+            }
+            node.copy_source.take().map(Cow::into_owned)
+        }))
     }
 
     fn copy_map_insert(
@@ -1163,6 +1196,7 @@ 
     ) -> Result<Option<HgPathBuf>, DirstateV2ParseError> {
         let node = Self::get_or_insert_node(
             self.on_disk,
+            &mut self.unreachable_bytes,
             &mut self.root,
             &key,
             WithBasename::to_cow_owned,