Patchwork D10826: dirstate-v2: Skip readdir in status based on directory mtime

login
register
mail settings
Submitter phabricator
Date June 1, 2021, 4:52 p.m.
Message ID <differential-rev-PHID-DREV-rhbtlxzn4pxhdpovu4mx-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/49146/
State Superseded
Headers show

Comments

phabricator - June 1, 2021, 4:52 p.m.
SimonSapin created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  When calling `read_dir` during `status` and the directory is found to be
  eligible for caching (see code comments), write the directory’s mtime to the
  dirstate. The presence of a directory mtime in the dirstate is meaningful
  and indicates eligibility.
  
  When an eligible directory mtime is found in the dirstate and `stat()` shows
  that the mtime has not changed, `status` can skip calling `read_dir` again
  and instead rely on the names of child nodes in the dirstate tree.
  
  The `tempfile` crate is used to create a temporary file in order to use its
  modification time as "current time" with the same truncation as other files
  and directories would have in their own modification time.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D10826

AFFECTED FILES
  rust/hg-core/Cargo.toml
  rust/hg-core/src/dirstate_tree/dirstate_map.rs
  rust/hg-core/src/dirstate_tree/on_disk.rs
  rust/hg-core/src/dirstate_tree/status.rs

CHANGE DETAILS




To: SimonSapin, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/rust/hg-core/src/dirstate_tree/status.rs b/rust/hg-core/src/dirstate_tree/status.rs
--- a/rust/hg-core/src/dirstate_tree/status.rs
+++ b/rust/hg-core/src/dirstate_tree/status.rs
@@ -2,8 +2,11 @@ 
 use crate::dirstate_tree::dirstate_map::BorrowedPath;
 use crate::dirstate_tree::dirstate_map::ChildNodesRef;
 use crate::dirstate_tree::dirstate_map::DirstateMap;
+use crate::dirstate_tree::dirstate_map::NodeData;
 use crate::dirstate_tree::dirstate_map::NodeRef;
 use crate::dirstate_tree::on_disk::DirstateV2ParseError;
+use crate::dirstate_tree::on_disk::Timestamp;
+use crate::dirstate_tree::path_with_basename::WithBasename;
 use crate::matchers::get_ignore_function;
 use crate::matchers::Matcher;
 use crate::utils::files::get_bytes_from_os_string;
@@ -18,10 +21,12 @@ 
 use crate::StatusOptions;
 use micro_timer::timed;
 use rayon::prelude::*;
+use std::borrow::Cow;
 use std::io;
 use std::path::Path;
 use std::path::PathBuf;
 use std::sync::Mutex;
+use std::time::SystemTime;
 
 /// Returns the status of the working directory compared to its parent
 /// changeset.
@@ -52,19 +57,45 @@ 
         options,
         matcher,
         ignore_fn,
-        outcome: Mutex::new(DirstateStatus::default()),
+        outcome: Default::default(),
+        cached_directory_mtimes_to_add: Default::default(),
+        filesystem_time_at_status_start: filesystem_now(&root_dir).ok(),
     };
     let is_at_repo_root = true;
     let hg_path = &BorrowedPath::OnDisk(HgPath::new(""));
     let has_ignored_ancestor = false;
+    let root_cached_mtime = None;
+    let root_dir_metadata = None;
+    // If the path we have for the repository root is a symlink, do follow it.
+    // (As opposed to symlinks within the working directory which are not
+    // followed, using `std::fs::symlink_metadata`.)
     common.traverse_fs_directory_and_dirstate(
         has_ignored_ancestor,
         dmap.root.as_ref(),
         hg_path,
         &root_dir,
+        root_dir_metadata,
+        root_cached_mtime,
         is_at_repo_root,
     )?;
-    Ok((common.outcome.into_inner().unwrap(), warnings))
+    let outcome = common.outcome.into_inner().unwrap();
+    let to_add = common.cached_directory_mtimes_to_add.into_inner().unwrap();
+    for (path, mtime) in &to_add {
+        let node = DirstateMap::get_or_insert_node(
+            dmap.on_disk,
+            &mut dmap.root,
+            path,
+            WithBasename::to_cow_owned,
+            |_| {},
+        )?;
+        match &node.data {
+            NodeData::Entry(_) => {} // Don’t overwrite an entry
+            NodeData::CachedDirectory { .. } | NodeData::None => {
+                node.data = NodeData::CachedDirectory { mtime: *mtime }
+            }
+        }
+    }
+    Ok((outcome, warnings))
 }
 
 /// Bag of random things needed by various parts of the algorithm. Reduces the
@@ -75,6 +106,12 @@ 
     matcher: &'a (dyn Matcher + Sync),
     ignore_fn: IgnoreFnType<'a>,
     outcome: Mutex<DirstateStatus<'on_disk>>,
+    cached_directory_mtimes_to_add:
+        Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>,
+
+    /// The current time at the start of the `status()` algorithm, as measured
+    /// and possibly truncated by the filesystem.
+    filesystem_time_at_status_start: Option<SystemTime>,
 }
 
 impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> {
@@ -97,18 +134,54 @@ 
             .push((hg_path.to_owned().into(), BadMatch::OsError(errno)))
     }
 
+    /// If this returns true, we can get accurate results by only using
+    /// `symlink_metadata` for child nodes that exist in the dirstate and don’t
+    /// need to call `read_dir`.
+    fn can_skip_fs_readdir(
+        &self,
+        directory_metadata: Option<&std::fs::Metadata>,
+        cached_directory_mtime: Option<&Timestamp>,
+    ) -> bool {
+        if !self.options.list_unknown && !self.options.list_ignored {
+            // All states that we care about listing have corresponding
+            // dirstate entries.
+            // This happens for example with `hg status -mard`.
+            return true;
+        }
+        if let Some(cached_mtime) = cached_directory_mtime {
+            // The dirstate contains a cached mtime for this directory, set by
+            // a previous run of the `status` algorithm which found this
+            // directory eligible for `read_dir` caching.
+            if let Some(meta) = directory_metadata {
+                if let Ok(current_mtime) = meta.modified() {
+                    if current_mtime == cached_mtime.into() {
+                        // The mtime of that directory has not changed since
+                        // then, which means that the
+                        // results of `read_dir` should also
+                        // be unchanged.
+                        return true;
+                    }
+                }
+            }
+        }
+        false
+    }
+
+    /// Returns whether the filesystem directory was found to have any entry
+    /// that does not have a corresponding dirstate tree node.
     fn traverse_fs_directory_and_dirstate(
         &self,
         has_ignored_ancestor: bool,
         dirstate_nodes: ChildNodesRef<'tree, 'on_disk>,
         directory_hg_path: &BorrowedPath<'tree, 'on_disk>,
         directory_fs_path: &Path,
+        directory_metadata: Option<&std::fs::Metadata>,
+        cached_directory_mtime: Option<&Timestamp>,
         is_at_repo_root: bool,
-    ) -> Result<(), DirstateV2ParseError> {
-        if !self.options.list_unknown && !self.options.list_ignored {
-            // We only care about files in the dirstate, so we can skip listing
-            // filesystem directories entirely.
-            return dirstate_nodes
+    ) -> Result<bool, DirstateV2ParseError> {
+        if self.can_skip_fs_readdir(directory_metadata, cached_directory_mtime)
+        {
+            dirstate_nodes
                 .par_iter()
                 .map(|dirstate_node| {
                     let fs_path = directory_fs_path.join(get_path_from_bytes(
@@ -131,7 +204,13 @@ 
                         }
                     }
                 })
-                .collect();
+                .collect::<Result<_, _>>()?;
+
+            // Conservatively don’t let the caller assume that there aren’t
+            // any, since we don’t know.
+            let directory_has_any_fs_only_entry = true;
+
+            return Ok(directory_has_any_fs_only_entry);
         }
 
         let mut fs_entries = if let Ok(entries) = self.read_dir(
@@ -174,6 +253,7 @@ 
         .par_bridge()
         .map(|pair| {
             use itertools::EitherOrBoth::*;
+            let is_fs_only = pair.is_right();
             match pair {
                 Both(dirstate_node, fs_entry) => self
                     .traverse_fs_and_dirstate(
@@ -181,18 +261,19 @@ 
                         &fs_entry.metadata,
                         dirstate_node,
                         has_ignored_ancestor,
-                    ),
+                    )?,
                 Left(dirstate_node) => {
-                    self.traverse_dirstate_only(dirstate_node)
+                    self.traverse_dirstate_only(dirstate_node)?
                 }
-                Right(fs_entry) => Ok(self.traverse_fs_only(
+                Right(fs_entry) => self.traverse_fs_only(
                     has_ignored_ancestor,
                     directory_hg_path,
                     fs_entry,
-                )),
+                ),
             }
+            Ok(is_fs_only)
         })
-        .collect()
+        .try_reduce(|| false, |a, b| Ok(a || b))
     }
 
     fn traverse_fs_and_dirstate(
@@ -224,12 +305,20 @@ 
             }
             let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path);
             let is_at_repo_root = false;
-            self.traverse_fs_directory_and_dirstate(
-                is_ignored,
-                dirstate_node.children(self.dmap.on_disk)?,
-                hg_path,
-                fs_path,
-                is_at_repo_root,
+            let directory_has_any_fs_only_entry = self
+                .traverse_fs_directory_and_dirstate(
+                    is_ignored,
+                    dirstate_node.children(self.dmap.on_disk)?,
+                    hg_path,
+                    fs_path,
+                    Some(fs_metadata),
+                    dirstate_node.cached_directory_mtime(),
+                    is_at_repo_root,
+                )?;
+            self.maybe_save_directory_mtime(
+                directory_has_any_fs_only_entry,
+                fs_metadata,
+                dirstate_node,
             )?
         } else {
             if file_or_symlink && self.matcher.matches(hg_path) {
@@ -274,6 +363,75 @@ 
         Ok(())
     }
 
+    fn maybe_save_directory_mtime(
+        &self,
+        directory_has_any_fs_only_entry: bool,
+        directory_metadata: &std::fs::Metadata,
+        dirstate_node: NodeRef<'tree, 'on_disk>,
+    ) -> Result<(), DirstateV2ParseError> {
+        if !directory_has_any_fs_only_entry {
+            // All filesystem directory entries from `read_dir` have a
+            // corresponding node in the dirstate, so we can reconstitute the
+            // names of those entries without calling `read_dir` again.
+            if let (Some(status_start), Ok(directory_mtime)) = (
+                &self.filesystem_time_at_status_start,
+                directory_metadata.modified(),
+            ) {
+                // Although the Rust standard library’s `SystemTime` type
+                // has nanosecond precision, the times reported for a
+                // directory’s (or file’s) modified time may have lower
+                // resolution based on the filesystem (for example ext3
+                // only stores integer seconds), kernel (see
+                // https://stackoverflow.com/a/14393315/1162888), etc.
+                if &directory_mtime >= status_start {
+                    // The directory was modified too recently, don’t cache its
+                    // `read_dir` results.
+                    //
+                    // A timeline like this is possible:
+                    //
+                    // 1. A change to this directory (direct child was
+                    //    added or removed) cause its mtime to be set
+                    //    (possibly truncated) to `directory_mtime`
+                    // 2. This `status` algorithm calls `read_dir`
+                    // 3. An other change is made to the same directory is
+                    //    made so that calling `read_dir` agin would give
+                    //    different results, but soon enough after 1. that
+                    //    the mtime stays the same
+                    //
+                    // On a system where the time resolution poor, this
+                    // scenario is not unlikely if all three steps are caused
+                    // by the same script.
+                } else {
+                    // We’ve observed (through `status_start`) that time has
+                    // “progressed” since `directory_mtime`, so any further
+                    // change to this directory is extremely likely to cause a
+                    // different mtime.
+                    //
+                    // Having the same mtime again is not entirely impossible
+                    // since the system clock is not monotonous. It could jump
+                    // backward to some point before `directory_mtime`, then a
+                    // directory change could potentially happen during exactly
+                    // the wrong tick.
+                    //
+                    // We deem this scenario (unlike the previous one) to be
+                    // unlikely enough in practice.
+                    let timestamp = directory_mtime.into();
+                    let cached = dirstate_node.cached_directory_mtime();
+                    if cached != Some(&timestamp) {
+                        let hg_path = dirstate_node
+                            .full_path_borrowed(self.dmap.on_disk)?
+                            .detach_from_tree();
+                        self.cached_directory_mtimes_to_add
+                            .lock()
+                            .unwrap()
+                            .push((hg_path, timestamp))
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
     /// A file with `EntryState::Normal` in the dirstate was found in the
     /// filesystem
     fn handle_normal_file(
@@ -505,3 +663,22 @@ 
         Ok(results)
     }
 }
+
+/// Return the `mtime` of a temporary file newly-created in the `.hg` directory
+/// of the give repository.
+///
+/// This is similar to `SystemTime::now()`, with the result truncated to the
+/// same time resolution as other files’ modification times. Using `.hg`
+/// instead of the system’s default temporary directory (such as `/tmp`) makes
+/// it more likely the temporary file is in the same disk partition as contents
+/// of the working directory, which can matter since different filesystems may
+/// store timestamps with different resolutions.
+///
+/// This may fail, typically if we lack write permissions. In that case we
+/// should continue the `status()` algoritm anyway and consider the current
+/// date/time to be unknown.
+fn filesystem_now(repo_root: &Path) -> Result<SystemTime, io::Error> {
+    tempfile::tempfile_in(repo_root.join(".hg"))?
+        .metadata()?
+        .modified()
+}
diff --git a/rust/hg-core/src/dirstate_tree/on_disk.rs b/rust/hg-core/src/dirstate_tree/on_disk.rs
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs
@@ -56,13 +56,31 @@ 
 
     /// Dependending on the value of `state`:
     ///
-    /// * A null byte: `data` represents nothing
+    /// * A null byte: `data` is not used.
+    ///
     /// * A `n`, `a`, `r`, or `m` ASCII byte: `state` and `data` together
-    ///   represents a dirstate entry like in the v1 format.
+    ///   represent a dirstate entry like in the v1 format.
+    ///
     /// * A `d` ASCII byte: the bytes of `data` should instead be interpreted
     ///   as the `Timestamp` for the mtime of a cached directory.
     ///
-    /// TODO: document directory caching
+    ///   The presence of this state means that at some point, this path in
+    ///   the working directory was observed:
+    ///
+    ///   - To be a directory
+    ///   - With the modification time as given by `Timestamp`
+    ///   - That timestamp was already strictly in the past when observed,
+    ///     meaning that later changes cannot happen in the same clock tick
+    ///     and must cause a different modification time (unless the system
+    ///     clock jumps back and we get unlucky, which is not impossible but
+    ///     but deemed unlikely enough).
+    ///   - The directory did not contain any child entry that did not have a
+    ///     corresponding dirstate node.
+    ///
+    ///   This means that if `std::fs::symlink_metadata` later reports the
+    ///   same modification time, we don’t need to call `std::fs::read_dir`
+    ///   again for this directory and can iterate child dirstate nodes
+    ///   instead.
     state: u8,
     data: Entry,
 }
@@ -76,7 +94,7 @@ 
 }
 
 /// Duration since the Unix epoch
-#[derive(BytesCast, Copy, Clone)]
+#[derive(BytesCast, Copy, Clone, PartialEq)]
 #[repr(C)]
 pub(super) struct Timestamp {
     seconds: I64Be,
@@ -258,6 +276,14 @@ 
         }
     }
 
+    pub(super) fn cached_directory_mtime(&self) -> Option<&Timestamp> {
+        if self.state == b'd' {
+            Some(self.data.as_timestamp())
+        } else {
+            None
+        }
+    }
+
     pub(super) fn state(
         &self,
     ) -> Result<Option<EntryState>, DirstateV2ParseError> {
@@ -326,8 +352,8 @@ 
     }
 }
 
-impl From<&'_ SystemTime> for Timestamp {
-    fn from(system_time: &'_ SystemTime) -> Self {
+impl From<SystemTime> for Timestamp {
+    fn from(system_time: SystemTime) -> Self {
         let (secs, nanos) = match system_time.duration_since(UNIX_EPOCH) {
             Ok(duration) => {
                 (duration.as_secs() as i64, duration.subsec_nanos())
diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
@@ -317,6 +317,18 @@ 
         }
     }
 
+    pub(super) fn cached_directory_mtime(
+        &self,
+    ) -> Option<&on_disk::Timestamp> {
+        match self {
+            NodeRef::InMemory(_path, node) => match &node.data {
+                NodeData::CachedDirectory { mtime } => Some(mtime),
+                _ => None,
+            },
+            NodeRef::OnDisk(node) => node.cached_directory_mtime(),
+        }
+    }
+
     pub(super) fn tracked_descendants_count(&self) -> u32 {
         match self {
             NodeRef::InMemory(_path, node) => node.tracked_descendants_count,
@@ -479,7 +491,7 @@ 
         }
     }
 
-    fn get_or_insert_node<'tree, 'path>(
+    pub(super) fn get_or_insert_node<'tree, 'path>(
         on_disk: &'on_disk [u8],
         root: &'tree mut ChildNodes<'on_disk>,
         path: &'path HgPath,
diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml
+++ b/rust/hg-core/Cargo.toml
@@ -23,6 +23,7 @@ 
 regex = "1.3.9"
 twox-hash = "1.5.0"
 same-file = "1.0.6"
+tempfile = "3.1.0"
 crossbeam-channel = "0.4"
 micro-timer = "0.3.0"
 log = "0.4.8"
@@ -41,4 +42,3 @@ 
 [dev-dependencies]
 clap = "*"
 pretty_assertions = "0.6.1"
-tempfile = "3.1.0"