Patchwork D10547: dirstate-tree: Add the new `status()` algorithm

login
register
mail settings
Submitter phabricator
Date May 3, 2021, 10:28 a.m.
Message ID <differential-rev-PHID-DREV-ssaix32f7eesuk34svlf-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/48873/
State Superseded
Headers show

Comments

phabricator - May 3, 2021, 10:28 a.m.
SimonSapin created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  With the dirstate organized in a tree that mirrors the structure of the
  filesystem tree, we can traverse both trees at the same time in order to
  compare them. This is hopefully more efficient that building multiple
  big hashmaps for all of the repository’s contents.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D10547

AFFECTED FILES
  rust/Cargo.lock
  rust/hg-core/Cargo.toml
  rust/hg-core/src/dirstate.rs
  rust/hg-core/src/dirstate/status.rs
  rust/hg-core/src/dirstate_tree/dirstate_map.rs
  rust/hg-core/src/dirstate_tree/status.rs
  rust/hg-core/src/utils/files.rs
  rust/hg-core/src/utils/hg_path.rs

CHANGE DETAILS




To: SimonSapin, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/rust/hg-core/src/utils/hg_path.rs b/rust/hg-core/src/utils/hg_path.rs
--- a/rust/hg-core/src/utils/hg_path.rs
+++ b/rust/hg-core/src/utils/hg_path.rs
@@ -6,6 +6,7 @@ 
 // GNU General Public License version 2 or any later version.
 
 use std::borrow::Borrow;
+use std::borrow::Cow;
 use std::convert::TryFrom;
 use std::ffi::{OsStr, OsString};
 use std::fmt;
@@ -535,6 +536,24 @@ 
     }
 }
 
+impl From<HgPathBuf> for Cow<'_, HgPath> {
+    fn from(path: HgPathBuf) -> Self {
+        Cow::Owned(path)
+    }
+}
+
+impl<'a> From<&'a HgPath> for Cow<'a, HgPath> {
+    fn from(path: &'a HgPath) -> Self {
+        Cow::Borrowed(path)
+    }
+}
+
+impl<'a> From<&'a HgPathBuf> for Cow<'a, HgPath> {
+    fn from(path: &'a HgPathBuf) -> Self {
+        Cow::Borrowed(&**path)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/rust/hg-core/src/utils/files.rs b/rust/hg-core/src/utils/files.rs
--- a/rust/hg-core/src/utils/files.rs
+++ b/rust/hg-core/src/utils/files.rs
@@ -17,7 +17,7 @@ 
 use lazy_static::lazy_static;
 use same_file::is_same_file;
 use std::borrow::{Cow, ToOwned};
-use std::ffi::OsStr;
+use std::ffi::{OsStr, OsString};
 use std::fs::Metadata;
 use std::iter::FusedIterator;
 use std::ops::Deref;
@@ -53,6 +53,12 @@ 
     str.as_ref().as_bytes().to_vec()
 }
 
+#[cfg(unix)]
+pub fn get_bytes_from_os_string(str: OsString) -> Vec<u8> {
+    use std::os::unix::ffi::OsStringExt;
+    str.into_vec()
+}
+
 /// An iterator over repository path yielding itself and its ancestors.
 #[derive(Copy, Clone, Debug)]
 pub struct Ancestors<'a> {
diff --git a/rust/hg-core/src/dirstate_tree/status.rs b/rust/hg-core/src/dirstate_tree/status.rs
--- a/rust/hg-core/src/dirstate_tree/status.rs
+++ b/rust/hg-core/src/dirstate_tree/status.rs
@@ -1,17 +1,361 @@ 
+use crate::dirstate::status::IgnoreFnType;
+use crate::dirstate_tree::dirstate_map::ChildNodes;
 use crate::dirstate_tree::dirstate_map::DirstateMap;
+use crate::dirstate_tree::dirstate_map::Node;
+use crate::matchers::get_ignore_function;
 use crate::matchers::Matcher;
+use crate::utils::files::get_bytes_from_os_string;
+use crate::utils::hg_path::HgPath;
 use crate::DirstateStatus;
+use crate::EntryState;
+use crate::HgPathBuf;
 use crate::PatternFileWarning;
 use crate::StatusError;
 use crate::StatusOptions;
+use std::borrow::Cow;
+use std::io;
+use std::path::Path;
 use std::path::PathBuf;
 
-pub fn status<'a>(
-    _dmap: &'a mut DirstateMap,
-    _matcher: &'a (dyn Matcher + Sync),
-    _root_dir: PathBuf,
-    _ignore_files: Vec<PathBuf>,
-    _options: StatusOptions,
-) -> Result<(DirstateStatus<'a>, Vec<PatternFileWarning>), StatusError> {
-    todo!()
+/// Returns the status of the working directory compared to its parent
+/// changeset.
+///
+/// This algorithm is based on traversing the filesystem tree (`fs` in function
+/// and variable names) and dirstate tree at the same time. The core of this
+/// traversal is the recursive `traverse_fs_directory_and_dirstate` function
+/// and its use of `itertools::merge_join_by`. When reaching a path that only
+/// exists in one of the two trees, depending on information requested by
+/// `options` we may need to traverse the remaining subtree.
+pub fn status<'tree>(
+    dmap: &'tree mut DirstateMap,
+    matcher: &(dyn Matcher + Sync),
+    root_dir: PathBuf,
+    ignore_files: Vec<PathBuf>,
+    options: StatusOptions,
+) -> Result<(DirstateStatus<'tree>, Vec<PatternFileWarning>), StatusError> {
+    let (ignore_fn, warnings): (IgnoreFnType, _) =
+        if options.list_ignored || options.list_unknown {
+            get_ignore_function(ignore_files, &root_dir)?
+        } else {
+            (Box::new(|&_| true), vec![])
+        };
+
+    let mut common = StatusCommon {
+        options,
+        matcher,
+        ignore_fn,
+        outcome: DirstateStatus::default(),
+    };
+    let is_at_repo_root = true;
+    let hg_path = HgPath::new("");
+    let has_ignored_ancestor = false;
+    common.traverse_fs_directory_and_dirstate(
+        has_ignored_ancestor,
+        &mut dmap.root,
+        hg_path,
+        &root_dir,
+        is_at_repo_root,
+    );
+    Ok((common.outcome, warnings))
+}
+
+/// Bag of random things needed by various parts of the algorithm. Reduces the
+/// number of parameters passed to functions.
+struct StatusCommon<'tree, 'a> {
+    options: StatusOptions,
+    matcher: &'a (dyn Matcher + Sync),
+    ignore_fn: IgnoreFnType<'a>,
+    outcome: DirstateStatus<'tree>,
 }
+
+impl<'tree, 'a> StatusCommon<'tree, 'a> {
+    fn traverse_fs_directory_and_dirstate(
+        &mut self,
+        has_ignored_ancestor: bool,
+        dirstate_nodes: &'tree mut ChildNodes,
+        directory_hg_path: &HgPath,
+        fs_path: &Path,
+        is_at_repo_root: bool,
+    ) {
+        // TODO: handle I/O errors
+        let mut fs_entries =
+            DirEntry::read_dir(fs_path, is_at_repo_root).unwrap();
+
+        // `sort_unstable_by_key` doesn’t allow keys borrowing from the value:
+        // https://github.com/rust-lang/rust/issues/34162
+        fs_entries.sort_unstable_by(|e1, e2| e1.base_name.cmp(&e2.base_name));
+
+        // `merge_join_by` requires both its input iterators to be sorted.
+        // * `BTreeMap` iterates according to keys’ ordering by definition
+        // * `DirEntry::read_dir` sorts explicitly for this reason
+        for pair in itertools::merge_join_by(
+            dirstate_nodes,
+            &fs_entries,
+            |(full_path, _node), fs_entry| {
+                full_path.base_name().cmp(&fs_entry.base_name)
+            },
+        ) {
+            use itertools::EitherOrBoth::*;
+            match pair {
+                Both((hg_path, dirstate_node), fs_entry) => {
+                    self.traverse_fs_and_dirstate(
+                        fs_entry,
+                        hg_path.full_path(),
+                        dirstate_node,
+                        has_ignored_ancestor,
+                    );
+                }
+                Left((hg_path, dirstate_node)) => self.traverse_dirstate_only(
+                    hg_path.full_path(),
+                    dirstate_node,
+                ),
+                Right(fs_entry) => self.traverse_fs_only(
+                    has_ignored_ancestor,
+                    directory_hg_path,
+                    fs_entry,
+                ),
+            }
+        }
+    }
+
+    fn traverse_fs_and_dirstate(
+        &mut self,
+        fs_entry: &DirEntry,
+        hg_path: &'tree HgPath,
+        dirstate_node: &'tree mut Node,
+        has_ignored_ancestor: bool,
+    ) {
+        if fs_entry.metadata.is_dir() {
+            if self.options.collect_traversed_dirs {
+                self.outcome.traversed.push(hg_path.into())
+            }
+            // If we previously had a file here, it was removed (with
+            // `hg rm` or similar) or deleted before it could be
+            // replaced by a directory.
+            self.mark_removed_or_deleted_if_file(
+                hg_path,
+                dirstate_node.state(),
+            );
+            let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path);
+            let is_at_repo_root = false;
+            self.traverse_fs_directory_and_dirstate(
+                is_ignored,
+                &mut dirstate_node.children,
+                hg_path,
+                &fs_entry.full_path,
+                is_at_repo_root,
+            );
+        } else {
+            if self.matcher.matches(hg_path) {
+                let full_path = Cow::from(hg_path);
+                if let Some(entry) = &dirstate_node.entry {
+                    match entry.state {
+                        EntryState::Added => {
+                            self.outcome.added.push(full_path)
+                        }
+                        EntryState::Removed => {
+                            self.outcome.removed.push(full_path)
+                        }
+                        EntryState::Merged => {
+                            self.outcome.modified.push(full_path)
+                        }
+                        EntryState::Normal => {
+                            self.handle_normal_file(
+                                full_path,
+                                dirstate_node,
+                                entry,
+                                fs_entry,
+                            );
+                        }
+                        // This variant is not used in DirstateMap
+                        // nodes
+                        EntryState::Unknown => unreachable!(),
+                    }
+                } else {
+                    // `node.entry.is_none()` indicates a "directory"
+                    // node, but the filesystem has a file
+                    self.mark_unknown_or_ignored(
+                        has_ignored_ancestor,
+                        full_path,
+                    )
+                }
+            }
+
+            for (child_hg_path, child_node) in &mut dirstate_node.children {
+                self.traverse_dirstate_only(
+                    child_hg_path.full_path(),
+                    child_node,
+                )
+            }
+        }
+    }
+
+    /// A file with `EntryState::Normal` in the dirstate was found in the
+    /// filesystem
+    fn handle_normal_file(
+        &mut self,
+        full_path: Cow<'tree, HgPath>,
+        dirstate_node: &Node,
+        entry: &crate::DirstateEntry,
+        fs_entry: &DirEntry,
+    ) {
+        if dirstate_node.copy_source.is_some()
+            || entry.is_from_other_parent()
+            || (entry.size >= 0
+                && (entry.size as u32 != fs_entry.metadata.len() as u32
+                    || (self.options.check_exec
+                        && entry.mode_changed(&fs_entry.metadata))))
+        {
+            self.outcome.modified.push(full_path)
+        } else {
+            let mtime = mtime_seconds(&fs_entry.metadata);
+            if mtime as i32 != entry.mtime
+                || mtime == self.options.last_normal_time
+            {
+                self.outcome.unsure.push(full_path)
+            } else if self.options.list_clean {
+                self.outcome.clean.push(full_path)
+            }
+        }
+    }
+
+    /// A node in the dirstate tree has no corresponding filesystem entry
+    fn traverse_dirstate_only(
+        &mut self,
+        hg_path: &'tree HgPath,
+        dirstate_node: &'tree mut Node,
+    ) {
+        self.mark_removed_or_deleted_if_file(hg_path, dirstate_node.state());
+        for (child_hg_path, child_node) in &mut dirstate_node.children {
+            self.traverse_dirstate_only(child_hg_path.full_path(), child_node)
+        }
+    }
+
+    /// A node in the dirstate tree has no corresponding *file* on the
+    /// filesystem
+    ///
+    /// Does nothing on a "directory" node
+    fn mark_removed_or_deleted_if_file(
+        &mut self,
+        hg_path: &'tree HgPath,
+        dirstate_node_state: Option<EntryState>,
+    ) {
+        if self.matcher.matches(hg_path) {
+            match dirstate_node_state {
+                Some(EntryState::Removed) => {
+                    self.outcome.removed.push(hg_path.into())
+                }
+                Some(_) => self.outcome.deleted.push(hg_path.into()),
+                None => {}
+            }
+        }
+    }
+
+    /// Something in the filesystem has no corresponding dirstate node
+    fn traverse_fs_only(
+        &mut self,
+        has_ignored_ancestor: bool,
+        directory_hg_path: &HgPath,
+        fs_entry: &DirEntry,
+    ) {
+        let hg_path = directory_hg_path.join(&fs_entry.base_name);
+        if fs_entry.metadata.is_dir() {
+            let is_ignored =
+                has_ignored_ancestor || (self.ignore_fn)(&hg_path);
+            let traverse_children = if is_ignored {
+                // Descendants of an ignored directory are all ignored
+                self.options.list_ignored
+            } else {
+                // Descendants of an unknown directory may be either unknown or
+                // ignored
+                self.options.list_unknown || self.options.list_ignored
+            };
+            if traverse_children {
+                let is_at_repo_root = false;
+                // TODO: handle I/O errors
+                let children_fs_entries =
+                    DirEntry::read_dir(&fs_entry.full_path, is_at_repo_root)
+                        .unwrap();
+                for child_fs_entry in children_fs_entries {
+                    self.traverse_fs_only(
+                        is_ignored,
+                        &hg_path,
+                        &child_fs_entry,
+                    )
+                }
+            }
+            if self.options.collect_traversed_dirs {
+                self.outcome.traversed.push(hg_path.into())
+            }
+        } else if self.matcher.matches(&hg_path) {
+            self.mark_unknown_or_ignored(has_ignored_ancestor, hg_path.into())
+        }
+    }
+
+    fn mark_unknown_or_ignored(
+        &mut self,
+        has_ignored_ancestor: bool,
+        hg_path: Cow<'tree, HgPath>,
+    ) {
+        let is_ignored = has_ignored_ancestor || (self.ignore_fn)(&hg_path);
+        if is_ignored {
+            if self.options.list_ignored {
+                self.outcome.ignored.push(hg_path)
+            }
+        } else {
+            if self.options.list_unknown {
+                self.outcome.unknown.push(hg_path)
+            }
+        }
+    }
+}
+
+#[cfg(unix)] // TODO
+fn mtime_seconds(metadata: &std::fs::Metadata) -> i64 {
+    // Going through `Metadata::modified()` would be portable, but would take
+    // care to construct a `SystemTime` value with sub-second precision just
+    // for us to throw that away here.
+    use std::os::unix::fs::MetadataExt;
+    metadata.mtime()
+}
+
+struct DirEntry {
+    base_name: HgPathBuf,
+    full_path: PathBuf,
+    metadata: std::fs::Metadata,
+}
+
+impl DirEntry {
+    /// Returns sorted names and metadata of entries in the given directory.
+    ///
+    /// If a `.hg` sub-directory is encountered:
+    ///
+    /// * At the repository root, ignore that sub-directory
+    /// * Elsewhere, we’re listing the content of a sub-repo. Return an empty
+    ///   list instead.
+    fn read_dir(path: &Path, is_at_repo_root: bool) -> io::Result<Vec<Self>> {
+        let mut results = Vec::new();
+        for entry in path.read_dir()? {
+            let entry = entry?;
+            let metadata = entry.metadata()?;
+            let name = get_bytes_from_os_string(entry.file_name());
+            // FIXME don't do this when cached
+            if name == b".hg" {
+                if is_at_repo_root {
+                    // Skip the repo’s own .hg (might be a symlink)
+                    continue;
+                } else if metadata.is_dir() {
+                    // A .hg sub-directory at another location means a subrepo,
+                    // skip it entirely.
+                    return Ok(Vec::new());
+                }
+            }
+            results.push(DirEntry {
+                base_name: name.into(),
+                full_path: entry.path(),
+                metadata,
+            })
+        }
+        Ok(results)
+    }
+}
diff --git a/rust/hg-core/src/dirstate_tree/dirstate_map.rs b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs
@@ -27,7 +27,7 @@ 
 pub struct DirstateMap {
     parents: Option<DirstateParents>,
     dirty_parents: bool,
-    root: ChildNodes,
+    pub(super) root: ChildNodes,
 
     /// Number of nodes anywhere in the tree that have `.entry.is_some()`.
     nodes_with_entry_count: usize,
@@ -42,17 +42,17 @@ 
 /// path, so comparing full paths gives the same result as comparing base
 /// names. However `BTreeMap` would waste time always re-comparing the same
 /// string prefix.
-type ChildNodes = BTreeMap<WithBasename<HgPathBuf>, Node>;
+pub(super) type ChildNodes = BTreeMap<WithBasename<HgPathBuf>, Node>;
 
 /// Represents a file or a directory
 #[derive(Default)]
-struct Node {
+pub(super) struct Node {
     /// `None` for directories
-    entry: Option<DirstateEntry>,
+    pub(super) entry: Option<DirstateEntry>,
 
-    copy_source: Option<HgPathBuf>,
+    pub(super) copy_source: Option<HgPathBuf>,
 
-    children: ChildNodes,
+    pub(super) children: ChildNodes,
 
     /// How many (non-inclusive) descendants of this node are tracked files
     tracked_descendants_count: usize,
@@ -67,6 +67,10 @@ 
             false
         }
     }
+
+    pub(super) fn state(&self) -> Option<EntryState> {
+        self.entry.as_ref().map(|entry| entry.state)
+    }
 }
 
 /// `(full_path, entry, copy_source)`
diff --git a/rust/hg-core/src/dirstate/status.rs b/rust/hg-core/src/dirstate/status.rs
--- a/rust/hg-core/src/dirstate/status.rs
+++ b/rust/hg-core/src/dirstate/status.rs
@@ -95,7 +95,7 @@ 
 
 type IoResult<T> = std::io::Result<T>;
 
-/// `Box<dyn Trait>` is syntactic sugar for `Box<dyn Trait, 'static>`, so add
+/// `Box<dyn Trait>` is syntactic sugar for `Box<dyn Trait + 'static>`, so add
 /// an explicit lifetime here to not fight `'static` bounds "out of nowhere".
 pub type IgnoreFnType<'a> =
     Box<dyn for<'r> Fn(&'r HgPath) -> bool + Sync + 'a>;
@@ -255,7 +255,7 @@ 
     pub collect_traversed_dirs: bool,
 }
 
-#[derive(Debug)]
+#[derive(Debug, Default)]
 pub struct DirstateStatus<'a> {
     /// Tracked files whose contents have changed since the parent revision
     pub modified: Vec<HgPathCow<'a>>,
diff --git a/rust/hg-core/src/dirstate.rs b/rust/hg-core/src/dirstate.rs
--- a/rust/hg-core/src/dirstate.rs
+++ b/rust/hg-core/src/dirstate.rs
@@ -42,6 +42,19 @@ 
     pub fn is_from_other_parent(&self) -> bool {
         self.state == EntryState::Normal && self.size == SIZE_FROM_OTHER_PARENT
     }
+
+    // TODO: other platforms
+    #[cfg(unix)]
+    pub fn mode_changed(
+        &self,
+        filesystem_metadata: &std::fs::Metadata,
+    ) -> bool {
+        use std::os::unix::fs::MetadataExt;
+        const EXEC_BIT_MASK: u32 = 0o100;
+        let dirstate_exec_bit = (self.mode as u32) & EXEC_BIT_MASK;
+        let fs_exec_bit = filesystem_metadata.mode() & EXEC_BIT_MASK;
+        dirstate_exec_bit != fs_exec_bit
+    }
 }
 
 #[derive(BytesCast)]
diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml
+++ b/rust/hg-core/Cargo.toml
@@ -14,6 +14,7 @@ 
 derive_more = "0.99"
 home = "0.5"
 im-rc = "15.0.*"
+itertools = "0.9"
 lazy_static = "1.4.0"
 rand = "0.7.3"
 rand_pcg = "0.2.1"
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -358,6 +358,7 @@ 
  "format-bytes",
  "home",
  "im-rc",
+ "itertools",
  "lazy_static",
  "log",
  "memmap",