Patchwork D11731: rhg: lazily get filesystem metadata

login
register
mail settings
Submitter phabricator
Date Nov. 2, 2021, 12:06 p.m.
Message ID <differential-rev-PHID-DREV-mpjy327wngvmejmtrxr7-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/50056/
State New
Headers show

Comments

phabricator - Nov. 2, 2021, 12:06 p.m.
aalekseyev created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  rhg: use openat library for more efficient filesystem access

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D11731

AFFECTED FILES
  rust/Cargo.lock
  rust/hg-core/Cargo.toml
  rust/hg-core/src/dirstate/entry.rs
  rust/hg-core/src/dirstate_tree/status.rs

CHANGE DETAILS




To: aalekseyev, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/rust/hg-core/src/dirstate_tree/status.rs b/rust/hg-core/src/dirstate_tree/status.rs
--- a/rust/hg-core/src/dirstate_tree/status.rs
+++ b/rust/hg-core/src/dirstate_tree/status.rs
@@ -8,13 +8,10 @@ 
 use crate::dirstate_tree::on_disk::DirstateV2ParseError;
 use crate::matchers::get_ignore_function;
 use crate::matchers::Matcher;
-use crate::utils::files::get_bytes_from_os_string;
-use crate::utils::files::get_path_from_bytes;
 use crate::utils::hg_path::HgPath;
 use crate::BadMatch;
 use crate::DirstateStatus;
 use crate::EntryState;
-use crate::HgPathBuf;
 use crate::PatternFileWarning;
 use crate::StatusError;
 use crate::StatusOptions;
@@ -27,6 +24,11 @@ 
 use std::path::PathBuf;
 use std::sync::Mutex;
 use std::time::SystemTime;
+use openat::{Dir};
+use std::ffi::CStr;
+use std::os::unix::ffi::OsStrExt;
+use std::ffi::CString;
+use std::convert::TryInto;
 
 /// Returns the status of the working directory compared to its parent
 /// changeset.
@@ -80,6 +82,7 @@ 
     // If the path we have for the repository root is a symlink, do follow it.
     // (As opposed to symlinks within the working directory which are not
     // followed, using `std::fs::symlink_metadata`.)
+    let root_dir = openat::Dir::open(&root_dir)?;
     common.traverse_fs_directory_and_dirstate(
         has_ignored_ancestor,
         dmap.root.as_ref(),
@@ -141,14 +144,94 @@ 
     filesystem_time_at_status_start: Option<SystemTime>,
 }
 
+enum MetadataOrType<'a> {
+    Metadata(&'a openat::Metadata),
+    FileType(openat::SimpleType)
+}
+
+struct FilesystemEntry<'a> {
+    dir : &'a Dir,
+    name : &'a CStr,
+    metadata_or_type : MetadataOrType<'a>
+}
+
+enum TORREF <'a, T> {
+   Owned(T),
+   Borrowed(&'a T)
+}
+
+impl <'a, T> TORREF <'a, T> {
+    fn borrow(&self) -> &T {
+        match self {
+            TORREF::Owned(t) => &t,
+            TORREF::Borrowed(r) => r
+        }
+    }
+}
+
+impl <'a> FilesystemEntry<'a> {
+
+    fn of_read_dir(dir : &'a Dir, entry : &'a openat::Entry) -> Self {
+        FilesystemEntry {
+           dir, name : <& openat::Entry as openat::AsPath>::to_path(entry).unwrap(), metadata_or_type : MetadataOrType::FileType(entry.simple_type().unwrap()) // TODO: not unwrap
+        }
+    }
+
+    fn of_metadata(dir : &'a Dir, name : &'a CStr, metadata : &'a openat::Metadata) -> Self {
+        FilesystemEntry {
+           dir, name, metadata_or_type : MetadataOrType::Metadata(metadata)
+        }
+    }
+
+    fn file_type(&self) -> openat::SimpleType {
+       match self.metadata_or_type {
+           MetadataOrType::Metadata(metadata) => metadata.simple_type (),
+           MetadataOrType::FileType(file_type) => file_type
+       }
+    }
+
+    // Returns None if the entry is not a directory
+    fn as_directory(&self) -> io::Result<Option<(Dir, openat::Metadata)>> {
+        match self.file_type() {
+            openat::SimpleType::Dir => {
+                let dir = self.dir.sub_dir(self.name)?;
+                let metadata = dir.self_metadata()?;
+                Ok(Some((dir, metadata)))
+            },
+            _ => Ok(None)
+        }
+    }
+
+    fn metadata(&self) -> io::Result<TORREF<'a, openat::Metadata>> {
+        match self.metadata_or_type {
+            MetadataOrType::Metadata(m) => Ok(TORREF::Borrowed(m)),
+            MetadataOrType::FileType(_) => {
+                let metadata = self.dir.metadata(self.name)?;
+                Ok(TORREF::Owned(metadata)) }
+        }
+    }
+}
+
+    fn stat_mtime(metadata : &openat::Metadata) -> SystemTime {
+        let stat = metadata.stat();
+        let nsec : i64 = stat.st_mtime_nsec as libc::c_long;
+        let nsec : u32 = nsec.try_into().unwrap();
+        let sec : i64 = stat.st_mtime as libc::time_t;
+        let sec : u64 = sec.try_into().unwrap();
+        let dur = std::time::Duration::new(sec, nsec);
+        SystemTime::UNIX_EPOCH.checked_add(dur).unwrap()
+    }
+
+
+
 impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> {
     fn read_dir(
         &self,
         hg_path: &HgPath,
-        fs_path: &Path,
+        fs_dir: &Dir,
         is_at_repo_root: bool,
-    ) -> Result<Vec<DirEntry>, ()> {
-        DirEntry::read_dir(fs_path, is_at_repo_root)
+    ) -> Result<Vec<openat::Entry>, ()> {
+        read_dir(fs_dir, is_at_repo_root)
             .map_err(|error| self.io_error(error, hg_path))
     }
 
@@ -182,7 +265,7 @@ 
     /// need to call `read_dir`.
     fn can_skip_fs_readdir(
         &self,
-        directory_metadata: Option<&std::fs::Metadata>,
+        directory_metadata: Option<&openat::Metadata>,
         cached_directory_mtime: Option<TruncatedTimestamp>,
     ) -> bool {
         if !self.options.list_unknown && !self.options.list_ignored {
@@ -221,8 +304,8 @@ 
         has_ignored_ancestor: bool,
         dirstate_nodes: ChildNodesRef<'tree, 'on_disk>,
         directory_hg_path: &BorrowedPath<'tree, 'on_disk>,
-        directory_fs_path: &Path,
-        directory_metadata: Option<&std::fs::Metadata>,
+        directory_fd: &Dir,
+        directory_metadata: Option<&openat::Metadata>,
         cached_directory_mtime: Option<TruncatedTimestamp>,
         is_at_repo_root: bool,
     ) -> Result<bool, DirstateV2ParseError> {
@@ -231,13 +314,11 @@ 
             dirstate_nodes
                 .par_iter()
                 .map(|dirstate_node| {
-                    let fs_path = directory_fs_path.join(get_path_from_bytes(
-                        dirstate_node.base_name(self.dmap.on_disk)?.as_bytes(),
-                    ));
-                    match std::fs::symlink_metadata(&fs_path) {
+                    let base_name = dirstate_node.base_name(self.dmap.on_disk)?;
+                    let base_name_cstring = CString::new(base_name.as_bytes()).expect("NULL BYTE");
+                    match directory_fd.metadata(base_name_cstring.as_ref()) {
                         Ok(fs_metadata) => self.traverse_fs_and_dirstate(
-                            &fs_path,
-                            &fs_metadata,
+                            &FilesystemEntry::of_metadata(directory_fd, base_name_cstring.as_ref(), &fs_metadata),
                             dirstate_node,
                             has_ignored_ancestor,
                         ),
@@ -261,7 +342,7 @@ 
 
         let mut fs_entries = if let Ok(entries) = self.read_dir(
             directory_hg_path,
-            directory_fs_path,
+            directory_fd,
             is_at_repo_root,
         ) {
             entries
@@ -277,7 +358,7 @@ 
         let dirstate_nodes = dirstate_nodes.sorted();
         // `sort_unstable_by_key` doesn’t allow keys borrowing from the value:
         // https://github.com/rust-lang/rust/issues/34162
-        fs_entries.sort_unstable_by(|e1, e2| e1.base_name.cmp(&e2.base_name));
+        fs_entries.sort_unstable_by(|e1, e2| e1.file_name().as_bytes().cmp(&e2.file_name().as_bytes()));
 
         // Propagate here any error that would happen inside the comparison
         // callback below
@@ -293,7 +374,8 @@ 
                 dirstate_node
                     .base_name(self.dmap.on_disk)
                     .unwrap()
-                    .cmp(&fs_entry.base_name)
+                    .as_bytes()
+                    .cmp(&fs_entry.file_name().as_bytes())
             },
         )
         .par_bridge()
@@ -303,8 +385,7 @@ 
             match pair {
                 Both(dirstate_node, fs_entry) => {
                     self.traverse_fs_and_dirstate(
-                        &fs_entry.full_path,
-                        &fs_entry.metadata,
+                        &FilesystemEntry::of_read_dir(directory_fd, &fs_entry),
                         dirstate_node,
                         has_ignored_ancestor,
                     )?;
@@ -318,6 +399,7 @@ 
                     has_dirstate_node_or_is_ignored = self.traverse_fs_only(
                         has_ignored_ancestor,
                         directory_hg_path,
+                        directory_fd,
                         fs_entry,
                     )
                 }
@@ -329,15 +411,18 @@ 
 
     fn traverse_fs_and_dirstate(
         &self,
-        fs_path: &Path,
-        fs_metadata: &std::fs::Metadata,
+        fs_entry: &FilesystemEntry,
         dirstate_node: NodeRef<'tree, 'on_disk>,
         has_ignored_ancestor: bool,
     ) -> Result<(), DirstateV2ParseError> {
         self.check_for_outdated_directory_cache(&dirstate_node)?;
         let hg_path = &dirstate_node.full_path_borrowed(self.dmap.on_disk)?;
-        let file_type = fs_metadata.file_type();
-        let file_or_symlink = file_type.is_file() || file_type.is_symlink();
+        let file_type = fs_entry.file_type();
+        let file_or_symlink =
+           match file_type {
+               openat::SimpleType::File | openat::SimpleType::Symlink => true,
+               openat::SimpleType::Dir | openat::SimpleType::Other => false,
+           };
         if !file_or_symlink {
             // If we previously had a file here, it was removed (with
             // `hg rm` or similar) or deleted before it could be
@@ -347,7 +432,8 @@ 
                 dirstate_node.state()?,
             );
         }
-        if file_type.is_dir() {
+        match fs_entry.as_directory().expect("TODO: handle") {
+        Some ((dir, fs_metadata)) => {
             if self.options.collect_traversed_dirs {
                 self.outcome
                     .lock()
@@ -362,17 +448,18 @@ 
                     is_ignored,
                     dirstate_node.children(self.dmap.on_disk)?,
                     hg_path,
-                    fs_path,
-                    Some(fs_metadata),
+                    &dir,
+                    Some(&fs_metadata),
                     dirstate_node.cached_directory_mtime()?,
                     is_at_repo_root,
                 )?;
             self.maybe_save_directory_mtime(
                 children_all_have_dirstate_node_or_are_ignored,
-                fs_metadata,
+                &fs_metadata,
                 dirstate_node,
             )?
-        } else {
+        },
+        None => {
             if file_or_symlink && self.matcher.matches(hg_path) {
                 if let Some(state) = dirstate_node.state()? {
                     match state {
@@ -394,8 +481,13 @@ 
                             .unwrap()
                             .modified
                             .push(hg_path.detach_from_tree()),
-                        EntryState::Normal => self
-                            .handle_normal_file(&dirstate_node, fs_metadata)?,
+                        EntryState::Normal => {
+                            let fs_metadata = fs_entry.metadata().expect("TODO");
+                            self.handle_normal_file(
+                                &dirstate_node,
+                                fs_metadata.borrow(),
+                            )?
+                        }
                     }
                 } else {
                     // `node.entry.is_none()` indicates a "directory"
@@ -411,23 +503,25 @@ 
             {
                 self.traverse_dirstate_only(child_node)?
             }
-        }
+        },
+        };
         Ok(())
     }
 
+
     fn maybe_save_directory_mtime(
         &self,
         children_all_have_dirstate_node_or_are_ignored: bool,
-        directory_metadata: &std::fs::Metadata,
+        directory_metadata: &openat::Metadata,
         dirstate_node: NodeRef<'tree, 'on_disk>,
     ) -> Result<(), DirstateV2ParseError> {
         if children_all_have_dirstate_node_or_are_ignored {
             // All filesystem directory entries from `read_dir` have a
             // corresponding node in the dirstate, so we can reconstitute the
             // names of those entries without calling `read_dir` again.
-            if let (Some(status_start), Ok(directory_mtime)) = (
+            if let (Some(status_start), directory_mtime) = (
                 &self.filesystem_time_at_status_start,
-                directory_metadata.modified(),
+                stat_mtime(directory_metadata),
             ) {
                 // Although the Rust standard library’s `SystemTime` type
                 // has nanosecond precision, the times reported for a
@@ -495,7 +589,7 @@ 
     fn handle_normal_file(
         &self,
         dirstate_node: &NodeRef<'tree, 'on_disk>,
-        fs_metadata: &std::fs::Metadata,
+        fs_metadata: &openat::Metadata,
     ) -> Result<(), DirstateV2ParseError> {
         // Keep the low 31 bits
         fn truncate_u64(value: u64) -> i32 {
@@ -510,7 +604,7 @@ 
             || self.options.check_exec && entry.mode_changed(fs_metadata);
         let size = entry.size();
         let size_changed = size != truncate_u64(fs_metadata.len());
-        if size >= 0 && size_changed && fs_metadata.file_type().is_symlink() {
+        if size >= 0 && size_changed && fs_metadata.simple_type() == openat::SimpleType::Symlink {
             // issue6456: Size returned may be longer due to encryption
             // on EXT-4 fscrypt. TODO maybe only do it on EXT4?
             self.outcome
@@ -607,12 +701,15 @@ 
         &self,
         has_ignored_ancestor: bool,
         directory_hg_path: &HgPath,
-        fs_entry: &DirEntry,
+        parent_dir : &Dir,
+        fs_entry: &openat::Entry,
     ) -> bool {
-        let hg_path = directory_hg_path.join(&fs_entry.base_name);
-        let file_type = fs_entry.metadata.file_type();
-        let file_or_symlink = file_type.is_file() || file_type.is_symlink();
-        if file_type.is_dir() {
+        let hg_path = directory_hg_path.join(HgPath::new(&fs_entry.file_name ().as_bytes()));
+        let file_type = fs_entry.simple_type().unwrap(); // TODO: fix unwrap
+        let file_or_symlink = match file_type {
+            openat::SimpleType::File | openat::SimpleType::Symlink => true,
+            _ => false };
+        if openat::SimpleType::Dir == file_type {
             let is_ignored =
                 has_ignored_ancestor || (self.ignore_fn)(&hg_path);
             let traverse_children = if is_ignored {
@@ -625,15 +722,17 @@ 
             };
             if traverse_children {
                 let is_at_repo_root = false;
+                let dir = parent_dir.sub_dir(fs_entry.file_name()).expect("TODO");
                 if let Ok(children_fs_entries) = self.read_dir(
                     &hg_path,
-                    &fs_entry.full_path,
+                    &dir,
                     is_at_repo_root,
                 ) {
                     children_fs_entries.par_iter().for_each(|child_fs_entry| {
                         self.traverse_fs_only(
                             is_ignored,
                             &hg_path,
+                            &dir,
                             child_fs_entry,
                         );
                     })
@@ -694,46 +793,34 @@ 
     }
 }
 
-struct DirEntry {
-    base_name: HgPathBuf,
-    full_path: PathBuf,
-    metadata: std::fs::Metadata,
-}
-
-impl DirEntry {
-    /// Returns **unsorted** entries in the given directory, with name and
-    /// metadata.
-    ///
-    /// If a `.hg` sub-directory is encountered:
-    ///
-    /// * At the repository root, ignore that sub-directory
-    /// * Elsewhere, we’re listing the content of a sub-repo. Return an empty
-    ///   list instead.
-    fn read_dir(path: &Path, is_at_repo_root: bool) -> io::Result<Vec<Self>> {
-        let mut results = Vec::new();
-        for entry in path.read_dir()? {
-            let entry = entry?;
-            let metadata = entry.metadata()?;
-            let name = get_bytes_from_os_string(entry.file_name());
-            // FIXME don't do this when cached
-            if name == b".hg" {
-                if is_at_repo_root {
-                    // Skip the repo’s own .hg (might be a symlink)
-                    continue;
-                } else if metadata.is_dir() {
-                    // A .hg sub-directory at another location means a subrepo,
-                    // skip it entirely.
-                    return Ok(Vec::new());
-                }
+/// Returns **unsorted** entries in the given directory, with name and
+/// metadata.
+///
+/// If a `.hg` sub-directory is encountered:
+///
+/// * At the repository root, ignore that sub-directory
+/// * Elsewhere, we’re listing the content of a sub-repo. Return an empty
+///   list instead.
+fn read_dir(dir: &openat::Dir, is_at_repo_root: bool) -> io::Result<Vec<openat::Entry>> {
+    let mut results = Vec::new();
+    for entry in dir.list_dir(".")? {
+        let entry = entry?;
+        let name = entry.file_name();
+        // FIXME don't do this when cached
+        if name.as_bytes() == b".hg" {
+            if is_at_repo_root {
+                // Skip the repo’s own .hg (might be a symlink)
+                continue;
+                // TODO: handle simple_type returning None
+            } else if Some(openat::SimpleType::Dir) == entry.simple_type() {
+                // A .hg sub-directory at another location means a subrepo,
+                // skip it entirely.
+                return Ok(Vec::new());
             }
-            results.push(DirEntry {
-                base_name: name.into(),
-                full_path: entry.path(),
-                metadata,
-            })
         }
-        Ok(results)
+        results.push(entry)
     }
+    Ok(results)
 }
 
 /// Return the `mtime` of a temporary file newly-created in the `.hg` directory
diff --git a/rust/hg-core/src/dirstate/entry.rs b/rust/hg-core/src/dirstate/entry.rs
--- a/rust/hg-core/src/dirstate/entry.rs
+++ b/rust/hg-core/src/dirstate/entry.rs
@@ -2,7 +2,6 @@ 
 use crate::errors::HgError;
 use bitflags::bitflags;
 use std::convert::{TryFrom, TryInto};
-use std::fs;
 use std::io;
 use std::time::{SystemTime, UNIX_EPOCH};
 
@@ -76,18 +75,18 @@ 
         }
     }
 
-    pub fn for_mtime_of(metadata: &fs::Metadata) -> io::Result<Self> {
+    pub fn for_mtime_of(metadata: &openat::Metadata) -> io::Result<Self> {
         #[cfg(unix)]
         {
-            use std::os::unix::fs::MetadataExt;
-            let seconds = metadata.mtime();
-            // i64 -> u32 with value always in the `0 .. NSEC_PER_SEC` range
-            let nanoseconds = metadata.mtime_nsec().try_into().unwrap();
+            let stat = metadata.stat();
+            let seconds = stat.st_mtime as libc::time_t;
+            let nanoseconds = stat.st_mtime_nsec as libc::c_long;
+            let nanoseconds : u32 = nanoseconds.try_into().unwrap();
             Ok(Self::new_truncate(seconds, nanoseconds))
         }
         #[cfg(not(unix))]
         {
-            metadata.modified().map(Self::from)
+            panic!("not supported")
         }
     }
 
@@ -130,10 +129,11 @@ 
 
     pub fn likely_equal_to_mtime_of(
         self,
-        metadata: &fs::Metadata,
+        metadata: &openat::Metadata,
     ) -> io::Result<bool> {
         Ok(self.likely_equal(Self::for_mtime_of(metadata)?))
     }
+
 }
 
 impl From<SystemTime> for TruncatedTimestamp {
@@ -578,12 +578,11 @@ 
     #[cfg(unix)]
     pub fn mode_changed(
         &self,
-        filesystem_metadata: &std::fs::Metadata,
+        filesystem_metadata: &openat::Metadata,
     ) -> bool {
-        use std::os::unix::fs::MetadataExt;
         const EXEC_BIT_MASK: u32 = 0o100;
         let dirstate_exec_bit = (self.mode() as u32) & EXEC_BIT_MASK;
-        let fs_exec_bit = filesystem_metadata.mode() & EXEC_BIT_MASK;
+        let fs_exec_bit = filesystem_metadata.stat().st_mode & EXEC_BIT_MASK;
         dirstate_exec_bit != fs_exec_bit
     }
 
diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml
+++ b/rust/hg-core/Cargo.toml
@@ -34,6 +34,7 @@ 
 memmap2 = {version = "0.4", features = ["stable_deref_trait"]}
 zstd = "0.5.3"
 format-bytes = "0.2.2"
+openat = "0.1.21"
 
 # We don't use the `miniz-oxide` backend to not change rhg benchmarks and until
 # we have a clearer view of which backend is the fastest.
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -1,5 +1,7 @@ 
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
+version = 3
+
 [[package]]
 name = "adler"
 version = "0.2.3"
@@ -390,6 +392,7 @@ 
  "log",
  "memmap2",
  "micro-timer",
+ "openat",
  "pretty_assertions",
  "rand",
  "rand_distr",
@@ -598,6 +601,15 @@ 
 checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5"
 
 [[package]]
+name = "openat"
+version = "0.1.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95aa7c05907b3ebde2610d602f4ddd992145cc6a84493647c30396f30ba83abe"
+dependencies = [
+ "libc",
+]
+
+[[package]]
 name = "output_vt100"
 version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"