Patchwork D11965: rhg: desambiguate status without decompressing filelog if possible

login
register
mail settings
Submitter phabricator
Date Jan. 6, 2022, 6:58 p.m.
Message ID <differential-rev-PHID-DREV-rag3fcp4fbppiwus754b-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/50291/
State New
Headers show

Comments

phabricator - Jan. 6, 2022, 6:58 p.m.
SimonSapin created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  When status is unsure based on `stat()` and the dirstate if a file is clean
  or modified, we need to compare it against the filelog.
  
  This comparison can skip looking at contents if the lengths differ.
  This changeset optimize this further to deduce what we can about the length
  if the filelog without decompressing it or resolving deltas.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D11965

AFFECTED FILES
  mercurial/revlogutils/flagutil.py
  rust/hg-core/src/revlog/filelog.rs
  rust/hg-core/src/revlog/index.rs
  rust/hg-core/src/revlog/revlog.rs
  rust/rhg/src/commands/status.rs

CHANGE DETAILS




To: SimonSapin, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/rust/rhg/src/commands/status.rs b/rust/rhg/src/commands/status.rs
--- a/rust/rhg/src/commands/status.rs
+++ b/rust/rhg/src/commands/status.rs
@@ -516,16 +516,16 @@ 
         filelog.entry_for_node(entry.node_id()?).map_err(|_| {
             HgError::corrupted("filelog missing node from manifest")
         })?;
-    // TODO: check `fs_len` here like below, but based on
-    // `RevlogEntry::uncompressed_len` without decompressing the full filelog
-    // contents where possible. This is only valid if the revlog data does not
-    // contain metadata. See how Python’s `revlog.rawsize` calls
-    // `storageutil.filerevisioncopied`.
-    // (Maybe also check for content-modifying flags? See `revlog.size`.)
-    let filelog_data = filelog_entry.data()?;
-    let contents_in_p1 = filelog_data.file_data()?;
-    if contents_in_p1.len() as u64 != fs_len {
-        // No need to read the file contents:
+    if filelog_entry.file_data_len_not_equal_to(fs_len) {
+        // No need to read file contents:
+        // it cannot be equal if it has a different length.
+        return Ok(true);
+    }
+
+    let p1_filelog_data = filelog_entry.data()?;
+    let p1_contents = p1_filelog_data.file_data()?;
+    if p1_contents.len() as u64 != fs_len {
+        // No need to read file contents:
         // it cannot be equal if it has a different length.
         return Ok(true);
     }
@@ -535,5 +535,5 @@ 
     } else {
         vfs.read(fs_path)?
     };
-    Ok(contents_in_p1 != &*fs_contents)
+    Ok(p1_contents != &*fs_contents)
 }
diff --git a/rust/hg-core/src/revlog/revlog.rs b/rust/hg-core/src/revlog/revlog.rs
--- a/rust/hg-core/src/revlog/revlog.rs
+++ b/rust/hg-core/src/revlog/revlog.rs
@@ -20,6 +20,18 @@ 
 use crate::revlog::Revision;
 use crate::{Node, NULL_REVISION};
 
+const REVISION_FLAG_CENSORED: u16 = 1 << 15;
+const REVISION_FLAG_ELLIPSIS: u16 = 1 << 14;
+const REVISION_FLAG_EXTSTORED: u16 = 1 << 13;
+const REVISION_FLAG_HASCOPIESINFO: u16 = 1 << 12;
+
+// Keep this in sync with REVIDX_KNOWN_FLAGS in
+// mercurial/revlogutils/flagutil.py
+const REVIDX_KNOWN_FLAGS: u16 = REVISION_FLAG_CENSORED
+    | REVISION_FLAG_ELLIPSIS
+    | REVISION_FLAG_EXTSTORED
+    | REVISION_FLAG_HASCOPIESINFO;
+
 #[derive(derive_more::From)]
 pub enum RevlogError {
     InvalidRevision,
@@ -282,6 +294,7 @@ 
             },
             p1: index_entry.p1(),
             p2: index_entry.p2(),
+            flags: index_entry.flags(),
             hash: *index_entry.hash(),
         };
         Ok(entry)
@@ -309,6 +322,7 @@ 
     base_rev_or_base_of_delta_chain: Option<Revision>,
     p1: Revision,
     p2: Revision,
+    flags: u16,
     hash: Node,
 }
 
@@ -321,6 +335,20 @@ 
         u32::try_from(self.uncompressed_len).ok()
     }
 
+    pub fn has_p1(&self) -> bool {
+        self.p1 != NULL_REVISION
+    }
+
+    pub fn is_cencored(&self) -> bool {
+        (self.flags & REVISION_FLAG_CENSORED) != 0
+    }
+
+    pub fn has_length_affecting_flag_processor(&self) -> bool {
+        // Relevant Python code: revlog.size()
+        // note: ELLIPSIS is known to not change the content
+        (self.flags & (REVIDX_KNOWN_FLAGS ^ REVISION_FLAG_ELLIPSIS)) != 0
+    }
+
     /// The data for this entry, after resolving deltas if any.
     pub fn data(&self) -> Result<Cow<'a, [u8]>, HgError> {
         let mut entry = self.clone();
diff --git a/rust/hg-core/src/revlog/index.rs b/rust/hg-core/src/revlog/index.rs
--- a/rust/hg-core/src/revlog/index.rs
+++ b/rust/hg-core/src/revlog/index.rs
@@ -260,6 +260,10 @@ 
         }
     }
 
+    pub fn flags(&self) -> u16 {
+        BigEndian::read_u16(&self.bytes[6..=7])
+    }
+
     /// Return the compressed length of the data.
     pub fn compressed_len(&self) -> u32 {
         BigEndian::read_u32(&self.bytes[8..=11])
diff --git a/rust/hg-core/src/revlog/filelog.rs b/rust/hg-core/src/revlog/filelog.rs
--- a/rust/hg-core/src/revlog/filelog.rs
+++ b/rust/hg-core/src/revlog/filelog.rs
@@ -73,6 +73,89 @@ 
 pub struct FilelogEntry<'a>(RevlogEntry<'a>);
 
 impl FilelogEntry<'_> {
+    /// `self.data()` can be expensive, with decompression and delta
+    /// resolution.
+    ///
+    /// *Without* paying this cost, based on revlog index information
+    /// including `RevlogEntry::uncompressed_len`:
+    ///
+    /// * Returns `true` if the length that `self.data().file_data().len()`
+    ///   would return is definitely **not equal** to `other_len`.
+    /// * Returns `false` if available information is inconclusive.
+    pub fn file_data_len_not_equal_to(&self, other_len: u64) -> bool {
+        // Relevant code that implement this behavior in Python code:
+        // basefilectx.cmp, filelog.size, storageutil.filerevisioncopied,
+        // revlog.size, revlog.rawsize
+
+        // Let’s call `file_data_len` what would be returned by
+        // `self.data().file_data().len()`.
+
+        if self.0.is_cencored() {
+            let file_data_len = 0;
+            return other_len != file_data_len;
+        }
+
+        if self.0.has_length_affecting_flag_processor() {
+            // We can’t conclude anything about `file_data_len`.
+            return false;
+        }
+
+        // Revlog revisions (usually) have metadata for the size of
+        // their data after decompression and delta resolution
+        // as would be returned by `Revlog::get_rev_data`.
+        //
+        // For filelogs this is the file’s contents preceded by an optional
+        // metadata block.
+        let uncompressed_len = if let Some(l) = self.0.uncompressed_len() {
+            l as u64
+        } else {
+            // The field was set to -1, the actual uncompressed len is unknown.
+            // We need to decompress to say more.
+            return false;
+        };
+        // `uncompressed_len = file_data_len + optional_metadata_len`,
+        // so `file_data_len <= uncompressed_len`.
+        if uncompressed_len < other_len {
+            // Transitively, `file_data_len < other_len`.
+            // So `other_len != file_data_len` definitely.
+            return true;
+        }
+
+        if uncompressed_len == other_len + 4 {
+            // It’s possible that `file_data_len == other_len` with an empty
+            // metadata block (2 start marker bytes + 2 end marker bytes).
+            // This happens when there wouldn’t otherwise be metadata, but
+            // the first 2 bytes of file data happen to match a start marker
+            // and would be ambiguous.
+            return false;
+        }
+
+        if !self.0.has_p1() {
+            // There may or may not be copy metadata, so we can’t deduce more
+            // about `file_data_len` without computing file data.
+            return false;
+        }
+
+        // Filelog ancestry is not meaningful in the way changelog ancestry is.
+        // It only provides hints to delta generation.
+        // p1 and p2 are set to null when making a copy or rename since
+        // contents are likely unrelatedto what might have previously existed
+        // at the destination path.
+        //
+        // Conversely, since here p1 is non-null, there is no copy metadata.
+        // Note that this reasoning may be invalidated in the presence of
+        // merges made by some previous versions of Mercurial that
+        // swapped p1 and p2. See <https://bz.mercurial-scm.org/show_bug.cgi?id=6528>
+        // and `tests/test-issue6528.t`.
+        //
+        // Since copy metadata is currently the only kind of metadata
+        // kept in revlog data of filelogs,
+        // this `FilelogEntry` does not have such metadata:
+        let file_data_len = uncompressed_len;
+
+        return file_data_len != other_len;
+    }
+
     pub fn data(&self) -> Result<FilelogRevisionData, HgError> {
         Ok(FilelogRevisionData(self.0.data()?.into_owned()))
     }
diff --git a/mercurial/revlogutils/flagutil.py b/mercurial/revlogutils/flagutil.py
--- a/mercurial/revlogutils/flagutil.py
+++ b/mercurial/revlogutils/flagutil.py
@@ -32,6 +32,7 @@ 
 REVIDX_FLAGS_ORDER
 REVIDX_RAWTEXT_CHANGING_FLAGS
 
+# Keep this in sync with REVIDX_KNOWN_FLAGS in rust/hg-core/src/revlog/revlog.rs
 REVIDX_KNOWN_FLAGS = util.bitsfrom(REVIDX_FLAGS_ORDER)
 
 # Store flag processors (cf. 'addflagprocessor()' to register)