Patchwork D10005: rust: Rewrite dirstate parsing usin the `bytes-cast` crate

login
register
mail settings
Submitter phabricator
Date Feb. 17, 2021, 12:59 p.m.
Message ID <differential-rev-PHID-DREV-jxy4msrjwwbqyegwzjht-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/48318/
State Superseded
Headers show

Comments

phabricator - Feb. 17, 2021, 12:59 p.m.
SimonSapin created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D10005

AFFECTED FILES
  rust/hg-core/src/dirstate.rs
  rust/hg-core/src/dirstate/dirstate_map.rs
  rust/hg-core/src/dirstate/parsers.rs

CHANGE DETAILS




To: SimonSapin, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/rust/hg-core/src/dirstate/parsers.rs b/rust/hg-core/src/dirstate/parsers.rs
--- a/rust/hg-core/src/dirstate/parsers.rs
+++ b/rust/hg-core/src/dirstate/parsers.rs
@@ -6,13 +6,13 @@ 
 use crate::errors::HgError;
 use crate::utils::hg_path::HgPath;
 use crate::{
-    dirstate::{CopyMap, EntryState, StateMap},
+    dirstate::{CopyMap, EntryState, RawEntry, StateMap},
     DirstateEntry, DirstateParents,
 };
-use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
+use byteorder::{BigEndian, WriteBytesExt};
+use bytes_cast::BytesCast;
 use micro_timer::timed;
 use std::convert::{TryFrom, TryInto};
-use std::io::Cursor;
 use std::time::Duration;
 
 /// Parents are stored in the dirstate as byte hashes.
@@ -21,65 +21,45 @@ 
 const MIN_ENTRY_SIZE: usize = 17;
 
 type ParseResult<'a> = (
-    DirstateParents,
+    &'a DirstateParents,
     Vec<(&'a HgPath, DirstateEntry)>,
     Vec<(&'a HgPath, &'a HgPath)>,
 );
 
 #[timed]
-pub fn parse_dirstate(contents: &[u8]) -> Result<ParseResult, HgError> {
-    if contents.len() < PARENT_SIZE * 2 {
-        return Err(HgError::corrupted("Too little data for dirstate."));
-    }
-    let mut copies = vec![];
-    let mut entries = vec![];
+pub fn parse_dirstate(mut contents: &[u8]) -> Result<ParseResult, HgError> {
+    let mut copies = Vec::new();
+    let mut entries = Vec::new();
 
-    let mut curr_pos = PARENT_SIZE * 2;
-    let parents = DirstateParents {
-        p1: contents[..PARENT_SIZE].try_into().unwrap(),
-        p2: contents[PARENT_SIZE..curr_pos].try_into().unwrap(),
-    };
+    let (parents, rest) = DirstateParents::from_bytes(contents)
+        .map_err(|_| HgError::corrupted("Too little data for dirstate."))?;
+    contents = rest;
+    while !contents.is_empty() {
+        let (raw_entry, rest) = RawEntry::from_bytes(contents)
+            .map_err(|_| HgError::corrupted("Overflow in dirstate."))?;
 
-    while curr_pos < contents.len() {
-        if curr_pos + MIN_ENTRY_SIZE > contents.len() {
-            return Err(HgError::corrupted("Overflow in dirstate."));
-        }
-        let entry_bytes = &contents[curr_pos..];
+        let entry = DirstateEntry {
+            state: EntryState::try_from(raw_entry.state)?,
+            mode: raw_entry.mode.get(),
+            mtime: raw_entry.mtime.get(),
+            size: raw_entry.size.get(),
+        };
+        let (paths, rest) =
+            u8::slice_from_bytes(rest, raw_entry.length.get() as usize)
+                .map_err(|_| HgError::corrupted("Overflow in dirstate."))?;
 
-        let mut cursor = Cursor::new(entry_bytes);
-        // Unwraping errors from `byteorder` as we’ve already checked
-        // `MIN_ENTRY_SIZE` so the input should never be too short.
-        let state = EntryState::try_from(cursor.read_u8().unwrap())?;
-        let mode = cursor.read_i32::<BigEndian>().unwrap();
-        let size = cursor.read_i32::<BigEndian>().unwrap();
-        let mtime = cursor.read_i32::<BigEndian>().unwrap();
-        let path_len = cursor.read_i32::<BigEndian>().unwrap() as usize;
-
-        if path_len > contents.len() - curr_pos {
-            return Err(HgError::corrupted("Overflow in dirstate."));
+        // `paths` is either a single path, or two paths separated by a NULL
+        // byte
+        let mut iter = paths.splitn(2, |&byte| byte == b'\0');
+        let path = HgPath::new(
+            iter.next().expect("splitn always yields at least one item"),
+        );
+        if let Some(copy_source) = iter.next() {
+            copies.push((path, HgPath::new(copy_source)));
         }
 
-        // Slice instead of allocating a Vec needed for `read_exact`
-        let path = &entry_bytes[MIN_ENTRY_SIZE..MIN_ENTRY_SIZE + (path_len)];
-
-        let (path, copy) = match memchr::memchr(0, path) {
-            None => (path, None),
-            Some(i) => (&path[..i], Some(&path[(i + 1)..])),
-        };
-
-        if let Some(copy_path) = copy {
-            copies.push((HgPath::new(path), HgPath::new(copy_path)));
-        };
-        entries.push((
-            HgPath::new(path),
-            DirstateEntry {
-                state,
-                mode,
-                size,
-                mtime,
-            },
-        ));
-        curr_pos = curr_pos + MIN_ENTRY_SIZE + (path_len);
+        entries.push((path, entry));
+        contents = rest;
     }
     Ok((parents, entries, copies))
 }
@@ -374,7 +354,7 @@ 
             .collect();
 
         assert_eq!(
-            (parents, state_map, copymap),
+            (&parents, state_map, copymap),
             (new_parents, new_state_map, new_copy_map)
         )
     }
@@ -452,7 +432,7 @@ 
             .collect();
 
         assert_eq!(
-            (parents, state_map, copymap),
+            (&parents, state_map, copymap),
             (new_parents, new_state_map, new_copy_map)
         )
     }
@@ -499,7 +479,7 @@ 
 
         assert_eq!(
             (
-                parents,
+                &parents,
                 [(
                     HgPathBuf::from_bytes(b"f1"),
                     DirstateEntry {
diff --git a/rust/hg-core/src/dirstate/dirstate_map.rs b/rust/hg-core/src/dirstate/dirstate_map.rs
--- a/rust/hg-core/src/dirstate/dirstate_map.rs
+++ b/rust/hg-core/src/dirstate/dirstate_map.rs
@@ -386,10 +386,10 @@ 
     }
 
     #[timed]
-    pub fn read(
+    pub fn read<'a>(
         &mut self,
-        file_contents: &[u8],
-    ) -> Result<Option<DirstateParents>, DirstateError> {
+        file_contents: &'a [u8],
+    ) -> Result<Option<&'a DirstateParents>, DirstateError> {
         if file_contents.is_empty() {
             return Ok(None);
         }
diff --git a/rust/hg-core/src/dirstate.rs b/rust/hg-core/src/dirstate.rs
--- a/rust/hg-core/src/dirstate.rs
+++ b/rust/hg-core/src/dirstate.rs
@@ -7,6 +7,7 @@ 
 
 use crate::errors::HgError;
 use crate::{utils::hg_path::HgPathBuf, FastHashMap};
+use bytes_cast::{unaligned, BytesCast};
 use std::collections::hash_map;
 use std::convert::TryFrom;
 
@@ -17,7 +18,8 @@ 
 pub mod parsers;
 pub mod status;
 
-#[derive(Debug, PartialEq, Clone)]
+#[derive(Debug, PartialEq, Clone, BytesCast)]
+#[repr(C)]
 pub struct DirstateParents {
     pub p1: [u8; 20],
     pub p2: [u8; 20],
@@ -34,6 +36,16 @@ 
     pub size: i32,
 }
 
+#[derive(BytesCast)]
+#[repr(C)]
+struct RawEntry {
+    state: u8,
+    mode: unaligned::I32Be,
+    size: unaligned::I32Be,
+    mtime: unaligned::I32Be,
+    length: unaligned::I32Be,
+}
+
 /// A `DirstateEntry` with a size of `-2` means that it was merged from the
 /// other parent. This allows revert to pick the right status back during a
 /// merge.