Patchwork D11518: dirstate-v2: Initial Python parser

login
register
mail settings
Submitter phabricator
Date Oct. 1, 2021, 7:18 a.m.
Message ID <differential-rev-PHID-DREV-4tropwwnfsy5uwk4plqv-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/49843/
State Superseded
Headers show

Comments

phabricator - Oct. 1, 2021, 7:18 a.m.
SimonSapin created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  The dirstate-v2 file format should be supported even if Rust extensions are
  not enabled. This changeset adds parsing code that is not used yet.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D11518

AFFECTED FILES
  mercurial/dirstatemap.py
  mercurial/dirstateutils/docket.py
  mercurial/dirstateutils/v2.py
  rust/hg-core/src/dirstate_tree/on_disk.rs

CHANGE DETAILS




To: SimonSapin, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/rust/hg-core/src/dirstate_tree/on_disk.rs b/rust/hg-core/src/dirstate_tree/on_disk.rs
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs
@@ -47,16 +47,16 @@ 
 pub(super) const IGNORE_PATTERNS_HASH_LEN: usize = 20;
 pub(super) type IgnorePatternsHash = [u8; IGNORE_PATTERNS_HASH_LEN];
 
-/// Must match the constant of the same name in
-/// `mercurial/dirstateutils/docket.py`
+/// Must match constants of the same names in `mercurial/dirstateutils/v2.py`
 const TREE_METADATA_SIZE: usize = 44;
+const NODE_SIZE: usize = 43;
 
 /// Make sure that size-affecting changes are made knowingly
 #[allow(unused)]
 fn static_assert_size_of() {
     let _ = std::mem::transmute::<TreeMetadata, [u8; TREE_METADATA_SIZE]>;
     let _ = std::mem::transmute::<DocketHeader, [u8; TREE_METADATA_SIZE + 81]>;
-    let _ = std::mem::transmute::<Node, [u8; 43]>;
+    let _ = std::mem::transmute::<Node, [u8; NODE_SIZE]>;
 }
 
 // Must match `HEADER` in `mercurial/dirstateutils/docket.py`
@@ -169,8 +169,8 @@ 
 #[repr(C)]
 struct Entry {
     mode: I32Be,
+    size: I32Be,
     mtime: I32Be,
-    size: I32Be,
 }
 
 /// Duration since the Unix epoch
diff --git a/mercurial/dirstateutils/v2.py b/mercurial/dirstateutils/v2.py
new file mode 100644
--- /dev/null
+++ b/mercurial/dirstateutils/v2.py
@@ -0,0 +1,106 @@ 
+# v2.py - Pure-Python implementation of the dirstate-v2 file format
+#
+# Copyright Mercurial Contributors
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+from __future__ import absolute_import
+
+import struct
+
+from .. import policy
+
+parsers = policy.importmod('parsers')
+DirstateItem = parsers.DirstateItem
+
+
+# Must match the constants of the same names in
+# `rust/hg-core/src/dirstate_tree/on_disk.rs`
+TREE_METADATA_SIZE = 44
+NODE_SIZE = 43
+
+
+# Must match the `TreeMetadata` Rust struct in
+# `rust/hg-core/src/dirstate_tree/on_disk.rs`. See doc-comments there.
+#
+# * 4 bytes: start offset of root nodes
+# * 4 bytes: number of root nodes
+# * 4 bytes: total number of nodes in the tree that have an entry
+# * 4 bytes: total number of nodes in the tree that have a copy source
+# * 4 bytes: number of bytes in the data file that are not used anymore
+# * 4 bytes: unused
+# * 20 bytes: SHA-1 hash of ignore patterns
+TREE_METADATA = struct.Struct('>LLLLL4s20s')
+
+
+# Must match the `Node` Rust struct in
+# `rust/hg-core/src/dirstate_tree/on_disk.rs`. See doc-comments there.
+#
+# * 4 bytes: start offset of full path
+# * 2 bytes: length of the full path
+# * 2 bytes: length within the full path before its "base name"
+# * 4 bytes: start offset of the copy source if any, or zero for no copy source
+# * 2 bytes: length of the copy source if any, or unused
+# * 4 bytes: start offset of child nodes
+# * 4 bytes: number of child nodes
+# * 4 bytes: number of descendant nodes that have an entry
+# * 4 bytes: number of descendant nodes that have a "tracked" state
+# * 1 byte: state
+# * 4 bytes: entry mode
+# * 4 bytes: entry size
+# * 4 bytes: entry mtime
+NODE = struct.Struct('>LHHLHLLLLclll')
+
+
+assert TREE_METADATA_SIZE == TREE_METADATA.size
+assert NODE_SIZE == NODE.size
+
+
+def parse_dirstate(map, copy_map, data, tree_metadata):
+    (
+        root_nodes_start,
+        root_nodes_len,
+        _nodes_with_entry_count,
+        _nodes_with_copy_source_count,
+        _unreachable_bytes,
+        _unused,
+        _ignore_patterns_hash,
+    ) = TREE_METADATA.unpack(tree_metadata)
+    parse_nodes(map, copy_map, data, root_nodes_start, root_nodes_len)
+
+
+def parse_nodes(map, copy_map, data, start, len):
+    for i in range(len):
+        node_start = start + NODE_SIZE * i
+        node_bytes = slice_with_len(data, node_start, NODE_SIZE)
+        (
+            path_start,
+            path_len,
+            _basename_start,
+            copy_source_start,
+            copy_source_len,
+            children_start,
+            children_count,
+            _descendants_with_entry_count,
+            _tracked_descendants_count,
+            state,
+            mode,
+            size,
+            mtime,
+        ) = NODE.unpack(node_bytes)
+        # Recurse
+        parse_nodes(map, copy_map, data, children_start, children_count)
+
+        if state not in b'narm':
+            continue
+        path = slice_with_len(data, path_start, path_len)
+        map[path] = DirstateItem.from_v1_data(state, mode, size, mtime)
+        if copy_source_start:
+            copy_map[path] = slice_with_len(
+                data, copy_source_start, copy_source_len
+            )
+
+
+def slice_with_len(data, start, len):
+    return data[start : start + len]
diff --git a/mercurial/dirstateutils/docket.py b/mercurial/dirstateutils/docket.py
--- a/mercurial/dirstateutils/docket.py
+++ b/mercurial/dirstateutils/docket.py
@@ -10,14 +10,10 @@ 
 import struct
 
 from ..revlogutils import docket as docket_mod
-
+from . import v2
 
 V2_FORMAT_MARKER = b"dirstate-v2\n"
 
-# Must match the constant of the same name in
-# `rust/hg-core/src/dirstate_tree/on_disk.rs`
-TREE_METADATA_SIZE = 44
-
 # * 12 bytes: format marker
 # * 32 bytes: node ID of the working directory's first parent
 # * 32 bytes: node ID of the working directory's second parent
@@ -29,7 +25,7 @@ 
 # Node IDs are null-padded if shorter than 32 bytes.
 # A data file shorter than the specified used size is corrupted (truncated)
 HEADER = struct.Struct(
-    ">{}s32s32sL{}sB".format(len(V2_FORMAT_MARKER), TREE_METADATA_SIZE)
+    ">{}s32s32sL{}sB".format(len(V2_FORMAT_MARKER), v2.TREE_METADATA_SIZE)
 )
 
 
diff --git a/mercurial/dirstatemap.py b/mercurial/dirstatemap.py
--- a/mercurial/dirstatemap.py
+++ b/mercurial/dirstatemap.py
@@ -20,6 +20,7 @@ 
 
 from .dirstateutils import (
     docket as docketmod,
+    v2,
 )
 
 parsers = policy.importmod('parsers')