Patchwork D11389: rhg: Port Python’s `ui.configlist` as `Config::get_list`

login
register
mail settings
Submitter phabricator
Date Sept. 3, 2021, 2:38 p.m.
Message ID <differential-rev-PHID-DREV-pofuppwucplgrq5n2irg-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/49706/
State Superseded
Headers show

Comments

phabricator - Sept. 3, 2021, 2:38 p.m.
SimonSapin created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  This new method is not used yet outside of its own unit tests,
  so this changeset should make no observable change.
  
  The Rust parser implementation attempts to exactly replicate the behavior of
  the Python one, even in edge cases where that behavior is… surprising.
  New unit tests capture some of these edge cases.
  
  This started as a line-by-line port. The main changes are:
  
  - Pass around a parser mode enum instead of parser functions
  - Inline the whole parser into one function
  - Use `[u8]::get` which returns an `Option`, instead of indexing  after explicitly checking the length.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D11389

AFFECTED FILES
  rust/hg-core/src/config/config.rs
  rust/hg-core/src/config/values.rs
  tests/test-config-parselist.t

CHANGE DETAILS




To: SimonSapin, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/tests/test-config-parselist.t b/tests/test-config-parselist.t
new file mode 100644
--- /dev/null
+++ b/tests/test-config-parselist.t
@@ -0,0 +1,76 @@ 
+List-valued configuration keys have an ad-hoc microsyntax. From `hg help config`:
+
+> List values are separated by whitespace or comma, except when values are
+> placed in double quotation marks:
+>
+>     allow_read = "John Doe, PhD", brian, betty
+>
+> Quotation marks can be escaped by prefixing them with a backslash. Only
+> quotation marks at the beginning of a word is counted as a quotation
+> (e.g., ``foo"bar baz`` is the list of ``foo"bar`` and ``baz``).
+
+That help documentation is fairly light on details, the actual parser has many 
+other edge cases. This test tries to cover them.
+
+
+Somehow, one backslash-escaping level seems to be removed
+between >>> blocks here and the Python parser:
+
+  >>> print(b'\\"'.decode('ascii'))
+  "
+  >>> print(b'\\\\"'.decode('ascii'))
+  \"
+  >>> print(rb'\\"'.decode('ascii'))
+  \"
+
+
+Keep these Python tests in sync with the Rust ones in `rust/hg-core/src/config/values.rs`
+
+  >>> from mercurial.utils.stringutil import parselist
+  >>> for input in [
+  ...    b'',
+  ...    b',',
+  ...    b'A',
+  ...    b'B,B',
+  ...    b', C, ,C,',
+  ...    b'"',
+  ...    b'""',
+  ...    b'D,"',
+  ...    b'E,""',
+  ...    b'"F,F"',
+  ...    b'"G,G',
+  ...    b'"H \\\\",\\\\"H',
+  ...    b'I,I"',
+  ...    b'J,"J',
+  ...    b'K K',
+  ...    b'"K" K',
+  ...    b'L\\tL',
+  ...    b'"L"\\tL',
+  ...    b'M\\x0bM',
+  ...    b'"M"\\x0bM',
+  ...    b'"N"  , ,"',
+  ...    b'" ,O,  ',
+  ... ]:
+  ...     print(repr(parselist(input)))
+  []
+  []
+  [b'A']
+  [b'B', b'B']
+  [b'C', b'C']
+  [b'"']
+  [b'', b'']
+  [b'D', b'"']
+  [b'E', b'', b'']
+  [b'F,F']
+  [b'"G', b'G']
+  [b'"H', b',', b'H']
+  [b'I', b'I"']
+  [b'J', b'"J']
+  [b'K', b'K']
+  [b'K', b'K']
+  [b'L', b'L']
+  [b'L', b'', b'L']
+  [b'M', b'M']
+  [b'M', b'', b'M']
+  [b'N"']
+  [b'"', b'O']
diff --git a/rust/hg-core/src/config/values.rs b/rust/hg-core/src/config/values.rs
--- a/rust/hg-core/src/config/values.rs
+++ b/rust/hg-core/src/config/values.rs
@@ -8,6 +8,8 @@ 
 //! details about where the value came from (but omits details of what’s
 //! invalid inside the value).
 
+use crate::utils::SliceExt;
+
 pub(super) fn parse_bool(v: &[u8]) -> Option<bool> {
     match v.to_ascii_lowercase().as_slice() {
         b"1" | b"yes" | b"true" | b"on" | b"always" => Some(true),
@@ -42,6 +44,216 @@ 
     value.parse().ok()
 }
 
+/// Parse a config value as a list of sub-values.
+///
+/// Ported from `parselist` in `mercurial/utils/stringutil.py`
+
+// Note: keep behavior in sync with the Python one.
+
+// Note: this could return `Vec<Cow<[u8]>>` instead and borrow `input` when
+// possible (when there’s no backslash-escapes) but this is probably not worth
+// the complexity as config is presumably not accessed inside
+// preformance-sensitive loops.
+pub(super) fn parse_list(input: &[u8]) -> Vec<Vec<u8>> {
+    // Port of Python’s `value.lstrip(b' ,\n')`
+    // TODO: is this really what we want?
+    let input =
+        input.trim_start_matches(|b| b == b' ' || b == b',' || b == b'\n');
+    parse_list_without_trim_start(input)
+}
+
+fn parse_list_without_trim_start(input: &[u8]) -> Vec<Vec<u8>> {
+    // Start of port of Python’s `_configlist`
+    let input = input.trim_end_matches(|b| b == b' ' || b == b',');
+    if input.is_empty() {
+        return Vec::new();
+    }
+
+    // Just to make “a string” less confusable with “a list of strings”.
+    type ByteString = Vec<u8>;
+
+    // These correspond to Python’s…
+    let mut mode = ParserMode::Plain; // `parser`
+    let mut values = Vec::new(); // `parts[:-1]`
+    let mut next_value = ByteString::new(); // `parts[-1]`
+    let mut offset = 0; // `offset`
+
+    // Setting `parser` to `None` is instead handled by returning immediately
+    enum ParserMode {
+        Plain,
+        Quoted,
+    }
+
+    loop {
+        match mode {
+            ParserMode::Plain => {
+                // Start of port of Python’s `_parse_plain`
+                let mut whitespace = false;
+                while let Some(&byte) = input.get(offset) {
+                    if is_space(byte) || byte == b',' {
+                        whitespace = true;
+                        offset += 1;
+                    } else {
+                        break;
+                    }
+                }
+                if let Some(&byte) = input.get(offset) {
+                    if whitespace {
+                        values.push(std::mem::take(&mut next_value))
+                    }
+                    if byte == b'"' && next_value.is_empty() {
+                        mode = ParserMode::Quoted;
+                    } else {
+                        if byte == b'"' && next_value.ends_with(b"\\") {
+                            next_value.pop();
+                        }
+                        next_value.push(byte);
+                    }
+                    offset += 1;
+                } else {
+                    values.push(next_value);
+                    return values;
+                }
+            }
+            ParserMode::Quoted => {
+                // Start of port of Python’s `_parse_quote`
+                if let Some(&byte) = input.get(offset) {
+                    if byte == b'"' {
+                        // The input contains a quoted zero-length value `""`
+                        debug_assert_eq!(next_value, b"");
+                        values.push(std::mem::take(&mut next_value));
+                        offset += 1;
+                        while let Some(&byte) = input.get(offset) {
+                            if is_space(byte) || byte == b',' {
+                                offset += 1;
+                            } else {
+                                break;
+                            }
+                        }
+                        mode = ParserMode::Plain;
+                        continue;
+                    }
+                }
+
+                while let Some(&byte) = input.get(offset) {
+                    if byte == b'"' {
+                        break;
+                    }
+                    if byte == b'\\' && input.get(offset + 1) == Some(&b'"') {
+                        next_value.push(b'"');
+                        offset += 2;
+                    } else {
+                        next_value.push(byte);
+                        offset += 1;
+                    }
+                }
+
+                if offset >= input.len() {
+                    // We didn’t find a closing double-quote,
+                    // so treat the opening one as part of an unquoted value
+                    // instead of delimiting the start of a quoted value.
+
+                    // `next_value` may have had some backslash-escapes
+                    // unescaped. TODO: shouldn’t we use a slice of `input`
+                    // instead?
+                    let mut real_values =
+                        parse_list_without_trim_start(&next_value);
+
+                    if let Some(first) = real_values.first_mut() {
+                        first.insert(0, b'"');
+                        // Drop `next_value`
+                        values.extend(real_values)
+                    } else {
+                        next_value.push(b'"');
+                        values.push(next_value);
+                    }
+                    return values;
+                }
+
+                // We’re not at the end of the input, which means the `while`
+                // loop above ended at at double quote. Skip
+                // over that.
+                offset += 1;
+
+                while let Some(&byte) = input.get(offset) {
+                    if byte == b' ' || byte == b',' {
+                        offset += 1;
+                    } else {
+                        break;
+                    }
+                }
+
+                if offset >= input.len() {
+                    values.push(next_value);
+                    return values;
+                }
+
+                if offset + 1 == input.len() && input[offset] == b'"' {
+                    next_value.push(b'"');
+                    offset += 1;
+                } else {
+                    values.push(std::mem::take(&mut next_value));
+                }
+
+                mode = ParserMode::Plain;
+            }
+        }
+    }
+
+    // https://docs.python.org/3/library/stdtypes.html?#bytes.isspace
+    fn is_space(byte: u8) -> bool {
+        if let b' ' | b'\t' | b'\n' | b'\r' | b'\x0b' | b'\x0c' = byte {
+            true
+        } else {
+            false
+        }
+    }
+}
+
+#[test]
+fn test_parse_list() {
+    // Make `assert_eq` error messages nicer
+    fn as_strings(values: &[Vec<u8>]) -> Vec<String> {
+        values
+            .iter()
+            .map(|v| std::str::from_utf8(v.as_ref()).unwrap().to_owned())
+            .collect()
+    }
+    macro_rules! assert_parse_list {
+        ( $input: expr => [ $( $output: expr ),* ] ) => {
+            assert_eq!(
+                as_strings(&parse_list($input)),
+                as_strings(&[ $( Vec::from(&$output[..]) ),* ]),
+            );
+        }
+    }
+
+    // Keep these Rust tests in sync with the Python ones in
+    // `tests/test-config-parselist.t`
+    assert_parse_list!(b"" => []);
+    assert_parse_list!(b"," => []);
+    assert_parse_list!(b"A" => [b"A"]);
+    assert_parse_list!(b"B,B" => [b"B", b"B"]);
+    assert_parse_list!(b", C, ,C," => [b"C", b"C"]);
+    assert_parse_list!(b"\"" => [b"\""]);
+    assert_parse_list!(b"\"\"" => [b"", b""]);
+    assert_parse_list!(b"D,\"" => [b"D", b"\""]);
+    assert_parse_list!(b"E,\"\"" => [b"E", b"", b""]);
+    assert_parse_list!(b"\"F,F\"" => [b"F,F"]);
+    assert_parse_list!(b"\"G,G" => [b"\"G", b"G"]);
+    assert_parse_list!(b"\"H \\\",\\\"H" => [b"\"H", b",", b"H"]);
+    assert_parse_list!(b"I,I\"" => [b"I", b"I\""]);
+    assert_parse_list!(b"J,\"J" => [b"J", b"\"J"]);
+    assert_parse_list!(b"K K" => [b"K", b"K"]);
+    assert_parse_list!(b"\"K\" K" => [b"K", b"K"]);
+    assert_parse_list!(b"L\tL" => [b"L", b"L"]);
+    assert_parse_list!(b"\"L\"\tL" => [b"L", b"", b"L"]);
+    assert_parse_list!(b"M\x0bM" => [b"M", b"M"]);
+    assert_parse_list!(b"\"M\"\x0bM" => [b"M", b"", b"M"]);
+    assert_parse_list!(b"\"N\"  , ,\"" => [b"N\""]);
+    assert_parse_list!(b"\" ,O,  " => [b"\"", b"O"]);
+}
+
 #[test]
 fn test_parse_byte_size() {
     assert_eq!(parse_byte_size(b""), None);
diff --git a/rust/hg-core/src/config/config.rs b/rust/hg-core/src/config/config.rs
--- a/rust/hg-core/src/config/config.rs
+++ b/rust/hg-core/src/config/config.rs
@@ -388,6 +388,16 @@ 
         })
     }
 
+    /// If there is an `item` value in `section`, parse and return a list of
+    /// byte strings.
+    pub fn get_list(
+        &self,
+        section: &[u8],
+        item: &[u8],
+    ) -> Option<Vec<Vec<u8>>> {
+        self.get(section, item).map(values::parse_list)
+    }
+
     /// Returns the raw value bytes of the first one found, or `None`.
     pub fn get(&self, section: &[u8], item: &[u8]) -> Option<&[u8]> {
         self.get_inner(section, item)