Patchwork D6485: rust-filepatterns: use bytes instead of String

login
register
mail settings
Submitter phabricator
Date June 6, 2019, 3:24 p.m.
Message ID <differential-rev-PHID-DREV-so2qszjxlz3htvaaknv3-req@phab.mercurial-scm.org>
Download mbox | patch
Permalink /patch/40336/
State Superseded
Headers show

Comments

phabricator - June 6, 2019, 3:24 p.m.
Alphare created this revision.
Herald added subscribers: mercurial-devel, kevincox, durin42.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  In my initial patch, I introduced an unnecessary hard constraint on UTF-8
  filenames and patterns which I forgot to remove. Although the performance
   penalty for using String might be negligible, we don't want to break
  compatibility with non-UTF-8 encodings for no reason.
  Moreover, this change allows for a cleaner Rust core API.
  
  This patch introduces a new utils module that is used with this fix.
  
  Finally, PatternError was not put inside the Python module generated by
  Rust, which would have raised a NameError.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D6485

AFFECTED FILES
  rust/hg-core/src/filepatterns.rs
  rust/hg-core/src/lib.rs
  rust/hg-core/src/utils/files.rs
  rust/hg-core/src/utils/mod.rs
  rust/hg-cpython/src/filepatterns.rs

CHANGE DETAILS




To: Alphare, #hg-reviewers
Cc: durin42, kevincox, mercurial-devel

Patch

diff --git a/rust/hg-cpython/src/filepatterns.rs b/rust/hg-cpython/src/filepatterns.rs
--- a/rust/hg-cpython/src/filepatterns.rs
+++ b/rust/hg-cpython/src/filepatterns.rs
@@ -11,14 +11,10 @@ 
 //! and can be used as replacement for the the pure `filepatterns` Python module.
 //!
 use cpython::{
-    exc, PyDict, PyErr, PyModule, PyResult, PyString, PyTuple, Python,
-    ToPyObject,
+    PyBytes, PyDict, PyModule, PyObject, PyResult, PyTuple, Python, ToPyObject,
 };
-use hg::{build_single_regex, read_pattern_file, PatternTuple};
-use exceptions::{
-    PatternError,
-    PatternFileError,
-};
+use exceptions::{PatternError, PatternFileError};
+use hg::{build_single_regex, read_pattern_file, LineNumber, PatternTuple};
 
 /// Rust does not like functions with different return signatures.
 /// The 3-tuple version is always returned by the hg-core function,
@@ -32,17 +28,22 @@ 
 /// for more details.
 fn read_pattern_file_wrapper(
     py: Python,
-    file_path: String,
+    file_path: PyObject,
     warn: bool,
     source_info: bool,
 ) -> PyResult<PyTuple> {
-    match read_pattern_file(file_path, warn) {
+    match read_pattern_file(file_path.extract::<PyBytes>(py)?.data(py), warn) {
         Ok((patterns, warnings)) => {
             if source_info {
-                return Ok((patterns, warnings).to_py_object(py));
+                let itemgetter = |x: &PatternTuple| {
+                    (PyBytes::new(py, &x.0), x.1, PyBytes::new(py, &x.2))
+                };
+                let results: Vec<(PyBytes, LineNumber, PyBytes)> =
+                    patterns.iter().map(itemgetter).collect();
+                return Ok((results, warnings).to_py_object(py));
             }
-            let itemgetter = |x: &PatternTuple| x.0.to_py_object(py);
-            let results: Vec<PyString> =
+            let itemgetter = |x: &PatternTuple| PyBytes::new(py, &x.0);
+            let results: Vec<PyBytes> =
                 patterns.iter().map(itemgetter).collect();
             Ok((results, warnings).to_py_object(py))
         }
@@ -52,22 +53,16 @@ 
 
 fn build_single_regex_wrapper(
     py: Python,
-    kind: String,
-    pat: String,
-    globsuffix: String,
-) -> PyResult<PyString> {
+    kind: PyObject,
+    pat: PyObject,
+    globsuffix: PyObject,
+) -> PyResult<PyBytes> {
     match build_single_regex(
-        kind.as_ref(),
-        pat.as_bytes(),
-        globsuffix.as_bytes(),
+        kind.extract::<PyBytes>(py)?.data(py),
+        pat.extract::<PyBytes>(py)?.data(py),
+        globsuffix.extract::<PyBytes>(py)?.data(py),
     ) {
-        Ok(regex) => match String::from_utf8(regex) {
-            Ok(regex) => Ok(regex.to_py_object(py)),
-            Err(e) => Err(PyErr::new::<exc::UnicodeDecodeError, _>(
-                py,
-                e.to_string(),
-            )),
-        },
+        Ok(regex) => Ok(PyBytes::new(py, &regex)),
         Err(e) => Err(PatternError::pynew(py, e)),
     }
 }
@@ -88,9 +83,9 @@ 
         py_fn!(
             py,
             build_single_regex_wrapper(
-                kind: String,
-                pat: String,
-                globsuffix: String
+                kind: PyObject,
+                pat: PyObject,
+                globsuffix: PyObject
             )
         ),
     )?;
@@ -100,13 +95,13 @@ 
         py_fn!(
             py,
             read_pattern_file_wrapper(
-                file_path: String,
+                file_path: PyObject,
                 warn: bool,
                 source_info: bool
             )
         ),
     )?;
-
+    m.add(py, "PatternError", py.get_type::<PatternError>())?;
     let sys = PyModule::import(py, "sys")?;
     let sys_modules: PyDict = sys.get(py, "modules")?.extract(py)?;
     sys_modules.set_item(py, dotted_name, &m)?;
diff --git a/rust/hg-core/src/utils/mod.rs b/rust/hg-core/src/utils/mod.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/utils/mod.rs
@@ -0,0 +1,45 @@ 
+pub mod files;
+
+pub fn replace_slice<T>(buf: &mut [T], from: &[T], to: &[T])
+where
+    T: Clone + PartialEq,
+{
+    if buf.len() < from.len() || from.len() != to.len() {
+        return;
+    }
+    for i in 0..=buf.len() - from.len() {
+        if buf[i..].starts_with(from) {
+            buf[i..(i + from.len())].clone_from_slice(to);
+        }
+    }
+}
+
+pub trait SliceExt {
+    fn trim(&self) -> &Self;
+    fn trim_end(&self) -> &Self;
+}
+
+fn is_not_whitespace(c: &u8) -> bool {
+    !(*c as char).is_whitespace()
+}
+
+impl SliceExt for [u8] {
+    fn trim(&self) -> &[u8] {
+        if let Some(first) = self.iter().position(is_not_whitespace) {
+            if let Some(last) = self.iter().rposition(is_not_whitespace) {
+                &self[first..last + 1]
+            } else {
+                unreachable!();
+            }
+        } else {
+            &[]
+        }
+    }
+    fn trim_end(&self) -> &[u8] {
+        if let Some(last) = self.iter().rposition(is_not_whitespace) {
+            &self[..last + 1]
+        } else {
+            &[]
+        }
+    }
+}
diff --git a/rust/hg-core/src/utils/files.rs b/rust/hg-core/src/utils/files.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/utils/files.rs
@@ -0,0 +1,17 @@ 
+use std::path::Path;
+
+pub fn get_path_from_bytes(bytes: &[u8]) -> &Path {
+    let os_str;
+    #[cfg(unix)]
+    {
+        use std::os::unix::ffi::OsStrExt;
+        os_str = std::ffi::OsStr::from_bytes(bytes);
+    }
+    #[cfg(windows)]
+    {
+        use std::os::windows::ffi::OsStrExt;
+        os_str = std::ffi::OsString::from_wide(bytes);
+    }
+
+    Path::new(os_str)
+}
diff --git a/rust/hg-core/src/lib.rs b/rust/hg-core/src/lib.rs
--- a/rust/hg-core/src/lib.rs
+++ b/rust/hg-core/src/lib.rs
@@ -19,6 +19,7 @@ 
     CopyVec, CopyVecEntry, DirstateEntry, DirstateParents, DirstateVec,
 };
 mod filepatterns;
+mod utils;
 
 pub use filepatterns::{
     build_single_regex, read_pattern_file, PatternSyntax, PatternTuple,
diff --git a/rust/hg-core/src/filepatterns.rs b/rust/hg-core/src/filepatterns.rs
--- a/rust/hg-core/src/filepatterns.rs
+++ b/rust/hg-core/src/filepatterns.rs
@@ -1,9 +1,11 @@ 
 use crate::{LineNumber, PatternError, PatternFileError};
-use regex::Regex;
+use regex::bytes::Regex;
 use std::collections::HashMap;
 use std::fs::File;
 use std::io::Read;
 use std::vec::Vec;
+use utils::files::get_path_from_bytes;
+use utils::{replace_slice, SliceExt};
 
 lazy_static! {
     static ref reescape: Vec<Vec<u8>> = {
@@ -192,11 +194,11 @@ 
 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
 /// that don't need to be transformed into a regex.
 pub fn build_single_regex(
-    kind: &str,
+    kind: &[u8],
     pat: &[u8],
     globsuffix: &[u8],
 ) -> Result<Vec<u8>, PatternError> {
-    let enum_kind = parse_pattern_syntax(kind.as_bytes())?;
+    let enum_kind = parse_pattern_syntax(kind)?;
     if enum_kind == PatternSyntax::RootGlob
         && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
     {
@@ -207,95 +209,97 @@ 
 }
 
 lazy_static! {
-    static ref SYNTAXES: HashMap<&'static str, &'static str> = {
+    static ref SYNTAXES: HashMap<&'static [u8], &'static [u8]> = {
         let mut m = HashMap::new();
 
-        m.insert("re", "relre:");
-        m.insert("regexp", "relre:");
-        m.insert("glob", "relglob:");
-        m.insert("rootglob", "rootglob:");
-        m.insert("include", "include");
-        m.insert("subinclude", "subinclude");
+        m.insert(b"re".as_ref(), b"relre:".as_ref());
+        m.insert(b"regexp".as_ref(), b"relre:".as_ref());
+        m.insert(b"glob".as_ref(), b"relglob:".as_ref());
+        m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
+        m.insert(b"include".as_ref(), b"include".as_ref());
+        m.insert(b"subinclude".as_ref(), b"subinclude".as_ref());
         m
     };
 }
 
-pub type PatternTuple = (String, LineNumber, String);
+pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>);
 type WarningTuple = (String, String);
 
 pub fn parse_pattern_file_contents(
-    lines: &str,
-    file_path: &str,
+    lines: &[u8],
+    file_path: &[u8],
     warn: bool,
 ) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
     let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
     let mut inputs: Vec<PatternTuple> = vec![];
     let mut warnings: Vec<WarningTuple> = vec![];
 
-    let mut current_syntax = "relre:";
+    let mut current_syntax = b"relre:".as_ref();
 
-    let mut line = String::new();
-    for (line_number, line_str) in lines.split('\n').enumerate() {
+    for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() {
         let line_number = line_number + 1;
-        line.replace_range(.., line_str);
 
-        if line.contains('#') {
-            if let Some(cap) = comment_regex.captures(line.clone().as_ref()) {
-                line = line[..cap.get(1).unwrap().end()].to_string()
+        if line.contains(&('#' as u8)) {
+            if let Some(cap) = comment_regex.captures(line) {
+                line = &line[..cap.get(1).unwrap().end()]
             }
-            line = line.replace(r"\#", "#");
+            let mut line = line.to_owned();
+            replace_slice(&mut line, br"\#", b"#");
         }
 
         let mut line = line.trim_end();
 
         if line.is_empty() {
             continue;
         }
 
-        if line.starts_with("syntax:") {
-            let syntax = line["syntax:".len()..].trim();
+        if line.starts_with(b"syntax:") {
+            let syntax = line[b"syntax:".len()..].trim();
 
             if let Some(rel_syntax) = SYNTAXES.get(syntax) {
                 current_syntax = rel_syntax;
             } else if warn {
-                warnings.push((file_path.to_string(), syntax.to_string()));
+                warnings.push((
+                    String::from_utf8_lossy(file_path).to_string(),
+                    String::from_utf8_lossy(syntax).to_string(),
+                ));
             }
             continue;
         }
 
-        let mut line_syntax: &str = &current_syntax;
+        let mut line_syntax: &[u8] = &current_syntax;
 
         for (s, rels) in SYNTAXES.iter() {
             if line.starts_with(rels) {
                 line_syntax = rels;
                 line = &line[rels.len()..];
                 break;
-            } else if line.starts_with(&format!("{}:", s)) {
+            } else if line.starts_with(&[s, b":".as_ref()].concat()) {
                 line_syntax = rels;
                 line = &line[s.len() + 1..];
                 break;
             }
         }
 
         inputs.push((
-            format!("{}{}", line_syntax, line),
+            [line_syntax, line].concat(),
             line_number,
-            line.to_string(),
+            line.to_owned(),
         ));
     }
     (inputs, warnings)
 }
 
 pub fn read_pattern_file(
-    file_path: String,
+    file_path: &[u8],
     warn: bool,
 ) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
-    let mut f = File::open(&file_path)?;
-    let mut contents = String::new();
+    let mut f = File::open(get_path_from_bytes(file_path))?;
+    let mut contents = Vec::new();
 
-    f.read_to_string(&mut contents)?;
+    f.read_to_end(&mut contents)?;
 
-    Ok(parse_pattern_file_contents(&contents, &file_path, warn))
+    Ok(parse_pattern_file_contents(&contents, file_path, warn))
 }
 
 #[cfg(test)]
@@ -328,18 +332,23 @@ 
 
     #[test]
     fn test_parse_pattern_file_contents() {
-        let lines = "syntax: glob\n*.elc";
+        let lines = b"syntax: glob\n*.elc";
 
         assert_eq!(
-            vec![("relglob:*.elc".to_string(), 2, "*.elc".to_string())],
-            parse_pattern_file_contents(lines, "file_path", false).0,
+            vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())],
+            parse_pattern_file_contents(lines, b"file_path", false).0,
         );
 
-        let lines = "syntax: include\nsyntax: glob";
+        let lines = b"syntax: include\nsyntax: glob";
 
         assert_eq!(
-            parse_pattern_file_contents(lines, "file_path", false).0,
+            parse_pattern_file_contents(lines, b"file_path", false).0,
             vec![]
         );
+        let lines = b"glob:**.o";
+        assert_eq!(
+            parse_pattern_file_contents(lines, b"file_path", false).0,
+            vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())]
+        );
     }
 }