Patchwork D8491: rust-filepatterns: match exact `rootglob`s with a `HashSet`, not in the regex

login
register
mail settings
Submitter phabricator
Date May 6, 2020, 1:11 p.m.
Message ID <differential-rev-PHID-DREV-6sg2t5mm6uolxfrwxbh7-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/46258/
State Superseded
Headers show

Comments

phabricator - May 6, 2020, 1:11 p.m.
Alphare created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  This optimization yields some very interesting results in `rootglob`-heavy
  repositories.
  
  I build a test repository of the following structure:
  
    root
        /<uuid>/build/empty_file
        ... repeat for 4000 entries
  
  and a `.hgignore` containing the corresponding 4000 `rootglob` entries pointing
  to all `build/` folders.
  
  Rust+c `hg status` goes from 350ms down to 110ms.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D8491

AFFECTED FILES
  rust/hg-core/src/filepatterns.rs
  rust/hg-core/src/matchers.rs

CHANGE DETAILS




To: Alphare, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/rust/hg-core/src/matchers.rs b/rust/hg-core/src/matchers.rs
--- a/rust/hg-core/src/matchers.rs
+++ b/rust/hg-core/src/matchers.rs
@@ -24,6 +24,7 @@ 
     PatternSyntax,
 };
 
+use crate::filepatterns::normalize_path_bytes;
 use std::borrow::ToOwned;
 use std::collections::HashSet;
 use std::fmt::{Display, Error, Formatter};
@@ -369,15 +370,32 @@ 
 fn build_regex_match<'a>(
     ignore_patterns: &'a [&'a IgnorePattern],
 ) -> PatternResult<(Vec<u8>, Box<dyn Fn(&HgPath) -> bool + Sync>)> {
-    let regexps: Result<Vec<_>, PatternError> = ignore_patterns
-        .into_iter()
-        .map(|k| build_single_regex(*k))
-        .collect();
-    let regexps = regexps?;
+    let mut regexps = vec![];
+    let mut exact_set = HashSet::new();
+
+    for pattern in ignore_patterns {
+        if let Some(re) = build_single_regex(pattern)? {
+            regexps.push(re);
+        } else {
+            let exact = normalize_path_bytes(&pattern.pattern);
+            exact_set.insert(HgPathBuf::from_bytes(&exact));
+        }
+    }
+
     let full_regex = regexps.join(&b'|');
 
-    let matcher = re_matcher(&full_regex)?;
-    let func = Box::new(move |filename: &HgPath| matcher(filename));
+    // An empty pattern would cause the regex engine to incorrectly match the
+    // (empty) root directory
+    let func = if !(regexps.is_empty()) {
+        let matcher = re_matcher(&full_regex)?;
+        let func = move |filename: &HgPath| {
+            exact_set.contains(filename) || matcher(filename)
+        };
+        Box::new(func) as Box<dyn Fn(&HgPath) -> bool + Sync>
+    } else {
+        let func = move |filename: &HgPath| exact_set.contains(filename);
+        Box::new(func) as Box<dyn Fn(&HgPath) -> bool + Sync>
+    };
 
     Ok((full_regex, func))
 }
diff --git a/rust/hg-core/src/filepatterns.rs b/rust/hg-core/src/filepatterns.rs
--- a/rust/hg-core/src/filepatterns.rs
+++ b/rust/hg-core/src/filepatterns.rs
@@ -271,7 +271,7 @@ 
 /// that don't need to be transformed into a regex.
 pub fn build_single_regex(
     entry: &IgnorePattern,
-) -> Result<Vec<u8>, PatternError> {
+) -> Result<Option<Vec<u8>>, PatternError> {
     let IgnorePattern {
         pattern, syntax, ..
     } = entry;
@@ -288,16 +288,11 @@ 
     if *syntax == PatternSyntax::RootGlob
         && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
     {
-        // The `regex` crate adds `.*` to the start and end of expressions
-        // if there are no anchors, so add the start anchor.
-        let mut escaped = vec![b'^'];
-        escaped.extend(escape_pattern(&pattern));
-        escaped.extend(GLOB_SUFFIX);
-        Ok(escaped)
+        Ok(None)
     } else {
         let mut entry = entry.clone();
         entry.pattern = pattern;
-        Ok(_build_single_regex(&entry))
+        Ok(Some(_build_single_regex(&entry)))
     }
 }
 
@@ -628,7 +623,7 @@ 
                 Path::new("")
             ))
             .unwrap(),
-            br"(?:.*/)?rust/target(?:/|$)".to_vec(),
+            Some(br"(?:.*/)?rust/target(?:/|$)".to_vec()),
         );
     }
 
@@ -641,7 +636,7 @@ 
                 Path::new("")
             ))
             .unwrap(),
-            br"^\.(?:/|$)".to_vec(),
+            None,
         );
         assert_eq!(
             build_single_regex(&IgnorePattern::new(
@@ -650,7 +645,7 @@ 
                 Path::new("")
             ))
             .unwrap(),
-            br"^whatever(?:/|$)".to_vec(),
+            None,
         );
         assert_eq!(
             build_single_regex(&IgnorePattern::new(
@@ -659,7 +654,7 @@ 
                 Path::new("")
             ))
             .unwrap(),
-            br"^[^/]*\.o(?:/|$)".to_vec(),
+            Some(br"^[^/]*\.o(?:/|$)".to_vec()),
         );
     }
 }