Patchwork D8506: rust-regex: fix issues with regex anchoring and performance

login
register
mail settings
Submitter phabricator
Date May 7, 2020, 10:16 p.m.
Message ID <differential-rev-PHID-DREV-gjlfgryo6yat5v6ot6c2-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/46290/
State Superseded
Headers show

Comments

phabricator - May 7, 2020, 10:16 p.m.
Alphare created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  It turns out that the way I tried to work around `regex`'s behavior difference
  with `re2` and Python's `re` was 1) buggy and 2) much more complicated than
  needed.
  
  This is the first of three patches that fix said issues.
  
  In a few words:
  `regex` adds `.*` on either side of patterns when no start or end anchor is
  present. My previous workaround put `^` or `$` for every pattern, which is
  wrong even without the other 2 bugs on top of it.
  Using `^(?:<patterns>)` right at the end of the `regex` path fixes the issue.
  I've opened an issue to get a build option instead:
  https://github.com/rust-lang/regex/issues/675

REPOSITORY
  rHG Mercurial

BRANCH
  stable

REVISION DETAIL
  https://phab.mercurial-scm.org/D8506

AFFECTED FILES
  rust/hg-core/src/filepatterns.rs
  rust/hg-core/src/matchers.rs

CHANGE DETAILS




To: Alphare, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/rust/hg-core/src/matchers.rs b/rust/hg-core/src/matchers.rs
--- a/rust/hg-core/src/matchers.rs
+++ b/rust/hg-core/src/matchers.rs
@@ -342,7 +342,9 @@ 
 ) -> PatternResult<impl Fn(&HgPath) -> bool + Sync> {
     use std::io::Write;
 
-    let mut escaped_bytes = vec![];
+    // The `regex` crate adds `.*` to the start and end of expressions if there
+    // are no anchors, so add the start anchor.
+    let mut escaped_bytes = vec![b'^', b'(', b'?', b':'];
     for byte in pattern {
         if *byte > 127 {
             write!(escaped_bytes, "\\x{:x}", *byte).unwrap();
@@ -350,6 +352,7 @@ 
             escaped_bytes.push(*byte);
         }
     }
+    escaped_bytes.push(b')');
 
     // Avoid the cost of UTF8 checking
     //
diff --git a/rust/hg-core/src/filepatterns.rs b/rust/hg-core/src/filepatterns.rs
--- a/rust/hg-core/src/filepatterns.rs
+++ b/rust/hg-core/src/filepatterns.rs
@@ -176,9 +176,15 @@ 
         return vec![];
     }
     match syntax {
-        // The `regex` crate adds `.*` to the start and end of expressions
-        // if there are no anchors, so add them.
-        PatternSyntax::Regexp => [b"^", &pattern[..], b"$"].concat(),
+        PatternSyntax::Regexp => {
+            if pattern.starts_with(b".*") {
+                if pattern.ends_with(b".*") {
+                    return pattern.to_owned();
+                }
+                return [&pattern[..], b".*"].concat();
+            }
+            [b".*", &pattern[..], b".*"].concat()
+        }
         PatternSyntax::RelRegexp => {
             // The `regex` crate accepts `**` while `re2` and Python's `re`
             // do not. Checking for `*` correctly triggers the same error all
@@ -196,15 +202,14 @@ 
         }
         PatternSyntax::RootFiles => {
             let mut res = if pattern == b"." {
-                vec![b'^']
+                vec![]
             } else {
                 // Pattern is a directory name.
-                [b"^", escape_pattern(pattern).as_slice(), b"/"].concat()
+                [escape_pattern(pattern).as_slice(), b"/"].concat()
             };
 
             // Anything after the pattern must be a non-directory.
             res.extend(b"[^/]+$");
-            res.push(b'$');
             res
         }
         PatternSyntax::RelGlob => {
@@ -216,7 +221,7 @@ 
             }
         }
         PatternSyntax::Glob | PatternSyntax::RootGlob => {
-            [b"^", glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
+            [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
         }
         PatternSyntax::Include | PatternSyntax::SubInclude => unreachable!(),
     }
@@ -290,8 +295,7 @@ 
     {
         // The `regex` crate adds `.*` to the start and end of expressions
         // if there are no anchors, so add the start anchor.
-        let mut escaped = vec![b'^'];
-        escaped.extend(escape_pattern(&pattern));
+        let mut escaped = escape_pattern(&pattern);
         escaped.extend(GLOB_SUFFIX);
         Ok(escaped)
     } else {
@@ -641,7 +645,7 @@ 
                 Path::new("")
             ))
             .unwrap(),
-            br"^\.(?:/|$)".to_vec(),
+            br"\.(?:/|$)".to_vec(),
         );
         assert_eq!(
             build_single_regex(&IgnorePattern::new(
@@ -650,7 +654,7 @@ 
                 Path::new("")
             ))
             .unwrap(),
-            br"^whatever(?:/|$)".to_vec(),
+            br"whatever(?:/|$)".to_vec(),
         );
         assert_eq!(
             build_single_regex(&IgnorePattern::new(
@@ -659,7 +663,7 @@ 
                 Path::new("")
             ))
             .unwrap(),
-            br"^[^/]*\.o(?:/|$)".to_vec(),
+            br"[^/]*\.o(?:/|$)".to_vec(),
         );
     }
 }