Patchwork [3,of,3] fileset: combine union of basic patterns into single matcher

login
register
mail settings
Submitter Yuya Nishihara
Date Aug. 4, 2018, 12:14 p.m.
Message ID <33247ffb2c71b954e229.1533384892@mimosa>
Download mbox | patch
Permalink /patch/33237/
State New
Headers show

Comments

Yuya Nishihara - Aug. 4, 2018, 12:14 p.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1532161152 -32400
#      Sat Jul 21 17:19:12 2018 +0900
# Node ID 33247ffb2c71b954e229da8424798d52c1e0d8e1
# Parent  db488b9c65596088550b2b96ed25aff60a938219
fileset: combine union of basic patterns into single matcher

This appears to improve query performance in a big repository than I thought.
Writing less Python in a hot loop, faster computation we gain.

  $ hg files --cwd mozilla-central --time 'set:a* + b* + c* + d* + e*'
  (orig) time: real 0.670 secs (user 0.640+0.000 sys 0.030+0.000)
  (new)  time: real 0.210 secs (user 0.180+0.000 sys 0.020+0.000)
via Mercurial-devel - Aug. 5, 2018, 5:49 a.m.
On Sat, Aug 4, 2018 at 5:15 AM Yuya Nishihara <yuya@tcha.org> wrote:

> # HG changeset patch
> # User Yuya Nishihara <yuya@tcha.org>
> # Date 1532161152 -32400
> #      Sat Jul 21 17:19:12 2018 +0900
> # Node ID 33247ffb2c71b954e229da8424798d52c1e0d8e1
> # Parent  db488b9c65596088550b2b96ed25aff60a938219
> fileset: combine union of basic patterns into single matcher
>

Queuing this, thanks.

Patch

diff --git a/mercurial/fileset.py b/mercurial/fileset.py
--- a/mercurial/fileset.py
+++ b/mercurial/fileset.py
@@ -50,6 +50,12 @@  def kindpatmatch(mctx, x, y):
     return stringmatch(mctx, _getkindpat(x, y, matchmod.allpatternkinds,
                                          _("pattern must be a string")))
 
+def patternsmatch(mctx, *xs):
+    allkinds = matchmod.allpatternkinds
+    patterns = [getpattern(x, allkinds, _("pattern must be a string"))
+                for x in xs]
+    return mctx.matcher(patterns)
+
 def andmatch(mctx, x, y):
     xm = getmatch(mctx, x)
     ym = getmatch(mctx, y)
@@ -436,6 +442,7 @@  methods = {
     'string': stringmatch,
     'symbol': stringmatch,
     'kindpat': kindpatmatch,
+    'patterns': patternsmatch,
     'and': andmatch,
     'or': ormatch,
     'minus': minusmatch,
diff --git a/mercurial/filesetlang.py b/mercurial/filesetlang.py
--- a/mercurial/filesetlang.py
+++ b/mercurial/filesetlang.py
@@ -185,6 +185,21 @@  def _optimizeandops(op, ta, tb):
         return ('minus', ta, tb[1])
     return (op, ta, tb)
 
+def _optimizeunion(xs):
+    # collect string patterns so they can be compiled into a single regexp
+    ws, ts, ss = [], [], []
+    for x in xs:
+        w, t = _optimize(x)
+        if t is not None and t[0] in {'string', 'symbol', 'kindpat'}:
+            ss.append(t)
+            continue
+        ws.append(w)
+        ts.append(t)
+    if ss:
+        ws.append(WEIGHT_CHECK_FILENAME)
+        ts.append(('patterns',) + tuple(ss))
+    return ws, ts
+
 def _optimize(x):
     if x is None:
         return 0, x
@@ -206,7 +221,9 @@  def _optimize(x):
         else:
             return wb, _optimizeandops(op, tb, ta)
     if op == 'or':
-        ws, ts = zip(*(_optimize(y) for y in x[1:]))
+        ws, ts = _optimizeunion(x[1:])
+        if len(ts) == 1:
+            return ws[0], ts[0] # 'or' operation is fully optimized out
         ts = tuple(it[1] for it in sorted(enumerate(ts),
                                           key=lambda it: ws[it[0]]))
         return max(ws), (op,) + ts
diff --git a/mercurial/minifileset.py b/mercurial/minifileset.py
--- a/mercurial/minifileset.py
+++ b/mercurial/minifileset.py
@@ -40,7 +40,7 @@  def _compile(tree):
             return f
         raise error.ParseError(_("unsupported file pattern: %s") % name,
                                hint=_('paths must be prefixed with "path:"'))
-    elif op == 'or':
+    elif op in {'or', 'patterns'}:
         funcs = [_compile(x) for x in tree[1:]]
         return lambda n, s: any(f(n, s) for f in funcs)
     elif op == 'and':
diff --git a/tests/test-fileset.t b/tests/test-fileset.t
--- a/tests/test-fileset.t
+++ b/tests/test-fileset.t
@@ -53,9 +53,7 @@  Test operators and basic patterns
       (symbol 'glob')
       (symbol 'b?')))
   * matcher:
-  <unionmatcher matchers=[
-    <patternmatcher patterns='(?:a1(?:/|$))'>,
-    <patternmatcher patterns='(?:b.$)'>]>
+  <patternmatcher patterns='(?:a1(?:/|$)|b.$)'>
   a1
   b1
   b2
@@ -182,8 +180,9 @@  Show parsed tree at stages:
         None)))
   * optimized:
   (or
-    (symbol 'a1')
-    (symbol 'a2')
+    (patterns
+      (symbol 'a1')
+      (symbol 'a2'))
     (and
       (func
         (symbol 'clean')
@@ -193,8 +192,7 @@  Show parsed tree at stages:
         (string 'b'))))
   * matcher:
   <unionmatcher matchers=[
-    <patternmatcher patterns='(?:a1$)'>,
-    <patternmatcher patterns='(?:a2$)'>,
+    <patternmatcher patterns='(?:a1$|a2$)'>,
     <intersectionmatcher
       m1=<predicatenmatcher pred=clean>,
       m2=<predicatenmatcher pred=grep('b')>>]>
@@ -203,13 +201,30 @@  Show parsed tree at stages:
   b1
   b2
 
+Union of basic patterns:
+
+  $ fileset -p optimized -s -r. 'a1 or a2 or path:b1'
+  * optimized:
+  (patterns
+    (symbol 'a1')
+    (symbol 'a2')
+    (kindpat
+      (symbol 'path')
+      (symbol 'b1')))
+  * matcher:
+  <patternmatcher patterns='(?:a1$|a2$|b1(?:/|$))'>
+  a1
+  a2
+  b1
+
 OR expression should be reordered by weight:
 
   $ fileset -p optimized -s -r. 'grep("a") or a1 or grep("b") or b2'
   * optimized:
   (or
-    (symbol 'a1')
-    (symbol 'b2')
+    (patterns
+      (symbol 'a1')
+      (symbol 'b2'))
     (func
       (symbol 'grep')
       (string 'a'))
@@ -218,8 +233,7 @@  OR expression should be reordered by wei
       (string 'b')))
   * matcher:
   <unionmatcher matchers=[
-    <patternmatcher patterns='(?:a1$)'>,
-    <patternmatcher patterns='(?:b2$)'>,
+    <patternmatcher patterns='(?:a1$|b2$)'>,
     <predicatenmatcher pred=grep('a')>,
     <predicatenmatcher pred=grep('b')>]>
   a1