Patchwork [1,of,2,STABLE] revset: make sort() do dumb multi-pass sorting for multiple keys (issue5218)

login
register
mail settings
Submitter Yuya Nishihara
Date April 23, 2016, noon
Message ID <0c66025c95e7e11ce7fe.1461412815@mimosa>
Download mbox | patch
Permalink /patch/14767/
State Accepted
Commit 923fa9e06ea000d1adcde0cced14e9f3145ecfb4
Headers show

Comments

Yuya Nishihara - April 23, 2016, noon
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1461395370 -32400
#      Sat Apr 23 16:09:30 2016 +0900
# Branch stable
# Node ID 0c66025c95e7e11ce7fea2be633d995fd67b6033
# Parent  2d3837a4bded5362f26f91033c0a83376c207593
revset: make sort() do dumb multi-pass sorting for multiple keys (issue5218)

Our invert() function was too clever to not take length into account. I could
fix the problem by appending '\xff' as a terminator (opposite to '\0'), but
it turned out to be slower than simple multi-pass sorting.

New implementation is pretty straightforward, which just calls sort() from the
last key. We can do that since Python sort() is guaranteed to be stable. It
doesn't sound nice to call sort() multiple times, but actually it is faster.
That's probably because we have fewer Python codes in hot loop, and can avoid
heavy string and list manipulation.

  revset #0: sort(0:10000, 'branch')
  0) 0.412753
  1) 0.393254

  revset #1: sort(0:10000, '-branch')
  0) 0.455377
  1) 0.389191  85%

  revset #2: sort(0:10000, 'date')
  0) 0.408082
  1) 0.376332  92%

  revset #3: sort(0:10000, '-date')
  0) 0.406910
  1) 0.380498  93%

  revset #4: sort(0:10000, 'desc branch user date rev')
  0) 0.542996
  1) 0.486397  89%

  revset #5: sort(0:10000, '-desc -branch -user -date -rev')
  0) 0.965032
  1) 0.518426  53%

Patch

diff --git a/mercurial/revset.py b/mercurial/revset.py
--- a/mercurial/revset.py
+++ b/mercurial/revset.py
@@ -1859,9 +1859,6 @@  def sort(repo, subset, x):
 
     s = l[0]
     keys = keys.split()
-    l = []
-    def invert(s):
-        return "".join(chr(255 - ord(c)) for c in s)
     revs = getset(repo, subset, s)
     if keys == ["rev"]:
         revs.sort()
@@ -1869,36 +1866,33 @@  def sort(repo, subset, x):
     elif keys == ["-rev"]:
         revs.sort(reverse=True)
         return revs
-    for r in revs:
-        c = repo[r]
-        e = []
-        for k in keys:
+    # sort() is guaranteed to be stable
+    ctxs = [repo[r] for r in revs]
+    if True:
+        for k in reversed(keys):
             if k == 'rev':
-                e.append(r)
+                ctxs.sort(key=lambda c: c.rev())
             elif k == '-rev':
-                e.append(-r)
+                ctxs.sort(key=lambda c: c.rev(), reverse=True)
             elif k == 'branch':
-                e.append(c.branch())
+                ctxs.sort(key=lambda c: c.branch())
             elif k == '-branch':
-                e.append(invert(c.branch()))
+                ctxs.sort(key=lambda c: c.branch(), reverse=True)
             elif k == 'desc':
-                e.append(c.description())
+                ctxs.sort(key=lambda c: c.description())
             elif k == '-desc':
-                e.append(invert(c.description()))
+                ctxs.sort(key=lambda c: c.description(), reverse=True)
             elif k in 'user author':
-                e.append(c.user())
+                ctxs.sort(key=lambda c: c.user())
             elif k in '-user -author':
-                e.append(invert(c.user()))
+                ctxs.sort(key=lambda c: c.user(), reverse=True)
             elif k == 'date':
-                e.append(c.date()[0])
+                ctxs.sort(key=lambda c: c.date()[0])
             elif k == '-date':
-                e.append(-c.date()[0])
+                ctxs.sort(key=lambda c: c.date()[0], reverse=True)
             else:
                 raise error.ParseError(_("unknown sort key %r") % k)
-        e.append(r)
-        l.append(e)
-    l.sort()
-    return baseset([e[-1] for e in l])
+    return baseset([c.rev() for c in ctxs])
 
 @predicate('subrepo([pattern])')
 def subrepo(repo, subset, x):
diff --git a/tests/test-revset.t b/tests/test-revset.t
--- a/tests/test-revset.t
+++ b/tests/test-revset.t
@@ -952,6 +952,147 @@  test sorting two sorted collections in d
   6
   2
 
+  $ cd ..
+
+test sorting by multiple keys including variable-length strings
+
+  $ hg init sorting
+  $ cd sorting
+  $ cat <<EOF >> .hg/hgrc
+  > [ui]
+  > logtemplate = '{rev} {branch|p5}{desc|p5}{author|p5}{date|hgdate}\n'
+  > [templatealias]
+  > p5(s) = pad(s, 5)
+  > EOF
+  $ hg branch -qf b12
+  $ hg ci -m m111 -u u112 -d '111 10800'
+  $ hg branch -qf b11
+  $ hg ci -m m12 -u u111 -d '112 7200'
+  $ hg branch -qf b111
+  $ hg ci -m m11 -u u12 -d '111 3600'
+  $ hg branch -qf b112
+  $ hg ci -m m111 -u u11 -d '120 0'
+  $ hg branch -qf b111
+  $ hg ci -m m112 -u u111 -d '110 14400'
+  created new head
+
+ compare revisions (has fast path):
+
+  $ hg log -r 'sort(all(), rev)'
+  0 b12  m111 u112 111 10800
+  1 b11  m12  u111 112 7200
+  2 b111 m11  u12  111 3600
+  3 b112 m111 u11  120 0
+  4 b111 m112 u111 110 14400
+
+  $ hg log -r 'sort(all(), -rev)'
+  4 b111 m112 u111 110 14400
+  3 b112 m111 u11  120 0
+  2 b111 m11  u12  111 3600
+  1 b11  m12  u111 112 7200
+  0 b12  m111 u112 111 10800
+
+ compare variable-length strings (issue5218):
+
+  $ hg log -r 'sort(all(), branch)'
+  1 b11  m12  u111 112 7200
+  2 b111 m11  u12  111 3600
+  4 b111 m112 u111 110 14400
+  3 b112 m111 u11  120 0
+  0 b12  m111 u112 111 10800
+
+  $ hg log -r 'sort(all(), -branch)'
+  0 b12  m111 u112 111 10800
+  3 b112 m111 u11  120 0
+  2 b111 m11  u12  111 3600
+  4 b111 m112 u111 110 14400
+  1 b11  m12  u111 112 7200
+
+  $ hg log -r 'sort(all(), desc)'
+  2 b111 m11  u12  111 3600
+  0 b12  m111 u112 111 10800
+  3 b112 m111 u11  120 0
+  4 b111 m112 u111 110 14400
+  1 b11  m12  u111 112 7200
+
+  $ hg log -r 'sort(all(), -desc)'
+  1 b11  m12  u111 112 7200
+  4 b111 m112 u111 110 14400
+  0 b12  m111 u112 111 10800
+  3 b112 m111 u11  120 0
+  2 b111 m11  u12  111 3600
+
+  $ hg log -r 'sort(all(), user)'
+  3 b112 m111 u11  120 0
+  1 b11  m12  u111 112 7200
+  4 b111 m112 u111 110 14400
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+
+  $ hg log -r 'sort(all(), -user)'
+  2 b111 m11  u12  111 3600
+  0 b12  m111 u112 111 10800
+  1 b11  m12  u111 112 7200
+  4 b111 m112 u111 110 14400
+  3 b112 m111 u11  120 0
+
+ compare dates (tz offset should have no effect):
+
+  $ hg log -r 'sort(all(), date)'
+  4 b111 m112 u111 110 14400
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+  1 b11  m12  u111 112 7200
+  3 b112 m111 u11  120 0
+
+  $ hg log -r 'sort(all(), -date)'
+  3 b112 m111 u11  120 0
+  1 b11  m12  u111 112 7200
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+  4 b111 m112 u111 110 14400
+
+ be aware that 'sort(x, -k)' is not exactly the same as 'reverse(sort(x, k))'
+ because '-k' reverses the comparison, not the list itself:
+
+  $ hg log -r 'sort(0 + 2, date)'
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+
+  $ hg log -r 'sort(0 + 2, -date)'
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+
+  $ hg log -r 'reverse(sort(0 + 2, date))'
+  2 b111 m11  u12  111 3600
+  0 b12  m111 u112 111 10800
+
+ sort by multiple keys:
+
+  $ hg log -r 'sort(all(), "branch -rev")'
+  1 b11  m12  u111 112 7200
+  4 b111 m112 u111 110 14400
+  2 b111 m11  u12  111 3600
+  3 b112 m111 u11  120 0
+  0 b12  m111 u112 111 10800
+
+  $ hg log -r 'sort(all(), "-desc -date")'
+  1 b11  m12  u111 112 7200
+  4 b111 m112 u111 110 14400
+  3 b112 m111 u11  120 0
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+
+  $ hg log -r 'sort(all(), "user -branch date rev")'
+  3 b112 m111 u11  120 0
+  4 b111 m112 u111 110 14400
+  1 b11  m12  u111 112 7200
+  0 b12  m111 u112 111 10800
+  2 b111 m11  u12  111 3600
+
+  $ cd ..
+  $ cd repo
+
 test subtracting something from an addset
 
   $ log '(outgoing() or removes(a)) - removes(a)'