Patchwork [2,of,3] encoding: introduce tagging type for non-lossy non-ASCII string

login
register
mail settings
Submitter Yuya Nishihara
Date May 7, 2018, 1:17 p.m.
Message ID <fdb6dfcc700e6e3c9751.1525699035@mimosa>
Download mbox | patch
Permalink /patch/31303/
State Accepted
Headers show

Comments

Yuya Nishihara - May 7, 2018, 1:17 p.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1492920930 -32400
#      Sun Apr 23 13:15:30 2017 +0900
# Node ID fdb6dfcc700e6e3c9751e7b4287bdc231acc27e9
# Parent  bcf0435282e041532b2c66c131d34180ac85bf94
encoding: introduce tagging type for non-lossy non-ASCII string

This fixes the weird behavior of toutf8b(), which would convert a local
string back to UTF-8 *only if* it was lossy in the system encoding.

Before b7b26e54e37a "encoding: avoid localstr when a string can be encoded
losslessly (issue2763)", all local strings were wrapped by the localstr
class, whether or not the conversion was lossy. I think this justifies
making toutf8b() round-trip non-lossy strings as well.

ASCII strings are special-cased, so the cost of wrapping with safelocalstr
is negligible, as the timings below show.

  (with mercurial repo)
  $ export HGRCPATH=/dev/null HGPLAIN= HGENCODING=latin-1
  $ hg log --time --config experimental.evolution=all > /dev/null

  (original)
  time: real 11.340 secs (user 11.290+0.000 sys 0.060+0.000)
  time: real 11.390 secs (user 11.300+0.000 sys 0.080+0.000)
  time: real 11.430 secs (user 11.360+0.000 sys 0.070+0.000)

  (this patch)
  time: real 11.200 secs (user 11.100+0.000 sys 0.100+0.000)
  time: real 11.370 secs (user 11.300+0.000 sys 0.070+0.000)
  time: real 11.190 secs (user 11.130+0.000 sys 0.060+0.000)

Patch

diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -93,6 +93,16 @@  class localstr(bytes):
     def __hash__(self):
         return hash(self._utf8) # avoid collisions in local string space
 
+class safelocalstr(bytes):
+    """Tagged string denoting it was previously an internal UTF-8 string,
+    and can be converted back to UTF-8 losslessly
+
+    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
+    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
+    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
+    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
+    """
+
 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding
@@ -140,7 +150,7 @@  def tolocal(s):
             r = u.encode(_sysstr(encoding), u"replace")
             if u == r.decode(_sysstr(encoding)):
                 # r is a safe, non-lossy encoding of s
-                return r
+                return safelocalstr(r)
             return localstr(s, r)
         except UnicodeDecodeError:
             # we should only get here if we're looking at an ancient changeset
@@ -149,7 +159,7 @@  def tolocal(s):
                 r = u.encode(_sysstr(encoding), u"replace")
                 if u == r.decode(_sysstr(encoding)):
                     # r is a safe, non-lossy encoding of s
-                    return r
+                    return safelocalstr(r)
                 return localstr(u.encode('UTF-8'), r)
             except UnicodeDecodeError:
                 u = s.decode("utf-8", "replace") # last ditch
@@ -402,7 +412,7 @@  def jsonescape(s, paranoid=False):
     JSON is problematic for us because it doesn't support non-Unicode
     bytes. To deal with this, we take the following approach:
 
-    - localstr objects are converted back to UTF-8
+    - localstr/safelocalstr objects are converted back to UTF-8
     - valid UTF-8/ASCII strings are passed as-is
     - other strings are converted to UTF-8b surrogate encoding
     - apply JSON-specified string escaping
@@ -495,6 +505,7 @@  def toutf8b(s):
     - local strings that have a cached known UTF-8 encoding (aka
       localstr) get sent as UTF-8 so Unicode-oriented clients get the
       Unicode data they want
+    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
     - because we must preserve UTF-8 bytestring in places such as
       filenames, metadata can't be roundtripped without help
 
@@ -508,6 +519,10 @@  def toutf8b(s):
         # assume that the original UTF-8 sequence would never contain
         # invalid characters in U+DCxx range
         return s._utf8
+    elif isinstance(s, safelocalstr):
+        # already verified that s is non-lossy in legacy encoding, which
+        # shouldn't contain characters in U+DCxx range
+        return fromlocal(s)
     elif isasciistr(s):
         return s
     if "\xed" not in s:
diff --git a/mercurial/templatekw.py b/mercurial/templatekw.py
--- a/mercurial/templatekw.py
+++ b/mercurial/templatekw.py
@@ -278,6 +278,8 @@  def showdescription(context, mapping):
     if isinstance(s, encoding.localstr):
         # try hard to preserve utf-8 bytes
         return encoding.tolocal(encoding.fromlocal(s).strip())
+    elif isinstance(s, encoding.safelocalstr):
+        return encoding.safelocalstr(s.strip())
     else:
         return s.strip()
 
diff --git a/tests/test-command-template.t b/tests/test-command-template.t
--- a/tests/test-command-template.t
+++ b/tests/test-command-template.t
@@ -4682,6 +4682,13 @@  json filter should try round-trip conver
   $ HGENCODING=ascii hg log -T "{desc|json}\n" -r0
   "non-ascii branch: \u00e9"
 
+json filter should take input as utf-8 if it was converted from utf-8:
+
+  $ HGENCODING=latin-1 hg log -T "{branch|json}\n" -r0
+  "\u00e9"
+  $ HGENCODING=latin-1 hg log -T "{desc|json}\n" -r0
+  "non-ascii branch: \u00e9"
+
 json filter takes input as utf-8b:
 
   $ HGENCODING=ascii hg log -T "{'`cat utf-8`'|json}\n" -l1
diff --git a/tests/test-encoding-func.py b/tests/test-encoding-func.py
--- a/tests/test-encoding-func.py
+++ b/tests/test-encoding-func.py
@@ -53,6 +53,13 @@  class Utf8bEncodingTest(unittest.TestCas
         self.assertEqual(l, b'?')  # lossy
         self.assertEqual(s, encoding.toutf8b(l))  # utf8 sequence preserved
 
+    def testlosslesslatin(self):
+        encoding.encoding = b'latin-1'
+        s = u'\xc0'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc0')  # lossless
+        self.assertEqual(s, encoding.toutf8b(l))  # convert back to utf-8
+
     def testlossy0xed(self):
         encoding.encoding = b'euc-kr'  # U+Dxxx Hangul
         s = u'\ud1bc\xc0'.encode('utf-8')
@@ -61,6 +68,13 @@  class Utf8bEncodingTest(unittest.TestCas
         self.assertTrue(l.endswith(b'?'))  # lossy
         self.assertEqual(s, encoding.toutf8b(l))  # utf8 sequence preserved
 
+    def testlossless0xed(self):
+        encoding.encoding = b'euc-kr'  # U+Dxxx Hangul
+        s = u'\ud1bc'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc5\xed')  # lossless
+        self.assertEqual(s, encoding.toutf8b(l))  # convert back to utf-8
+
 if __name__ == '__main__':
     import silenttestrunner
     silenttestrunner.main(__name__)