From patchwork Mon May 7 13:17:15 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [2, of, 3] encoding: introduce tagging type for non-lossy non-ASCII string From: Yuya Nishihara X-Patchwork-Id: 31303 Message-Id: To: mercurial-devel@mercurial-scm.org Date: Mon, 07 May 2018 22:17:15 +0900 # HG changeset patch # User Yuya Nishihara # Date 1492920930 -32400 # Sun Apr 23 13:15:30 2017 +0900 # Node ID fdb6dfcc700e6e3c9751e7b4287bdc231acc27e9 # Parent bcf0435282e041532b2c66c131d34180ac85bf94 encoding: introduce tagging type for non-lossy non-ASCII string This fixes the weird behavior of toutf8b(), which would convert a local string back to UTF-8 *only if* it was lossy in the system encoding. Before b7b26e54e37a "encoding: avoid localstr when a string can be encoded losslessly (issue2763)", all local strings were wrapped by the localstr class. I think this would justify the round-trip behavior of toutf8b(). ASCII strings are special-cased, so the cost of wrapping with safelocalstr is negligible. (with mercurial repo) $ export HGRCPATH=/dev/null HGPLAIN= HGENCODING=latin-1 $ hg log --time --config experimental.evolution=all > /dev/null (original) time: real 11.340 secs (user 11.290+0.000 sys 0.060+0.000) time: real 11.390 secs (user 11.300+0.000 sys 0.080+0.000) time: real 11.430 secs (user 11.360+0.000 sys 0.070+0.000) (this patch) time: real 11.200 secs (user 11.100+0.000 sys 0.100+0.000) time: real 11.370 secs (user 11.300+0.000 sys 0.070+0.000) time: real 11.190 secs (user 11.130+0.000 sys 0.060+0.000) diff --git a/mercurial/encoding.py b/mercurial/encoding.py --- a/mercurial/encoding.py +++ b/mercurial/encoding.py @@ -93,6 +93,16 @@ class localstr(bytes): def __hash__(self): return hash(self._utf8) # avoid collisions in local string space +class safelocalstr(bytes): + """Tagged string denoting it was previously an internal UTF-8 string, + and can be converted back to UTF-8 losslessly + + >>> assert safelocalstr(b'\\xc3') == b'\\xc3' + >>> assert b'\\xc3' == safelocalstr(b'\\xc3') + >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} + >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} + """ + def tolocal(s): """ Convert a string from internal UTF-8 to local encoding @@ -140,7 +150,7 @@ def tolocal(s): r = u.encode(_sysstr(encoding), u"replace") if u == r.decode(_sysstr(encoding)): # r is a safe, non-lossy encoding of s - return r + return safelocalstr(r) return localstr(s, r) except UnicodeDecodeError: # we should only get here if we're looking at an ancient changeset @@ -149,7 +159,7 @@ def tolocal(s): r = u.encode(_sysstr(encoding), u"replace") if u == r.decode(_sysstr(encoding)): # r is a safe, non-lossy encoding of s - return r + return safelocalstr(r) return localstr(u.encode('UTF-8'), r) except UnicodeDecodeError: u = s.decode("utf-8", "replace") # last ditch @@ -402,7 +412,7 @@ def jsonescape(s, paranoid=False): JSON is problematic for us because it doesn't support non-Unicode bytes. To deal with this, we take the following approach: - - localstr objects are converted back to UTF-8 + - localstr/safelocalstr objects are converted back to UTF-8 - valid UTF-8/ASCII strings are passed as-is - other strings are converted to UTF-8b surrogate encoding - apply JSON-specified string escaping @@ -495,6 +505,7 @@ def toutf8b(s): - local strings that have a cached known UTF-8 encoding (aka localstr) get sent as UTF-8 so Unicode-oriented clients get the Unicode data they want + - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well - because we must preserve UTF-8 bytestring in places such as filenames, metadata can't be roundtripped without help @@ -508,6 +519,10 @@ def toutf8b(s): # assume that the original UTF-8 sequence would never contain # invalid characters in U+DCxx range return s._utf8 + elif isinstance(s, safelocalstr): + # already verified that s is non-lossy in legacy encoding, which + # shouldn't contain characters in U+DCxx range + return fromlocal(s) elif isasciistr(s): return s if "\xed" not in s: diff --git a/mercurial/templatekw.py b/mercurial/templatekw.py --- a/mercurial/templatekw.py +++ b/mercurial/templatekw.py @@ -278,6 +278,8 @@ def showdescription(context, mapping): if isinstance(s, encoding.localstr): # try hard to preserve utf-8 bytes return encoding.tolocal(encoding.fromlocal(s).strip()) + elif isinstance(s, encoding.safelocalstr): + return encoding.safelocalstr(s.strip()) else: return s.strip() diff --git a/tests/test-command-template.t b/tests/test-command-template.t --- a/tests/test-command-template.t +++ b/tests/test-command-template.t @@ -4682,6 +4682,13 @@ json filter should try round-trip conver $ HGENCODING=ascii hg log -T "{desc|json}\n" -r0 "non-ascii branch: \u00e9" +json filter should take input as utf-8 if it was converted from utf-8: + + $ HGENCODING=latin-1 hg log -T "{branch|json}\n" -r0 + "\u00e9" + $ HGENCODING=latin-1 hg log -T "{desc|json}\n" -r0 + "non-ascii branch: \u00e9" + json filter takes input as utf-8b: $ HGENCODING=ascii hg log -T "{'`cat utf-8`'|json}\n" -l1 diff --git a/tests/test-encoding-func.py b/tests/test-encoding-func.py --- a/tests/test-encoding-func.py +++ b/tests/test-encoding-func.py @@ -53,6 +53,13 @@ class Utf8bEncodingTest(unittest.TestCas self.assertEqual(l, b'?') # lossy self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved + def testlosslesslatin(self): + encoding.encoding = b'latin-1' + s = u'\xc0'.encode('utf-8') + l = encoding.tolocal(s) + self.assertEqual(l, b'\xc0') # lossless + self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8 + def testlossy0xed(self): encoding.encoding = b'euc-kr' # U+Dxxx Hangul s = u'\ud1bc\xc0'.encode('utf-8') @@ -61,6 +68,13 @@ class Utf8bEncodingTest(unittest.TestCas self.assertTrue(l.endswith(b'?')) # lossy self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved + def testlossless0xed(self): + encoding.encoding = b'euc-kr' # U+Dxxx Hangul + s = u'\ud1bc'.encode('utf-8') + l = encoding.tolocal(s) + self.assertEqual(l, b'\xc5\xed') # lossless + self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8 + if __name__ == '__main__': import silenttestrunner silenttestrunner.main(__name__)