Patchwork [1,of,3] encoding: fix toutf8b() to resurrect lossy characters even if "\xed" is in it

login
register
mail settings
Submitter Yuya Nishihara
Date May 7, 2018, 1:17 p.m.
Message ID <bcf0435282e041532b2c.1525699034@mimosa>
Download mbox | patch
Permalink /patch/31302/
State Accepted
Headers show

Comments

Yuya Nishihara - May 7, 2018, 1:17 p.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1524364733 -32400
#      Sun Apr 22 11:38:53 2018 +0900
# Node ID bcf0435282e041532b2c66c131d34180ac85bf94
# Parent  a7e53b70e5026bac9772f9869e754a2a6f530587
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" is in it

If 's' is a localstr, 's._utf8' must be returned to get the original UTF-8
sequence back. Because of this, it was wrong to guard that return with the
test '"\xed" not in s': the test should have been either
'"\xed" not in s._utf8' or simply omitted.

This patch moves the localstr handling to the top, since the validity of
's._utf8' should already have been checked by encoding.tolocal().

Patch

diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -504,11 +504,13 @@  def toutf8b(s):
     internal surrogate encoding as a UTF-8 string.)
     '''
 
-    if not isinstance(s, localstr) and isasciistr(s):
+    if isinstance(s, localstr):
+        # assume that the original UTF-8 sequence would never contain
+        # invalid characters in U+DCxx range
+        return s._utf8
+    elif isasciistr(s):
         return s
     if "\xed" not in s:
-        if isinstance(s, localstr):
-            return s._utf8
         try:
             s.decode('utf-8', _utf8strict)
             return s
diff --git a/tests/test-encoding-func.py b/tests/test-encoding-func.py
--- a/tests/test-encoding-func.py
+++ b/tests/test-encoding-func.py
@@ -35,11 +35,32 @@  class LocalEncodingTest(unittest.TestCas
         self.assertTrue(s is encoding.fromlocal(s))
 
 class Utf8bEncodingTest(unittest.TestCase):
+    def setUp(self):
+        self.origencoding = encoding.encoding
+
+    def tearDown(self):
+        encoding.encoding = self.origencoding
+
     def testasciifastpath(self):
         s = b'\0' * 100
         self.assertTrue(s is encoding.toutf8b(s))
         self.assertTrue(s is encoding.fromutf8b(s))
 
+    def testlossylatin(self):
+        encoding.encoding = b'ascii'
+        s = u'\xc0'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'?')  # lossy
+        self.assertEqual(s, encoding.toutf8b(l))  # utf8 sequence preserved
+
+    def testlossy0xed(self):
+        encoding.encoding = b'euc-kr'  # U+Dxxx Hangul
+        s = u'\ud1bc\xc0'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertIn(b'\xed', l)
+        self.assertTrue(l.endswith(b'?'))  # lossy
+        self.assertEqual(s, encoding.toutf8b(l))  # utf8 sequence preserved
+
 if __name__ == '__main__':
     import silenttestrunner
     silenttestrunner.main(__name__)