From patchwork Sat Oct 29 08:53:20 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [v2] encoding: mercurial ignores setlocale and uses ascii instead of utf8 From: ehpc X-Patchwork-Id: 17224 Message-Id: <667842e5f8406e794f5b.1477731200@waste.org> To: mercurial-devel@mercurial-scm.org Date: Sat, 29 Oct 2016 03:53:20 -0500 # HG changeset patch # User ehpc # Date 1477731183 -10800 # Sat Oct 29 11:53:03 2016 +0300 # Node ID 667842e5f8406e794f5b22930f4b5715a1dcdfe4 # Parent 260af19891f2bed679a662be07d1379bb8207592 encoding: mercurial ignores setlocale and uses ascii instead of utf8 locale.getpreferredencoding() internally uses locale.setlocale(locale.LC_CTYPE, ''), so even if a user sets the locale explicitly via locale.setlocale(locale.LC_ALL, 'ru_RU.utf8') locale.setlocale(locale.LC_CTYPE, 'ru_RU.utf8') mercurial still detects ascii. There is also a problem with the tolocal method even when the encoding is detected correctly. If a string of type 'str' is fed to the method and the encoding is UTF-8, it is not converted to a proper 'unicode' string; a 'str' is returned instead. diff --git a/mercurial/encoding.py b/mercurial/encoding.py --- a/mercurial/encoding.py +++ b/mercurial/encoding.py @@ -93,7 +93,7 @@ try: encoding = environ.get("HGENCODING") if not encoding: - encoding = locale.getpreferredencoding() or 'ascii' + encoding = locale.getpreferredencoding(False) or 'ascii' encoding = _encodingfixers.get(encoding, lambda: encoding)() except locale.Error: encoding = 'ascii' @@ -146,11 +146,14 @@ try: try: + if encoding == 'UTF-8': + # fast path + if isinstance(s, unicode): + return s + else: + return s.decode('UTF-8') # make sure string is actually stored in UTF-8 u = s.decode('UTF-8') - if encoding == 'UTF-8': - # fast path - return s r = u.encode(_sysstr(encoding), u"replace") if u == r.decode(_sysstr(encoding)): # r is a safe, non-lossy encoding of s