Patchwork [8,of,8] py3: ditch email.parser.BytesParser which appears to be plain crap

login
register
mail settings
Submitter Yuya Nishihara
Date June 16, 2018, 11:06 a.m.
Message ID <eecc513212b2cacc491d.1529147174@mimosa>
Download mbox | patch
Permalink /patch/32194/
State Accepted
Headers show

Comments

Yuya Nishihara - June 16, 2018, 11:06 a.m.
# HG changeset patch
# User Yuya Nishihara <yuya@tcha.org>
# Date 1529145067 -32400
#      Sat Jun 16 19:31:07 2018 +0900
# Node ID eecc513212b2cacc491d898d62ad8edec5192974
# Parent  bc71920694ff3bb072daff3751a57ede528fcb8b
py3: ditch email.parser.BytesParser which appears to be plain crap

As I said before, BytesParser is a thin wrapper over the unicode Parser,
and it's too thin to return bytes back. Today, I found it does normalize
newline characters to '\n's thanks to the careless use of TextIOWrapper.

So, this patch replaces BytesParser with Parser + TextIOWrapper, and fixes
newline handling. Since I don't know what the least bad encoding strategy is
here, I just copied it from BytesParser.

I've moved the new parse() function out of pycompat, as it is no longer a
trivial wrapper.

Patch

diff --git a/contrib/python3-whitelist b/contrib/python3-whitelist
--- a/contrib/python3-whitelist
+++ b/contrib/python3-whitelist
@@ -203,6 +203,7 @@  test-http.t
 test-hybridencode.py
 test-identify.t
 test-import-bypass.t
+test-import-eol.t
 test-import-merge.t
 test-import-unknown.t
 test-import.t
diff --git a/mercurial/mail.py b/mercurial/mail.py
--- a/mercurial/mail.py
+++ b/mercurial/mail.py
@@ -11,6 +11,8 @@  import email
 import email.charset
 import email.header
 import email.message
+import email.parser
+import io
 import os
 import smtplib
 import socket
@@ -322,6 +324,23 @@  def mimeencode(ui, s, charsets=None, dis
         s, cs = _encode(ui, s, charsets)
     return mimetextqp(s, 'plain', cs)
 
+if pycompat.ispy3:
+    def parse(fp):
+        ep = email.parser.Parser()
+        # disable the "universal newlines" mode, which isn't binary safe.
+        # I have no idea if ascii/surrogateescape is correct, but that's
+        # what the standard Python email parser does.
+        fp = io.TextIOWrapper(fp, encoding=r'ascii',
+                              errors=r'surrogateescape', newline=chr(10))
+        try:
+            return ep.parse(fp)
+        finally:
+            fp.detach()
+else:
+    def parse(fp):
+        ep = email.parser.Parser()
+        return ep.parse(fp)
+
 def headdecode(s):
     '''Decodes RFC-2047 header'''
     uparts = []
diff --git a/mercurial/patch.py b/mercurial/patch.py
--- a/mercurial/patch.py
+++ b/mercurial/patch.py
@@ -112,7 +112,7 @@  def split(stream):
             cur.append(line)
         c = chunk(cur)
 
-        m = pycompat.emailparser().parse(c)
+        m = mail.parse(c)
         if not m.is_multipart():
             yield msgfp(m)
         else:
@@ -230,7 +230,7 @@  def _extract(ui, fileobj, tmpname, tmpfp
 
     data = {}
 
-    msg = pycompat.emailparser().parse(fileobj)
+    msg = mail.parse(fileobj)
 
     subject = msg[r'Subject'] and mail.headdecode(msg[r'Subject'])
     data['user'] = msg[r'From'] and mail.headdecode(msg[r'From'])
diff --git a/mercurial/pycompat.py b/mercurial/pycompat.py
--- a/mercurial/pycompat.py
+++ b/mercurial/pycompat.py
@@ -295,10 +295,6 @@  if ispy3:
         ret = shlex.split(s.decode('latin-1'), comments, posix)
         return [a.encode('latin-1') for a in ret]
 
-    def emailparser(*args, **kwargs):
-        import email.parser
-        return email.parser.BytesParser(*args, **kwargs)
-
 else:
     import cStringIO
 
@@ -371,10 +367,6 @@  else:
     rawinput = raw_input
     getargspec = inspect.getargspec
 
-    def emailparser(*args, **kwargs):
-        import email.parser
-        return email.parser.Parser(*args, **kwargs)
-
 isjython = sysplatform.startswith('java')
 
 isdarwin = sysplatform == 'darwin'