Patchwork util: improve iterfile so it impacts little on performance

login
register
mail settings
Submitter Jun Wu
Date Nov. 15, 2016, 4:04 p.m.
Message ID <3cd2e9873bc1d565300b.1479225853@x1c>
Download mbox | patch
Permalink /patch/17586/
State Superseded
Delegated to: Jun Wu
Headers show

Comments

Jun Wu - Nov. 15, 2016, 4:04 p.m.
# HG changeset patch
# User Jun Wu <quark@fb.com>
# Date 1479225350 0
#      Tue Nov 15 15:55:50 2016 +0000
# Node ID 3cd2e9873bc1d565300b629e72100800075d12bb
# Parent  d1a0a64f6e16432333bea0476098c46a61222b9b
# Available At https://bitbucket.org/quark-zju/hg-draft
#              hg pull https://bitbucket.org/quark-zju/hg-draft -r 3cd2e9873bc1
util: improve iterfile so it impacts little on performance

We have performance concerns on "iterfile" as it is 4X slower on normal
files. While modern systems have the nice property that reading a "fast"
(on-disk) file cannot be interrupted and should be made use of.

This patch dumps the related knowledge in comments. And tries to minimize
the performance impact: it only use the slower but safer approach for
non-normal files. It gives up for Python < 2.7.4 because the slower approach
does not make a difference in terms of safety. And it avoids the workaround
for Python >= 3 and PyPy who don't have the EINTR issue.
Jun Wu - Nov. 15, 2016, 5:43 p.m.
Excerpts from Jun Wu's message of 2016-11-15 16:04:13 +0000:
> # HG changeset patch
> # User Jun Wu <quark@fb.com>
> # Date 1479225350 0
> #      Tue Nov 15 15:55:50 2016 +0000
> # Node ID 3cd2e9873bc1d565300b629e72100800075d12bb
> # Parent  d1a0a64f6e16432333bea0476098c46a61222b9b
> # Available At https://bitbucket.org/quark-zju/hg-draft 
> #              hg pull https://bitbucket.org/quark-zju/hg-draft  -r 3cd2e9873bc1
> util: improve iterfile so it impacts little on performance
> 
> We have performance concerns on "iterfile" as it is 4X slower on normal
> files. While modern systems have the nice property that reading a "fast"
> (on-disk) file cannot be interrupted and should be made use of.
> 
> This patch dumps the related knowledge in comments. And tries to minimize
> the performance impact: it only use the slower but safer approach for
> non-normal files. It gives up for Python < 2.7.4 because the slower approach

I think I can fix Python < 2.7.4 using some slow code path as well.
So I'm dropping this one. A new version is coming.

> does not make a difference in terms of safety. And it avoids the workaround
> for Python >= 3 and PyPy who don't have the EINTR issue.
> 
> diff --git a/mercurial/util.py b/mercurial/util.py
> --- a/mercurial/util.py
> +++ b/mercurial/util.py
> @@ -25,8 +25,10 @@ import hashlib
>  import imp
>  import os
> +import platform as pyplatform
>  import re as remod
>  import shutil
>  import signal
>  import socket
> +import stat
>  import string
>  import subprocess
> @@ -2191,8 +2193,31 @@ def wrap(line, width, initindent='', han
>      return wrapper.fill(line).encode(encoding.encoding)
>  
> -def iterfile(fp):
> -    """like fp.__iter__ but does not have issues with EINTR. Python 2.7.12 is
> -    known to have such issues."""
> -    return iter(fp.readline, '')
> +if (pyplatform.python_implementation() == 'CPython' and
> +    sys.version_info <= (3, 0) and sys.version_info >= (2, 7, 4)):
> +    # There is an issue with CPython 2 that file.__iter__ does not handle EINTR
> +    # correctly. CPython <= 2.7.12 is known to have the issue.
> +    # In CPython >= 2.7.4, file.read, file.readline etc. deal with EINTR
> +    # correctly so we can use the workaround below. However the workaround is
> +    # about 4X slower than the native iterator because the latter does
> +    # readahead caching in CPython layer.
> +    # On modern systems like Linux, the "read" syscall cannot be interrupted
> +    # for reading "fast" files like on-disk files. So the EINTR issue only
> +    # affects things like pipes, sockets, ttys etc. We treat "normal" (S_ISREG)
> +    # files approximately as "fast" files and use the fast (unsafe) code path.
> +    def iterfile(fp):
> +        fastpath = True
> +        try:
> +            fastpath = stat.S_ISREG(os.fstat(fp.fileno()).st_mode)
> +        except (AttributeError, OSError): # no fileno, or stat fails
> +            pass
> +        if fastpath:
> +            return fp
> +        else:
> +            return iter(fp.readline, '')
> +else:
> +    # For CPython < 2.7.4, the workaround wouldn't make things better.
> +    # PyPy and CPython 3 do not have the EINTR issue thus no workaround needed.
> +    def iterfile(fp):
> +        return fp
>  
>  def iterlines(iterator):

Patch

diff --git a/mercurial/util.py b/mercurial/util.py
--- a/mercurial/util.py
+++ b/mercurial/util.py
@@ -25,8 +25,10 @@  import hashlib
 import imp
 import os
+import platform as pyplatform
 import re as remod
 import shutil
 import signal
 import socket
+import stat
 import string
 import subprocess
@@ -2191,8 +2193,31 @@  def wrap(line, width, initindent='', han
     return wrapper.fill(line).encode(encoding.encoding)
 
-def iterfile(fp):
-    """like fp.__iter__ but does not have issues with EINTR. Python 2.7.12 is
-    known to have such issues."""
-    return iter(fp.readline, '')
+if (pyplatform.python_implementation() == 'CPython' and
+    sys.version_info <= (3, 0) and sys.version_info >= (2, 7, 4)):
+    # There is an issue with CPython 2 that file.__iter__ does not handle EINTR
+    # correctly. CPython <= 2.7.12 is known to have the issue.
+    # In CPython >= 2.7.4, file.read, file.readline etc. deal with EINTR
+    # correctly so we can use the workaround below. However the workaround is
+    # about 4X slower than the native iterator because the latter does
+    # readahead caching in CPython layer.
+    # On modern systems like Linux, the "read" syscall cannot be interrupted
+    # for reading "fast" files like on-disk files. So the EINTR issue only
+    # affects things like pipes, sockets, ttys etc. We treat "normal" (S_ISREG)
+    # files approximately as "fast" files and use the fast (unsafe) code path.
+    def iterfile(fp):
+        fastpath = True
+        try:
+            fastpath = stat.S_ISREG(os.fstat(fp.fileno()).st_mode)
+        except (AttributeError, OSError): # no fileno, or stat fails
+            pass
+        if fastpath:
+            return fp
+        else:
+            return iter(fp.readline, '')
+else:
+    # For CPython < 2.7.4, the workaround wouldn't make things better.
+    # PyPy and CPython 3 do not have the EINTR issue thus no workaround needed.
+    def iterfile(fp):
+        return fp
 
 def iterlines(iterator):