Submitter | Jun Wu |
---|---|
Date | Nov. 16, 2016, 4:17 p.m. |
Message ID | <232935ca927d68bd5bed.1479313023@x1c> |
Download | mbox | patch |
Permalink | /patch/17599/ |
State | Accepted |
Headers | show |
Comments
On Wed, Nov 16, 2016 at 04:17:03PM +0000, Jun Wu wrote: > # HG changeset patch > # User Jun Wu <quark@fb.com> > # Date 1479241551 0 > # Tue Nov 15 20:25:51 2016 +0000 > # Node ID 232935ca927d68bd5bed66b674dfd58cbb13805d > # Parent d1a0a64f6e16432333bea0476098c46a61222b9b > # Available At https://bitbucket.org/quark-zju/hg-draft > # hg pull https://bitbucket.org/quark-zju/hg-draft -r 232935ca927d > util: improve iterfile so it chooses code path wisely Queued. Sigh. > > We have performance concerns on "iterfile" as it is 4X slower on normal > files. While modern systems have the nice property that reading a "fast" > (on-disk) file cannot be interrupted and should be made use of. > > This patch dumps the related knowledge in comments. And "iterfile" chooses > code paths wisely: > > 1. If it's CPython 3, or PyPY, use the fast path. > 2. If fp is a normal file, use the fast path. > 3. If fp is not a normal file and CPython version >= 2.7.4, use the same > workaround (4x slower) as before. > 4. If fp is not a normal file and CPython version < 2.7.4, use another > workaround (2x slower but may block longer then necessary) which > basically re-invents the buffer + readline logic in Python. > > This will give us good confidence on both correctness and performance > dealing with EINTR in iterfile(fp) for all known supported Python versions. > > diff --git a/mercurial/util.py b/mercurial/util.py > --- a/mercurial/util.py > +++ b/mercurial/util.py > @@ -25,8 +25,10 @@ import hashlib > import imp > import os > +import platform as pyplatform > import re as remod > import shutil > import signal > import socket > +import stat > import string > import subprocess > @@ -2191,8 +2193,75 @@ def wrap(line, width, initindent='', han > return wrapper.fill(line).encode(encoding.encoding) > > -def iterfile(fp): > - """like fp.__iter__ but does not have issues with EINTR. Python 2.7.12 is > - known to have such issues.""" > - return iter(fp.readline, '') > +if (pyplatform.python_implementation() == 'CPython' and > + sys.version_info < (3, 0)): > + # There is an issue in CPython that some IO methods do not handle EINTR > + # correctly. The following table shows what CPython version (and functions) > + # are affected (buggy: has the EINTR bug, okay: otherwise): > + # > + # | < 2.7.4 | 2.7.4 to 2.7.12 | >= 3.0 > + # -------------------------------------------------- > + # fp.__iter__ | buggy | buggy | okay > + # fp.read* | buggy | okay [1] | okay > + # > + # [1]: fixed by changeset 67dc99a989cd in the cpython hg repo. > + # > + # Here we workaround the EINTR issue for fileobj.__iter__. Other methods > + # like "read*" are ignored for now, as Python < 2.7.4 is a minority. > + # > + # Although we can workaround the EINTR issue for fp.__iter__, it is slower: > + # "for x in fp" is 4x faster than "for x in iter(fp.readline, '')" in > + # CPython 2, because CPython 2 maintains an internal readahead buffer for > + # fp.__iter__ but not other fp.read* methods. > + # > + # On modern systems like Linux, the "read" syscall cannot be interrupted > + # when reading "fast" files like on-disk files. So the EINTR issue only > + # affects things like pipes, sockets, ttys etc. We treat "normal" (S_ISREG) > + # files approximately as "fast" files and use the fast (unsafe) code path, > + # to minimize the performance impact. > + if sys.version_info >= (2, 7, 4): > + # fp.readline deals with EINTR correctly, use it as a workaround. > + def _safeiterfile(fp): > + return iter(fp.readline, '') > + else: > + # fp.read* are broken too, manually deal with EINTR in a stupid way. > + # note: this may block longer than necessary because of bufsize. > + def _safeiterfile(fp, bufsize=4096): > + fd = fp.fileno() > + line = '' > + while True: > + try: > + buf = os.read(fd, bufsize) > + except OSError as ex: > + # os.read only raises EINTR before any data is read > + if ex.errno == errno.EINTR: > + continue > + else: > + raise > + line += buf > + if '\n' in buf: > + splitted = line.splitlines(True) > + line = '' > + for l in splitted: > + if l[-1] == '\n': > + yield l > + else: > + line = l > + if not buf: > + break > + if line: > + yield line > + > + def iterfile(fp): > + fastpath = True > + if type(fp) is file: > + fastpath = stat.S_ISREG(os.fstat(fp.fileno()).st_mode) > + if fastpath: > + return fp > + else: > + return _safeiterfile(fp) > +else: > + # PyPy and CPython 3 do not have the EINTR issue thus no workaround needed. > + def iterfile(fp): > + return fp > > def iterlines(iterator): > _______________________________________________ > Mercurial-devel mailing list > Mercurial-devel@mercurial-scm.org > https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
Patch
diff --git a/mercurial/util.py b/mercurial/util.py --- a/mercurial/util.py +++ b/mercurial/util.py @@ -25,8 +25,10 @@ import hashlib import imp import os +import platform as pyplatform import re as remod import shutil import signal import socket +import stat import string import subprocess @@ -2191,8 +2193,75 @@ def wrap(line, width, initindent='', han return wrapper.fill(line).encode(encoding.encoding) -def iterfile(fp): - """like fp.__iter__ but does not have issues with EINTR. Python 2.7.12 is - known to have such issues.""" - return iter(fp.readline, '') +if (pyplatform.python_implementation() == 'CPython' and + sys.version_info < (3, 0)): + # There is an issue in CPython that some IO methods do not handle EINTR + # correctly. The following table shows what CPython version (and functions) + # are affected (buggy: has the EINTR bug, okay: otherwise): + # + # | < 2.7.4 | 2.7.4 to 2.7.12 | >= 3.0 + # -------------------------------------------------- + # fp.__iter__ | buggy | buggy | okay + # fp.read* | buggy | okay [1] | okay + # + # [1]: fixed by changeset 67dc99a989cd in the cpython hg repo. + # + # Here we workaround the EINTR issue for fileobj.__iter__. Other methods + # like "read*" are ignored for now, as Python < 2.7.4 is a minority. + # + # Although we can workaround the EINTR issue for fp.__iter__, it is slower: + # "for x in fp" is 4x faster than "for x in iter(fp.readline, '')" in + # CPython 2, because CPython 2 maintains an internal readahead buffer for + # fp.__iter__ but not other fp.read* methods. + # + # On modern systems like Linux, the "read" syscall cannot be interrupted + # when reading "fast" files like on-disk files. So the EINTR issue only + # affects things like pipes, sockets, ttys etc. We treat "normal" (S_ISREG) + # files approximately as "fast" files and use the fast (unsafe) code path, + # to minimize the performance impact. + if sys.version_info >= (2, 7, 4): + # fp.readline deals with EINTR correctly, use it as a workaround. + def _safeiterfile(fp): + return iter(fp.readline, '') + else: + # fp.read* are broken too, manually deal with EINTR in a stupid way. + # note: this may block longer than necessary because of bufsize. + def _safeiterfile(fp, bufsize=4096): + fd = fp.fileno() + line = '' + while True: + try: + buf = os.read(fd, bufsize) + except OSError as ex: + # os.read only raises EINTR before any data is read + if ex.errno == errno.EINTR: + continue + else: + raise + line += buf + if '\n' in buf: + splitted = line.splitlines(True) + line = '' + for l in splitted: + if l[-1] == '\n': + yield l + else: + line = l + if not buf: + break + if line: + yield line + + def iterfile(fp): + fastpath = True + if type(fp) is file: + fastpath = stat.S_ISREG(os.fstat(fp.fileno()).st_mode) + if fastpath: + return fp + else: + return _safeiterfile(fp) +else: + # PyPy and CPython 3 do not have the EINTR issue thus no workaround needed. + def iterfile(fp): + return fp def iterlines(iterator):