Submitter | Olivier Trempe |
---|---|
Date | April 5, 2017, 3:42 p.m. |
Message ID | <2021c3032968bef6b8d1.1491406933@ronce.innov.local> |
Download | mbox | patch |
Permalink | /patch/19971/ |
State | Superseded |
Headers | show |
Comments
On 4/5/17 08:42, Olivier Trempe wrote: > # HG changeset patch > # User Olivier Trempe <oliviertrempe@gmail.com> > # Date 1488981822 18000 > # Wed Mar 08 09:03:42 2017 -0500 > # Branch stable > # Node ID 2021c3032968bef6b8d1cd7bea5a22996ced994c > # Parent 68f263f52d2e3e2798b4f1e55cb665c6b043f93b > fsmonitor: match watchman and filesystem encoding > > watchman's paths encoding can differ from filesystem encoding. For example, > on Windows, it's always utf-8. > > Before this patch, on Windows, mismatch in path comparison between fsmonitor > state and osutil.statfiles would yield a clean status for added/modified files. > > In addition to status reporting wrong results, this leads to files being > discarded from changesets while doing history editing operations such as rebase. This patch looks correct to me, though I have questions about its performance below. +cc foozy for another look. > > diff --git a/hgext/fsmonitor/__init__.py b/hgext/fsmonitor/__init__.py > --- a/hgext/fsmonitor/__init__.py > +++ b/hgext/fsmonitor/__init__.py > @@ -91,6 +91,7 @@ > > from __future__ import absolute_import > > +import codecs > import hashlib > import os > import stat > @@ -99,6 +100,7 @@ > from mercurial import ( > context, > encoding, > + error, > extensions, > localrepo, > merge, > @@ -110,6 +112,7 @@ > from mercurial import match as matchmod > > from . import ( > + pywatchman, > state, > watchmanclient, > ) > @@ -159,6 +162,23 @@ > sha1.update('\0') > return sha1.hexdigest() > > +_watchmanencoding = pywatchman.encoding.get_local_encoding() > +_fsencoding = sys.getfilesystemencoding() or sys.getdefaultencoding() > +_fixencoding = codecs.lookup(_watchmanencoding) != codecs.lookup(_fsencoding) > + > +def _watchmantofsencoding(path): > + """Fix path to match watchman and local filesystem encoding > + > + watchman's paths encoding can differ from filesystem encoding. For example, > + on Windows, it's always utf-8. > + """ > + try: > + decoded = path.decode(_watchmanencoding) > + except UnicodeDecodeError as e: > + raise error.Abort(e, hint='watchman encoding error') Does this need to be str(e)? > + > + return decoded.encode(_fsencoding, 'replace') > + > def overridewalk(orig, self, match, subrepos, unknown, ignored, full=True): > '''Replacement for dirstate.walk, hooking into Watchman. > > @@ -303,6 +323,8 @@ > # for name case changes. > for entry in result['files']: > fname = entry['name'] > + if _fixencoding: > + fname = _watchmantofsencoding(fname) This is a critical path IIRC, so I'm a little concerned about performance here -- both on Mac/Linux where _fixencoding will (always?) be false and on Windows where it will (always?) be true. Any chance you you could measure the before and after with (say) 10,000 files in the result set? Thanks! > if switch_slashes: > fname = fname.replace('\\', '/') > if normalize:
On Wed, Apr 5, 2017 at 1:55 PM, Siddharth Agarwal <sid@less-broken.com> wrote: > > On 4/5/17 08:42, Olivier Trempe wrote: >> >> # HG changeset patch >> # User Olivier Trempe <oliviertrempe@gmail.com> >> # Date 1488981822 18000 >> # Wed Mar 08 09:03:42 2017 -0500 >> # Branch stable >> # Node ID 2021c3032968bef6b8d1cd7bea5a22996ced994c >> # Parent 68f263f52d2e3e2798b4f1e55cb665c6b043f93b >> fsmonitor: match watchman and filesystem encoding >> >> watchman's paths encoding can differ from filesystem encoding. For example, >> on Windows, it's always utf-8. >> >> Before this patch, on Windows, mismatch in path comparison between fsmonitor >> state and osutil.statfiles would yield a clean status for added/modified files. >> >> In addition to status reporting wrong results, this leads to files being >> discarded from changesets while doing history editing operations such as rebase. > > > This patch looks correct to me, though I have questions about its performance below. > > +cc foozy for another look. > > >> >> diff --git a/hgext/fsmonitor/__init__.py b/hgext/fsmonitor/__init__.py >> --- a/hgext/fsmonitor/__init__.py >> +++ b/hgext/fsmonitor/__init__.py >> @@ -91,6 +91,7 @@ >> from __future__ import absolute_import >> +import codecs >> import hashlib >> import os >> import stat >> @@ -99,6 +100,7 @@ >> from mercurial import ( >> context, >> encoding, >> + error, >> extensions, >> localrepo, >> merge, >> @@ -110,6 +112,7 @@ >> from mercurial import match as matchmod >> from . import ( >> + pywatchman, >> state, >> watchmanclient, >> ) >> @@ -159,6 +162,23 @@ >> sha1.update('\0') >> return sha1.hexdigest() >> +_watchmanencoding = pywatchman.encoding.get_local_encoding() >> +_fsencoding = sys.getfilesystemencoding() or sys.getdefaultencoding() >> +_fixencoding = codecs.lookup(_watchmanencoding) != codecs.lookup(_fsencoding) >> + >> +def _watchmantofsencoding(path): >> + """Fix path to match watchman and local filesystem encoding >> + >> + watchman's paths encoding can differ from filesystem encoding. For example, >> + on Windows, it's always utf-8. >> + """ >> + try: >> + decoded = path.decode(_watchmanencoding) >> + except UnicodeDecodeError as e: >> + raise error.Abort(e, hint='watchman encoding error') > > > Does this need to be str(e)? > Absolutely. This will be fixed in V3. I also realize the "import sys" statement was removed 3 months ago. I missed it when rebasing... This will also be fixed in V3. >> >> + >> + return decoded.encode(_fsencoding, 'replace') >> + >> def overridewalk(orig, self, match, subrepos, unknown, ignored, full=True): >> '''Replacement for dirstate.walk, hooking into Watchman. >> @@ -303,6 +323,8 @@ >> # for name case changes. >> for entry in result['files']: >> fname = entry['name'] >> + if _fixencoding: >> + fname = _watchmantofsencoding(fname) > > > This is a critical path IIRC, so I'm a little concerned about performance here -- both on Mac/Linux where _fixencoding will (always?) be false and on Windows where it will (always?) be true. > > Any chance you you could measure the before and after with (say) 10,000 files in the result set? Thanks! > There is a little overhead at module import: python -m timeit "import hgext.fsmonitor" Windows before patch: 1000000 loops, best of 3: 0.563 usec per loop Windows after patch: 1000000 loops, best of 3: 0.583 usec per loop Linx before patch: 1000000 loops, best of 3: 0.628 usec per loop Linux after patch: 1000000 loops, best of 3: 0.579 usec per loop 10000 calls to _watchmantofsencoding: python -m timeit -s "from hgext.fsmonitor import _watchmantofsencoding, _fixencoding" "fname = '/path/to/file'" "for i in range(10000):" " if _fixencoding: fname = _watchmantofsencoding(fname)" Windows (_fixencoding is True): 100 loops, best of 3: 19.5 msec per loop Linux (_fixencoding is False): 100 loops, best of 3: 3.08 msec per loop Do you want me to include these results in the commit message? > >> if switch_slashes: >> fname = fname.replace('\\', '/') >> if normalize: > > >
On 4/5/17 13:30, Olivier Trempe wrote: > Absolutely. This will be fixed in V3. > I also realize the "import sys" statement was removed 3 months ago. I > missed it when rebasing... This will also be fixed in V3. Thanks. In general, do you think you could add a test for the behavior where fsmonitor currently fails? I assume any non-ASCII filenames will cause issues. > > >> > >> + > >> + return decoded.encode(_fsencoding, 'replace') > >> + > >> def overridewalk(orig, self, match, subrepos, unknown, ignored, > full=True): > >> '''Replacement for dirstate.walk, hooking into Watchman. > >> @@ -303,6 +323,8 @@ > >> # for name case changes. > >> for entry in result['files']: > >> fname = entry['name'] > >> + if _fixencoding: > >> + fname = _watchmantofsencoding(fname) > > > > > > This is a critical path IIRC, so I'm a little concerned about > performance here -- both on Mac/Linux where _fixencoding will > (always?) be false and on Windows where it will (always?) be true. > > > > Any chance you you could measure the before and after with (say) > 10,000 files in the result set? Thanks! > > > > There is a little overhead at module import: > python -m timeit "import hgext.fsmonitor" > Windows before patch: 1000000 loops, best of 3: 0.563 usec per loop > Windows after patch: 1000000 loops, best of 3: 0.583 usec per loop > Linx before patch: 1000000 loops, best of 3: 0.628 usec per loop > Linux after patch: 1000000 loops, best of 3: 0.579 usec per loop > > 10000 calls to _watchmantofsencoding: > python -m timeit -s "from hgext.fsmonitor import > _watchmantofsencoding, _fixencoding" "fname = '/path/to/file'" "for i > in range(10000):" " if _fixencoding: fname = > _watchmantofsencoding(fname)" > Windows (_fixencoding is True): 100 loops, best of 3: 19.5 msec per loop > Linux (_fixencoding is False): 100 loops, best of 3: 3.08 msec per loop > > Do you want me to include these results in the commit message? Yes, that would be great! Thanks. > > > > >> if switch_slashes: > >> fname = fname.replace('\\', '/') > >> if normalize: > > > > > >
On Wed, 5 Apr 2017 10:55:17 -0700, Siddharth Agarwal wrote: > On 4/5/17 08:42, Olivier Trempe wrote: > > # HG changeset patch > > # User Olivier Trempe <oliviertrempe@gmail.com> > > # Date 1488981822 18000 > > # Wed Mar 08 09:03:42 2017 -0500 > > # Branch stable > > # Node ID 2021c3032968bef6b8d1cd7bea5a22996ced994c > > # Parent 68f263f52d2e3e2798b4f1e55cb665c6b043f93b > > fsmonitor: match watchman and filesystem encoding > > > > watchman's paths encoding can differ from filesystem encoding. For example, > > on Windows, it's always utf-8. > > > > Before this patch, on Windows, mismatch in path comparison between fsmonitor > > state and osutil.statfiles would yield a clean status for added/modified files. > > > > In addition to status reporting wrong results, this leads to files being > > discarded from changesets while doing history editing operations such as rebase. > > This patch looks correct to me, though I have questions about its > performance below. > > +cc foozy for another look. [...] > > +_watchmanencoding = pywatchman.encoding.get_local_encoding() > > +_fsencoding = sys.getfilesystemencoding() or sys.getdefaultencoding() > > +_fixencoding = codecs.lookup(_watchmanencoding) != codecs.lookup(_fsencoding) > > + > > +def _watchmantofsencoding(path): > > + """Fix path to match watchman and local filesystem encoding > > + > > + watchman's paths encoding can differ from filesystem encoding. For example, > > + on Windows, it's always utf-8. > > + """ > > + try: > > + decoded = path.decode(_watchmanencoding) > > + except UnicodeDecodeError as e: > > + raise error.Abort(e, hint='watchman encoding error') > > Does this need to be str(e)? Perhaps. > > > + > > + return decoded.encode(_fsencoding, 'replace') Maybe it's better to catch exception here. Encoding error would be more likely to happen because Windows ANSI charset is generally narrower than UTF-*.
On Thu, Apr 6, 2017 at 9:45 AM, Yuya Nishihara <yuya@tcha.org> wrote: > On Wed, 5 Apr 2017 10:55:17 -0700, Siddharth Agarwal wrote: > > On 4/5/17 08:42, Olivier Trempe wrote: > > > # HG changeset patch > > > # User Olivier Trempe <oliviertrempe@gmail.com> > > > # Date 1488981822 18000 > > > # Wed Mar 08 09:03:42 2017 -0500 > > > # Branch stable > > > # Node ID 2021c3032968bef6b8d1cd7bea5a22996ced994c > > > # Parent 68f263f52d2e3e2798b4f1e55cb665c6b043f93b > > > fsmonitor: match watchman and filesystem encoding > > > > > > watchman's paths encoding can differ from filesystem encoding. For > example, > > > on Windows, it's always utf-8. > > > > > > Before this patch, on Windows, mismatch in path comparison between > fsmonitor > > > state and osutil.statfiles would yield a clean status for > added/modified files. > > > > > > In addition to status reporting wrong results, this leads to files > being > > > discarded from changesets while doing history editing operations such > as rebase. > > > > This patch looks correct to me, though I have questions about its > > performance below. > > > > +cc foozy for another look. > > [...] > > > > +_watchmanencoding = pywatchman.encoding.get_local_encoding() > > > +_fsencoding = sys.getfilesystemencoding() or sys.getdefaultencoding() > > > +_fixencoding = codecs.lookup(_watchmanencoding) != > codecs.lookup(_fsencoding) > > > + > > > +def _watchmantofsencoding(path): > > > + """Fix path to match watchman and local filesystem encoding > > > + > > > + watchman's paths encoding can differ from filesystem encoding. > For example, > > > + on Windows, it's always utf-8. > > > + """ > > > + try: > > > + decoded = path.decode(_watchmanencoding) > > > + except UnicodeDecodeError as e: > > > + raise error.Abort(e, hint='watchman encoding error') > > > > Does this need to be str(e)? > > Perhaps. > > > > > + > > > + return decoded.encode(_fsencoding, 'replace') > > Maybe it's better to catch exception here. Encoding error would be more > likely > to happen because Windows ANSI charset is generally narrower than UTF-*. > You mean setting the error handler to 'strict' rather than 'replace' and wrap the call in a try except block? Or just wrap the call in a try except block, but keep the 'replace' error handler? Using the 'replace' error handler is necessary here to match the behavior of osutil.listdir
On Wed, Apr 5, 2017 at 4:43 PM, Siddharth Agarwal <sid@less-broken.com> wrote: > On 4/5/17 13:30, Olivier Trempe wrote: > >> Absolutely. This will be fixed in V3. >> I also realize the "import sys" statement was removed 3 months ago. I >> missed it when rebasing... This will also be fixed in V3. >> > > Thanks. > > In general, do you think you could add a test for the behavior where > fsmonitor currently fails? I assume any non-ASCII filenames will cause > issues. I just spent some time trying to write a test, but it is a real pain to make it work on Windows. It's already a feat to run mercurial tests on Windows. Then adding fsmonitor to this just adds another level of pain. I won't spend more time writing the test... > >> >> >> >> + >> >> + return decoded.encode(_fsencoding, 'replace') >> >> + >> >> def overridewalk(orig, self, match, subrepos, unknown, ignored, >> full=True): >> >> '''Replacement for dirstate.walk, hooking into Watchman. >> >> @@ -303,6 +323,8 @@ >> >> # for name case changes. >> >> for entry in result['files']: >> >> fname = entry['name'] >> >> + if _fixencoding: >> >> + fname = _watchmantofsencoding(fname) >> > >> > >> > This is a critical path IIRC, so I'm a little concerned about >> performance here -- both on Mac/Linux where _fixencoding will (always?) be >> false and on Windows where it will (always?) be true. >> > >> > Any chance you you could measure the before and after with (say) 10,000 >> files in the result set? Thanks! >> > >> >> There is a little overhead at module import: >> python -m timeit "import hgext.fsmonitor" >> Windows before patch: 1000000 loops, best of 3: 0.563 usec per loop >> Windows after patch: 1000000 loops, best of 3: 0.583 usec per loop >> Linx before patch: 1000000 loops, best of 3: 0.628 usec per loop >> Linux after patch: 1000000 loops, best of 3: 0.579 usec per loop >> >> 10000 calls to _watchmantofsencoding: >> python -m timeit -s "from hgext.fsmonitor import _watchmantofsencoding, >> _fixencoding" "fname = '/path/to/file'" "for i in range(10000):" " if >> _fixencoding: fname = _watchmantofsencoding(fname)" >> Windows (_fixencoding is True): 100 loops, best of 3: 19.5 msec per loop >> Linux (_fixencoding is False): 100 loops, best of 3: 3.08 msec per loop >> >> Do you want me to include these results in the commit message? >> > > Yes, that would be great! Thanks. > > > >> > >> >> if switch_slashes: >> >> fname = fname.replace('\\', '/') >> >> if normalize: >> > >> > >> > >> > > >
On Thu, 6 Apr 2017 12:46:38 -0400, Olivier Trempe wrote: > On Thu, Apr 6, 2017 at 9:45 AM, Yuya Nishihara <yuya@tcha.org> wrote: > > > > +def _watchmantofsencoding(path): > > > > + """Fix path to match watchman and local filesystem encoding > > > > + > > > > + watchman's paths encoding can differ from filesystem encoding. > > For example, > > > > + on Windows, it's always utf-8. > > > > + """ > > > > + try: > > > > + decoded = path.decode(_watchmanencoding) > > > > + except UnicodeDecodeError as e: > > > > + raise error.Abort(e, hint='watchman encoding error') > > > > > > Does this need to be str(e)? > > > > Perhaps. > > > > > > > + > > > > + return decoded.encode(_fsencoding, 'replace') > > > > Maybe it's better to catch exception here. Encoding error would be more > > likely > > to happen because Windows ANSI charset is generally narrower than UTF-*. > > > > You mean setting the error handler to 'strict' rather than 'replace' and > wrap the call in a try except block? Yes. > Or just wrap the call in a try except block, but keep the 'replace' error > handler? > Using the 'replace' error handler is necessary here to match the behavior > of osutil.listdir It appears 'mbcs' codec replaces unknown characters no matter if 'strict' is specified or not. Perhaps that would be done by WideCharToMultiByte(). I think using 'strict' here is more consistent because osutil.listdir() handles nothing about encoding in Python layer. https://msdn.microsoft.com/en-us/library/windows/desktop/dd374130(v=vs.85).aspx
Patch
diff --git a/hgext/fsmonitor/__init__.py b/hgext/fsmonitor/__init__.py --- a/hgext/fsmonitor/__init__.py +++ b/hgext/fsmonitor/__init__.py @@ -91,6 +91,7 @@ from __future__ import absolute_import +import codecs import hashlib import os import stat @@ -99,6 +100,7 @@ from mercurial import ( context, encoding, + error, extensions, localrepo, merge, @@ -110,6 +112,7 @@ from mercurial import match as matchmod from . import ( + pywatchman, state, watchmanclient, ) @@ -159,6 +162,23 @@ sha1.update('\0') return sha1.hexdigest() +_watchmanencoding = pywatchman.encoding.get_local_encoding() +_fsencoding = sys.getfilesystemencoding() or sys.getdefaultencoding() +_fixencoding = codecs.lookup(_watchmanencoding) != codecs.lookup(_fsencoding) + +def _watchmantofsencoding(path): + """Fix path to match watchman and local filesystem encoding + + watchman's paths encoding can differ from filesystem encoding. For example, + on Windows, it's always utf-8. + """ + try: + decoded = path.decode(_watchmanencoding) + except UnicodeDecodeError as e: + raise error.Abort(e, hint='watchman encoding error') + + return decoded.encode(_fsencoding, 'replace') + def overridewalk(orig, self, match, subrepos, unknown, ignored, full=True): '''Replacement for dirstate.walk, hooking into Watchman. @@ -303,6 +323,8 @@ # for name case changes. for entry in result['files']: fname = entry['name'] + if _fixencoding: + fname = _watchmantofsencoding(fname) if switch_slashes: fname = fname.replace('\\', '/') if normalize: