Patchwork [V2] fsmonitor: match watchman and filesystem encoding

login
register
mail settings
Submitter Olivier Trempe
Date April 5, 2017, 3:42 p.m.
Message ID <2021c3032968bef6b8d1.1491406933@ronce.innov.local>
Download mbox | patch
Permalink /patch/19971/
State Superseded
Headers show

Comments

Olivier Trempe - April 5, 2017, 3:42 p.m.
# HG changeset patch
# User Olivier Trempe <oliviertrempe@gmail.com>
# Date 1488981822 18000
#      Wed Mar 08 09:03:42 2017 -0500
# Branch stable
# Node ID 2021c3032968bef6b8d1cd7bea5a22996ced994c
# Parent  68f263f52d2e3e2798b4f1e55cb665c6b043f93b
fsmonitor: match watchman and filesystem encoding

watchman's paths encoding can differ from filesystem encoding. For example,
on Windows, it's always utf-8.

Before this patch, on Windows, mismatch in path comparison between fsmonitor
state and osutil.statfiles would yield a clean status for added/modified files.

In addition to status reporting wrong results, this leads to files being
discarded from changesets while doing history editing operations such as rebase.
Siddharth Agarwal - April 5, 2017, 5:55 p.m.
On 4/5/17 08:42, Olivier Trempe wrote:
> # HG changeset patch
> # User Olivier Trempe <oliviertrempe@gmail.com>
> # Date 1488981822 18000
> #      Wed Mar 08 09:03:42 2017 -0500
> # Branch stable
> # Node ID 2021c3032968bef6b8d1cd7bea5a22996ced994c
> # Parent  68f263f52d2e3e2798b4f1e55cb665c6b043f93b
> fsmonitor: match watchman and filesystem encoding
>
> watchman's paths encoding can differ from filesystem encoding. For example,
> on Windows, it's always utf-8.
>
> Before this patch, on Windows, mismatch in path comparison between fsmonitor
> state and osutil.statfiles would yield a clean status for added/modified files.
>
> In addition to status reporting wrong results, this leads to files being
> discarded from changesets while doing history editing operations such as rebase.

This patch looks correct to me, though I have questions about its 
performance below.

+cc foozy for another look.

>
> diff --git a/hgext/fsmonitor/__init__.py b/hgext/fsmonitor/__init__.py
> --- a/hgext/fsmonitor/__init__.py
> +++ b/hgext/fsmonitor/__init__.py
> @@ -91,6 +91,7 @@
>   
>   from __future__ import absolute_import
>   
> +import codecs
>   import hashlib
>   import os
>   import stat
> @@ -99,6 +100,7 @@
>   from mercurial import (
>       context,
>       encoding,
> +    error,
>       extensions,
>       localrepo,
>       merge,
> @@ -110,6 +112,7 @@
>   from mercurial import match as matchmod
>   
>   from . import (
> +    pywatchman,
>       state,
>       watchmanclient,
>   )
> @@ -159,6 +162,23 @@
>       sha1.update('\0')
>       return sha1.hexdigest()
>   
> +_watchmanencoding = pywatchman.encoding.get_local_encoding()
> +_fsencoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
> +_fixencoding = codecs.lookup(_watchmanencoding) != codecs.lookup(_fsencoding)
> +
> +def _watchmantofsencoding(path):
> +    """Fix path to match watchman and local filesystem encoding
> +
> +    watchman's paths encoding can differ from filesystem encoding. For example,
> +    on Windows, it's always utf-8.
> +    """
> +    try:
> +        decoded = path.decode(_watchmanencoding)
> +    except UnicodeDecodeError as e:
> +        raise error.Abort(e, hint='watchman encoding error')

Does this need to be str(e)?

> +
> +    return decoded.encode(_fsencoding, 'replace')
> +
>   def overridewalk(orig, self, match, subrepos, unknown, ignored, full=True):
>       '''Replacement for dirstate.walk, hooking into Watchman.
>   
> @@ -303,6 +323,8 @@
>       # for name case changes.
>       for entry in result['files']:
>           fname = entry['name']
> +        if _fixencoding:
> +            fname = _watchmantofsencoding(fname)

This is a critical path IIRC, so I'm a little concerned about 
performance here -- both on Mac/Linux where _fixencoding will (always?) 
be false and on Windows where it will (always?) be true.

Any chance you you could measure the before and after with (say) 10,000 
files in the result set? Thanks!

>           if switch_slashes:
>               fname = fname.replace('\\', '/')
>           if normalize:
Olivier Trempe - April 5, 2017, 8:30 p.m.
On Wed, Apr 5, 2017 at 1:55 PM, Siddharth Agarwal <sid@less-broken.com>
wrote:
>
> On 4/5/17 08:42, Olivier Trempe wrote:
>>
>> # HG changeset patch
>> # User Olivier Trempe <oliviertrempe@gmail.com>
>> # Date 1488981822 18000
>> #      Wed Mar 08 09:03:42 2017 -0500
>> # Branch stable
>> # Node ID 2021c3032968bef6b8d1cd7bea5a22996ced994c
>> # Parent  68f263f52d2e3e2798b4f1e55cb665c6b043f93b
>> fsmonitor: match watchman and filesystem encoding
>>
>> watchman's paths encoding can differ from filesystem encoding. For
example,
>> on Windows, it's always utf-8.
>>
>> Before this patch, on Windows, mismatch in path comparison between
fsmonitor
>> state and osutil.statfiles would yield a clean status for added/modified
files.
>>
>> In addition to status reporting wrong results, this leads to files being
>> discarded from changesets while doing history editing operations such as
rebase.
>
>
> This patch looks correct to me, though I have questions about its
performance below.
>
> +cc foozy for another look.
>
>
>>
>> diff --git a/hgext/fsmonitor/__init__.py b/hgext/fsmonitor/__init__.py
>> --- a/hgext/fsmonitor/__init__.py
>> +++ b/hgext/fsmonitor/__init__.py
>> @@ -91,6 +91,7 @@
>>     from __future__ import absolute_import
>>   +import codecs
>>   import hashlib
>>   import os
>>   import stat
>> @@ -99,6 +100,7 @@
>>   from mercurial import (
>>       context,
>>       encoding,
>> +    error,
>>       extensions,
>>       localrepo,
>>       merge,
>> @@ -110,6 +112,7 @@
>>   from mercurial import match as matchmod
>>     from . import (
>> +    pywatchman,
>>       state,
>>       watchmanclient,
>>   )
>> @@ -159,6 +162,23 @@
>>       sha1.update('\0')
>>       return sha1.hexdigest()
>>   +_watchmanencoding = pywatchman.encoding.get_local_encoding()
>> +_fsencoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
>> +_fixencoding = codecs.lookup(_watchmanencoding) !=
codecs.lookup(_fsencoding)
>> +
>> +def _watchmantofsencoding(path):
>> +    """Fix path to match watchman and local filesystem encoding
>> +
>> +    watchman's paths encoding can differ from filesystem encoding. For
example,
>> +    on Windows, it's always utf-8.
>> +    """
>> +    try:
>> +        decoded = path.decode(_watchmanencoding)
>> +    except UnicodeDecodeError as e:
>> +        raise error.Abort(e, hint='watchman encoding error')
>
>
> Does this need to be str(e)?
>

Absolutely. This will be fixed in V3.
I also realize the "import sys" statement was removed 3 months ago. I
missed it when rebasing... This will also be fixed in V3.

>>
>> +
>> +    return decoded.encode(_fsencoding, 'replace')
>> +
>>   def overridewalk(orig, self, match, subrepos, unknown, ignored,
full=True):
>>       '''Replacement for dirstate.walk, hooking into Watchman.
>>   @@ -303,6 +323,8 @@
>>       # for name case changes.
>>       for entry in result['files']:
>>           fname = entry['name']
>> +        if _fixencoding:
>> +            fname = _watchmantofsencoding(fname)
>
>
> This is a critical path IIRC, so I'm a little concerned about performance
here -- both on Mac/Linux where _fixencoding will (always?) be false and on
Windows where it will (always?) be true.
>
> Any chance you you could measure the before and after with (say) 10,000
files in the result set? Thanks!
>

There is a little overhead at module import:
python -m timeit "import hgext.fsmonitor"
Windows before patch: 1000000 loops, best of 3: 0.563 usec per loop
Windows after patch: 1000000 loops, best of 3: 0.583 usec per loop
Linx before patch: 1000000 loops, best of 3: 0.628 usec per loop
Linux after patch: 1000000 loops, best of 3: 0.579 usec per loop

10000 calls to _watchmantofsencoding:
python -m timeit -s "from hgext.fsmonitor import _watchmantofsencoding,
_fixencoding" "fname = '/path/to/file'" "for i in range(10000):" "    if
_fixencoding: fname = _watchmantofsencoding(fname)"
Windows (_fixencoding is True): 100 loops, best of 3: 19.5 msec per loop
Linux (_fixencoding is False): 100 loops, best of 3: 3.08 msec per loop

Do you want me to include these results in the commit message?

>
>>           if switch_slashes:
>>               fname = fname.replace('\\', '/')
>>           if normalize:
>
>
>
Siddharth Agarwal - April 5, 2017, 8:43 p.m.
On 4/5/17 13:30, Olivier Trempe wrote:
> Absolutely. This will be fixed in V3.
> I also realize the "import sys" statement was removed 3 months ago. I 
> missed it when rebasing... This will also be fixed in V3.

Thanks.

In general, do you think you could add a test for the behavior where 
fsmonitor currently fails? I assume any non-ASCII filenames will cause 
issues.

>
> >>
> >> +
> >> +    return decoded.encode(_fsencoding, 'replace')
> >> +
> >>   def overridewalk(orig, self, match, subrepos, unknown, ignored, 
> full=True):
> >>       '''Replacement for dirstate.walk, hooking into Watchman.
> >>   @@ -303,6 +323,8 @@
> >>       # for name case changes.
> >>       for entry in result['files']:
> >>           fname = entry['name']
> >> +        if _fixencoding:
> >> +            fname = _watchmantofsencoding(fname)
> >
> >
> > This is a critical path IIRC, so I'm a little concerned about 
> performance here -- both on Mac/Linux where _fixencoding will 
> (always?) be false and on Windows where it will (always?) be true.
> >
> > Any chance you you could measure the before and after with (say) 
> 10,000 files in the result set? Thanks!
> >
>
> There is a little overhead at module import:
> python -m timeit "import hgext.fsmonitor"
> Windows before patch: 1000000 loops, best of 3: 0.563 usec per loop
> Windows after patch: 1000000 loops, best of 3: 0.583 usec per loop
> Linx before patch: 1000000 loops, best of 3: 0.628 usec per loop
> Linux after patch: 1000000 loops, best of 3: 0.579 usec per loop
>
> 10000 calls to _watchmantofsencoding:
> python -m timeit -s "from hgext.fsmonitor import 
> _watchmantofsencoding, _fixencoding" "fname = '/path/to/file'" "for i 
> in range(10000):" "    if _fixencoding: fname = 
> _watchmantofsencoding(fname)"
> Windows (_fixencoding is True): 100 loops, best of 3: 19.5 msec per loop
> Linux (_fixencoding is False): 100 loops, best of 3: 3.08 msec per loop
>
> Do you want me to include these results in the commit message?

Yes, that would be great! Thanks.

>
> >
> >>           if switch_slashes:
> >>               fname = fname.replace('\\', '/')
> >>           if normalize:
> >
> >
> >
Yuya Nishihara - April 6, 2017, 1:45 p.m.
On Wed, 5 Apr 2017 10:55:17 -0700, Siddharth Agarwal wrote:
> On 4/5/17 08:42, Olivier Trempe wrote:
> > # HG changeset patch
> > # User Olivier Trempe <oliviertrempe@gmail.com>
> > # Date 1488981822 18000
> > #      Wed Mar 08 09:03:42 2017 -0500
> > # Branch stable
> > # Node ID 2021c3032968bef6b8d1cd7bea5a22996ced994c
> > # Parent  68f263f52d2e3e2798b4f1e55cb665c6b043f93b
> > fsmonitor: match watchman and filesystem encoding
> >
> > watchman's paths encoding can differ from filesystem encoding. For example,
> > on Windows, it's always utf-8.
> >
> > Before this patch, on Windows, mismatch in path comparison between fsmonitor
> > state and osutil.statfiles would yield a clean status for added/modified files.
> >
> > In addition to status reporting wrong results, this leads to files being
> > discarded from changesets while doing history editing operations such as rebase.
> 
> This patch looks correct to me, though I have questions about its 
> performance below.
> 
> +cc foozy for another look.

[...]

> > +_watchmanencoding = pywatchman.encoding.get_local_encoding()
> > +_fsencoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
> > +_fixencoding = codecs.lookup(_watchmanencoding) != codecs.lookup(_fsencoding)
> > +
> > +def _watchmantofsencoding(path):
> > +    """Fix path to match watchman and local filesystem encoding
> > +
> > +    watchman's paths encoding can differ from filesystem encoding. For example,
> > +    on Windows, it's always utf-8.
> > +    """
> > +    try:
> > +        decoded = path.decode(_watchmanencoding)
> > +    except UnicodeDecodeError as e:
> > +        raise error.Abort(e, hint='watchman encoding error')
> 
> Does this need to be str(e)?

Perhaps.
> 
> > +
> > +    return decoded.encode(_fsencoding, 'replace')

Maybe it's better to catch exception here. Encoding error would be more likely
to happen because Windows ANSI charset is generally narrower than UTF-*.
Olivier Trempe - April 6, 2017, 4:46 p.m.
On Thu, Apr 6, 2017 at 9:45 AM, Yuya Nishihara <yuya@tcha.org> wrote:

> On Wed, 5 Apr 2017 10:55:17 -0700, Siddharth Agarwal wrote:
> > On 4/5/17 08:42, Olivier Trempe wrote:
> > > # HG changeset patch
> > > # User Olivier Trempe <oliviertrempe@gmail.com>
> > > # Date 1488981822 18000
> > > #      Wed Mar 08 09:03:42 2017 -0500
> > > # Branch stable
> > > # Node ID 2021c3032968bef6b8d1cd7bea5a22996ced994c
> > > # Parent  68f263f52d2e3e2798b4f1e55cb665c6b043f93b
> > > fsmonitor: match watchman and filesystem encoding
> > >
> > > watchman's paths encoding can differ from filesystem encoding. For
> example,
> > > on Windows, it's always utf-8.
> > >
> > > Before this patch, on Windows, mismatch in path comparison between
> fsmonitor
> > > state and osutil.statfiles would yield a clean status for
> added/modified files.
> > >
> > > In addition to status reporting wrong results, this leads to files
> being
> > > discarded from changesets while doing history editing operations such
> as rebase.
> >
> > This patch looks correct to me, though I have questions about its
> > performance below.
> >
> > +cc foozy for another look.
>
> [...]
>
> > > +_watchmanencoding = pywatchman.encoding.get_local_encoding()
> > > +_fsencoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
> > > +_fixencoding = codecs.lookup(_watchmanencoding) !=
> codecs.lookup(_fsencoding)
> > > +
> > > +def _watchmantofsencoding(path):
> > > +    """Fix path to match watchman and local filesystem encoding
> > > +
> > > +    watchman's paths encoding can differ from filesystem encoding.
> For example,
> > > +    on Windows, it's always utf-8.
> > > +    """
> > > +    try:
> > > +        decoded = path.decode(_watchmanencoding)
> > > +    except UnicodeDecodeError as e:
> > > +        raise error.Abort(e, hint='watchman encoding error')
> >
> > Does this need to be str(e)?
>
> Perhaps.
> >
> > > +
> > > +    return decoded.encode(_fsencoding, 'replace')
>
> Maybe it's better to catch exception here. Encoding error would be more
> likely
> to happen because Windows ANSI charset is generally narrower than UTF-*.
>

You mean setting the error handler to 'strict' rather than 'replace' and
wrap the call in a try except block?
Or just wrap the call in a try except block, but keep the 'replace' error
handler?
Using the 'replace' error handler is necessary here to match the behavior
of osutil.listdir
Olivier Trempe - April 6, 2017, 6:42 p.m.
On Wed, Apr 5, 2017 at 4:43 PM, Siddharth Agarwal <sid@less-broken.com>
wrote:

> On 4/5/17 13:30, Olivier Trempe wrote:
>
>> Absolutely. This will be fixed in V3.
>> I also realize the "import sys" statement was removed 3 months ago. I
>> missed it when rebasing... This will also be fixed in V3.
>>
>
> Thanks.
>
> In general, do you think you could add a test for the behavior where
> fsmonitor currently fails? I assume any non-ASCII filenames will cause
> issues.


I just spent some time trying to write a test, but it is a real pain to
make it work on Windows. It's already a feat to run mercurial tests on
Windows. Then adding fsmonitor to this just adds another level of pain. I
won't spend more time writing the test...


>
>> >>
>> >> +
>> >> +    return decoded.encode(_fsencoding, 'replace')
>> >> +
>> >>   def overridewalk(orig, self, match, subrepos, unknown, ignored,
>> full=True):
>> >>       '''Replacement for dirstate.walk, hooking into Watchman.
>> >>   @@ -303,6 +323,8 @@
>> >>       # for name case changes.
>> >>       for entry in result['files']:
>> >>           fname = entry['name']
>> >> +        if _fixencoding:
>> >> +            fname = _watchmantofsencoding(fname)
>> >
>> >
>> > This is a critical path IIRC, so I'm a little concerned about
>> performance here -- both on Mac/Linux where _fixencoding will (always?) be
>> false and on Windows where it will (always?) be true.
>> >
>> > Any chance you you could measure the before and after with (say) 10,000
>> files in the result set? Thanks!
>> >
>>
>> There is a little overhead at module import:
>> python -m timeit "import hgext.fsmonitor"
>> Windows before patch: 1000000 loops, best of 3: 0.563 usec per loop
>> Windows after patch: 1000000 loops, best of 3: 0.583 usec per loop
>> Linx before patch: 1000000 loops, best of 3: 0.628 usec per loop
>> Linux after patch: 1000000 loops, best of 3: 0.579 usec per loop
>>
>> 10000 calls to _watchmantofsencoding:
>> python -m timeit -s "from hgext.fsmonitor import _watchmantofsencoding,
>> _fixencoding" "fname = '/path/to/file'" "for i in range(10000):" "    if
>> _fixencoding: fname = _watchmantofsencoding(fname)"
>> Windows (_fixencoding is True): 100 loops, best of 3: 19.5 msec per loop
>> Linux (_fixencoding is False): 100 loops, best of 3: 3.08 msec per loop
>>
>> Do you want me to include these results in the commit message?
>>
>
> Yes, that would be great! Thanks.
>
>
>
>> >
>> >>           if switch_slashes:
>> >>               fname = fname.replace('\\', '/')
>> >>           if normalize:
>> >
>> >
>> >
>>
>
>
>
Yuya Nishihara - April 7, 2017, 12:54 p.m.
On Thu, 6 Apr 2017 12:46:38 -0400, Olivier Trempe wrote:
> On Thu, Apr 6, 2017 at 9:45 AM, Yuya Nishihara <yuya@tcha.org> wrote:
> > > > +def _watchmantofsencoding(path):
> > > > +    """Fix path to match watchman and local filesystem encoding
> > > > +
> > > > +    watchman's paths encoding can differ from filesystem encoding.
> > For example,
> > > > +    on Windows, it's always utf-8.
> > > > +    """
> > > > +    try:
> > > > +        decoded = path.decode(_watchmanencoding)
> > > > +    except UnicodeDecodeError as e:
> > > > +        raise error.Abort(e, hint='watchman encoding error')
> > >
> > > Does this need to be str(e)?
> >
> > Perhaps.
> > >
> > > > +
> > > > +    return decoded.encode(_fsencoding, 'replace')
> >
> > Maybe it's better to catch exception here. Encoding error would be more
> > likely
> > to happen because Windows ANSI charset is generally narrower than UTF-*.
> >
> 
> You mean setting the error handler to 'strict' rather than 'replace' and
> wrap the call in a try except block?

Yes.

> Or just wrap the call in a try except block, but keep the 'replace' error
> handler?
> Using the 'replace' error handler is necessary here to match the behavior
> of osutil.listdir

It appears 'mbcs' codec replaces unknown characters no matter if 'strict' is
specified or not. Perhaps that would be done by WideCharToMultiByte(). I think
using 'strict' here is more consistent because osutil.listdir() handles
nothing about encoding in Python layer.

https://msdn.microsoft.com/en-us/library/windows/desktop/dd374130(v=vs.85).aspx

Patch

diff --git a/hgext/fsmonitor/__init__.py b/hgext/fsmonitor/__init__.py
--- a/hgext/fsmonitor/__init__.py
+++ b/hgext/fsmonitor/__init__.py
@@ -91,6 +91,7 @@ 
 
 from __future__ import absolute_import
 
+import codecs
 import hashlib
 import os
 import stat
@@ -99,6 +100,7 @@ 
 from mercurial import (
     context,
     encoding,
+    error,
     extensions,
     localrepo,
     merge,
@@ -110,6 +112,7 @@ 
 from mercurial import match as matchmod
 
 from . import (
+    pywatchman,
     state,
     watchmanclient,
 )
@@ -159,6 +162,23 @@ 
     sha1.update('\0')
     return sha1.hexdigest()
 
+_watchmanencoding = pywatchman.encoding.get_local_encoding()
+_fsencoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
+_fixencoding = codecs.lookup(_watchmanencoding) != codecs.lookup(_fsencoding)
+
+def _watchmantofsencoding(path):
+    """Fix path to match watchman and local filesystem encoding
+
+    watchman's paths encoding can differ from filesystem encoding. For example,
+    on Windows, it's always utf-8.
+    """
+    try:
+        decoded = path.decode(_watchmanencoding)
+    except UnicodeDecodeError as e:
+        raise error.Abort(e, hint='watchman encoding error')
+
+    return decoded.encode(_fsencoding, 'replace')
+
 def overridewalk(orig, self, match, subrepos, unknown, ignored, full=True):
     '''Replacement for dirstate.walk, hooking into Watchman.
 
@@ -303,6 +323,8 @@ 
     # for name case changes.
     for entry in result['files']:
         fname = entry['name']
+        if _fixencoding:
+            fname = _watchmantofsencoding(fname)
         if switch_slashes:
             fname = fname.replace('\\', '/')
         if normalize: