Patchwork [03,of,15,v2] dirstate: create class for status lists

login
register
mail settings
Submitter Martin von Zweigbergk
Date Oct. 5, 2014, 6:08 a.m.
Message ID <925b7bbb3a5444a6cf67.1412489280@handduk2.mtv.corp.google.com>
Download mbox | patch
Permalink /patch/6129/
State Changes Requested
Headers show

Comments

Martin von Zweigbergk - Oct. 5, 2014, 6:08 a.m.
# HG changeset patch
# User Martin von Zweigbergk <martinvonz@gmail.com>
# Date 1412480848 25200
#      Sat Oct 04 20:47:28 2014 -0700
# Node ID 925b7bbb3a5444a6cf67052d80bff483d5c19a4b
# Parent  065114309f6206ec11a48dd088c287211467620b
dirstate: create class for status lists

Callers of various status() methods (on dirstate, context, repo) get a
tuple of 7 elements, where each element is a list of files. This
results in lots of uses of indexes where names would be much more
readable. For example, "status.ignored" seems clearer than "status[4]"
[1]. So, let's introduce a simple named tuple containing the 7 status
fields: modified, added, removed, deleted, unknown, ignored, clean.

This patch introduces the class and updates the status methods to
return instances of it. Later patches will update the callers.

 [1] Did you even notice that it should have been "status[5]"?
Mads Kiilerich - Oct. 6, 2014, 6:20 p.m.
On 10/05/2014 08:08 AM, Martin von Zweigbergk wrote:
> # HG changeset patch
> # User Martin von Zweigbergk <martinvonz@gmail.com>
> # Date 1412480848 25200
> #      Sat Oct 04 20:47:28 2014 -0700
> # Node ID 925b7bbb3a5444a6cf67052d80bff483d5c19a4b
> # Parent  065114309f6206ec11a48dd088c287211467620b
> dirstate: create class for status lists
>
> Callers of various status() methods (on dirstate, context, repo) get a
> tuple of 7 elements, where each element is a list of files. This
> results in lots of uses of indexes where names would be much more
> readable. For example, "status.ignored" seems clearer than "status[4]"
> [1]. So, let's introduce a simple named tuple containing the 7 status
> fields: modified, added, removed, deleted, unknown, ignored, clean.
>
> This patch introduces the class and updates the status methods to
> return instances of it. Later patches will update the callers.
>
>   [1] Did you even notice that it should have been "status[5]"?
>
> diff --git a/hgext/largefiles/overrides.py b/hgext/largefiles/overrides.py
> --- a/hgext/largefiles/overrides.py
> +++ b/hgext/largefiles/overrides.py
> @@ -11,6 +11,7 @@
>   import os
>   import copy
>   
> +from mercurial import dirstate
>   from mercurial import hg, commands, util, cmdutil, scmutil, match as match_, \
>           archival, pathutil, revset
>   from mercurial.i18n import _
> @@ -1149,7 +1150,8 @@
>           modified, added, removed, deleted, unknown, ignored, clean = r
>           unknown = [f for f in unknown if lfdirstate[f] == '?']
>           ignored = [f for f in ignored if lfdirstate[f] == '?']
> -        return modified, added, removed, deleted, unknown, ignored, clean
> +        return dirstate.status(modified, added, removed, deleted, unknown,
> +                               ignored, clean)
>       repo.status = overridestatus
>       orig(ui, repo, *dirs, **opts)
>       repo.status = oldstatus
> diff --git a/hgext/largefiles/reposetup.py b/hgext/largefiles/reposetup.py
> --- a/hgext/largefiles/reposetup.py
> +++ b/hgext/largefiles/reposetup.py
> @@ -10,6 +10,7 @@
>   import copy
>   import os
>   
> +from mercurial import dirstate
>   from mercurial import error, manifest, match as match_, util
>   from mercurial.i18n import _
>   from mercurial import localrepo
> @@ -242,7 +243,7 @@
>                       wlock.release()
>   
>               self.lfstatus = True
> -            return result
> +            return dirstate.status(*result)
>   
>           # As part of committing, copy all of the largefiles into the
>           # cache.
> diff --git a/mercurial/context.py b/mercurial/context.py
> --- a/mercurial/context.py
> +++ b/mercurial/context.py
> @@ -7,6 +7,7 @@
>   
>   from node import nullid, nullrev, short, hex, bin
>   from i18n import _
> +import dirstate
>   import mdiff, error, util, scmutil, subrepo, patch, encoding, phases
>   import match as matchmod
>   import os, errno, stat
> @@ -340,8 +341,7 @@
>           for l in r:
>               l.sort()
>   
> -        # we return a tuple to signify that this list isn't changing
> -        return tuple(r)
> +        return dirstate.status(*r)
>   
>   
>   def makememctx(repo, parents, text, user, date, branch, files, store,
> @@ -1482,7 +1482,7 @@
>           # (s[1] is 'added' and s[2] is 'removed')
>           s = list(s)
>           s[1], s[2] = s[2], s[1]
> -        return tuple(s)
> +        return dirstate.status(*s)
>   
>   class committablefilectx(basefilectx):
>       """A committablefilectx provides common functionality for a file context
> diff --git a/mercurial/dirstate.py b/mercurial/dirstate.py
> --- a/mercurial/dirstate.py
> +++ b/mercurial/dirstate.py
> @@ -26,6 +26,49 @@
>       def join(self, obj, fname):
>           return obj._join(fname)
>   
> +class status(tuple):
> +    '''Named tuple with a list of files per status.
> +    '''

Hmm. There are several things I would like to point out ... but they 
seem to all be inherited from the Python namedtuple. It might make sense 
to stay as similar as possible but slightly different.

Why not use a generic named tuple implementation ... especially on 
Python 2.6+ where it is built-in? There might be good reasons, but they 
should documented.

I guess one reason is that the standard implementation creates the class 
as source code and execs it. -1 for elegance, +1 for getting the work done.

Another reason is that this implementation defaults to having 
unspecified parameters as empty lists ... but is that really relevant / 
necessary?

> +
> +    __slots__ = ()
> +
> +    def __new__(cls, modified=[], added=[], removed=[], deleted=[], unknown=[],
> +                ignored=[], clean=[]):

http://docs.python-guide.org/en/latest/writing/gotchas/#mutable-default-arguments 
- this is the only significant issue I see with this series.

> +        return tuple.__new__(cls, (modified, added, removed, deleted, unknown,
> +                ignored, clean))
> +
> +    @property
> +    def modified(self):
> +        return self[0]

I think I would prefer to store the members by name (in slots?) and 
avoid doing the array index lookup. That would make it more like an 
object that behaved like a tuple than the opposite.

But these status objects should however never be created in tight loops 
anyway, so these (and other) optimizations might be premature.

If we really want to optimize, it could perhaps make sense to use our 
propertycache ... but I don't know where on this slippery lane we want 
to stop.

> +
> +    @property
> +    def added(self):
> +        return self[1]
> +
> +    @property
> +    def removed(self):
> +        return self[2]
> +
> +    @property
> +    def deleted(self):
> +        return self[3]
> +
> +    @property
> +    def unknown(self):
> +        return self[4]
> +
> +    @property
> +    def ignored(self):
> +        return self[5]
> +
> +    @property
> +    def clean(self):
> +        return self[6]
> +
> +    def __repr__(self, *args, **kwargs):
> +        return (('status(modified=%r, added=%r, removed=%r, deleted=%r, ' +
> +                 'unknown=%r, ignored=%r, clean=%r)') % self)

repr usually looks like '<status yada yada>'  - I would expect this to 
follow the same pattern. But ok, this is how Python does it ...

/Mads
Yuya Nishihara - Oct. 7, 2014, 12:05 p.m.
On Mon, 06 Oct 2014 20:20:07 +0200, Mads Kiilerich wrote:
> On 10/05/2014 08:08 AM, Martin von Zweigbergk wrote:
> > +class status(tuple):
> > +    '''Named tuple with a list of files per status.
> > +    '''
> 
> Hmm. There are several things I would like to point out ... but they 
> seem to all be inherited from the Python namedtuple. It might make sense 
> to stay as similar as possible but slightly different.
> 
> Why not use a generic named tuple implementation ... especially on 
> Python 2.6+ where it is built-in? There might be good reasons, but they 
> should documented.
> 
> I guess one reason is that the standard implementation creates the class 
> as source code and execs it. -1 for elegance, +1 for getting the work done.
> 
> Another reason is that this implementation defaults to having 
> unspecified parameters as empty lists ... but is that really relevant / 
> necessary?
> 
> > +
> > +    __slots__ = ()
> > +
> > +    def __new__(cls, modified=[], added=[], removed=[], deleted=[], unknown=[],
> > +                ignored=[], clean=[]):
> 
> http://docs.python-guide.org/en/latest/writing/gotchas/#mutable-default-arguments 
> - this is the only significant issue I see with this series.
> 
> > +        return tuple.__new__(cls, (modified, added, removed, deleted, unknown,
> > +                ignored, clean))
> > +
> > +    @property
> > +    def modified(self):
> > +        return self[0]
> 
> I think I would prefer to store the members by name (in slots?) and 
> avoid doing the array index lookup. That would make it more like an 
> object that behaved like a tuple than the opposite.

tuple can't have non-empty slot, so it needs __dict__ in order to store members
by name.

property access:

% python -m timeit -s 'class status(tuple):
    __slots__ = ()
    def __new__(cls, modified):
        return tuple.__new__(cls, (modified,))
    @property
    def modified(self):
        return self[0]
o = status([])' 'o.modified'
10000000 loops, best of 3: 0.153 usec per loop

named variable in __dict__:

% python -m timeit -s 'class status(tuple):
    def __new__(cls, modified):
        o = tuple.__new__(cls, (modified,))
        o.modified = modified
        return o
o = status([])' 'o.modified'
10000000 loops, best of 3: 0.0361 usec per loop

direct index access:

% python -m timeit -s 'class status(tuple):
    __slots__ = ()
    def __new__(cls, modified):
        return tuple.__new__(cls, (modified,))
    @property
    def modified(self):
        return self[0]
o = status([])' 'o[0]'
10000000 loops, best of 3: 0.0336 usec per loop

I don't know if this difference matters.

Regards,
Martin von Zweigbergk - Oct. 7, 2014, 8:27 p.m.
On Tue, Oct 7, 2014 at 5:05 AM, Yuya Nishihara <yuya@tcha.org> wrote:
> On Mon, 06 Oct 2014 20:20:07 +0200, Mads Kiilerich wrote:
>> On 10/05/2014 08:08 AM, Martin von Zweigbergk wrote:
>> > +class status(tuple):
>> > +    '''Named tuple with a list of files per status.
>> > +    '''
>>
>> Hmm. There are several things I would like to point out ... but they
>> seem to all be inherited from the Python namedtuple. It might make sense
>> to stay as similar as possible but slightly different.
>>
>> Why not use a generic named tuple implementation ... especially on
>> Python 2.6+ where it is built-in? There might be good reasons, but they
>> should documented.
>>
>> I guess one reason is that the standard implementation creates the class
>> as source code and execs it. -1 for elegance, +1 for getting the work done.
>>
>> Another reason is that this implementation defaults to having
>> unspecified parameters as empty lists ... but is that really relevant /
>> necessary?
>>
>> > +
>> > +    __slots__ = ()
>> > +
>> > +    def __new__(cls, modified=[], added=[], removed=[], deleted=[], unknown=[],
>> > +                ignored=[], clean=[]):
>>
>> http://docs.python-guide.org/en/latest/writing/gotchas/#mutable-default-arguments
>> - this is the only significant issue I see with this series.
>>
>> > +        return tuple.__new__(cls, (modified, added, removed, deleted, unknown,
>> > +                ignored, clean))
>> > +
>> > +    @property
>> > +    def modified(self):
>> > +        return self[0]
>>
>> I think I would prefer to store the members by name (in slots?) and
>> avoid doing the array index lookup. That would make it more like an
>> object that behaved like a tuple than the opposite.
>
> tuple can't have non-empty slot, so it needs __dict__ in order to store members
> by name.
>
> property access:
>
> % python -m timeit -s 'class status(tuple):
>     __slots__ = ()
>     def __new__(cls, modified):
>         return tuple.__new__(cls, (modified,))
>     @property
>     def modified(self):
>         return self[0]
> o = status([])' 'o.modified'
> 10000000 loops, best of 3: 0.153 usec per loop
>
> named variable in __dict__:
>
> % python -m timeit -s 'class status(tuple):
>     def __new__(cls, modified):
>         o = tuple.__new__(cls, (modified,))
>         o.modified = modified
>         return o
> o = status([])' 'o.modified'
> 10000000 loops, best of 3: 0.0361 usec per loop
>
> direct index access:
>
> % python -m timeit -s 'class status(tuple):
>     __slots__ = ()
>     def __new__(cls, modified):
>         return tuple.__new__(cls, (modified,))
>     @property
>     def modified(self):
>         return self[0]
> o = status([])' 'o[0]'
> 10000000 loops, best of 3: 0.0336 usec per loop
>
> I don't know if this difference matters.

Thanks for checking. I doubt that last difference matters. I doubt
even the the slowness of the first version matters, but I don't mind
changing to whichever form others think is the best balance between
brevity and speed.
Martin von Zweigbergk - Oct. 10, 2014, 5:21 p.m.
On Mon, Oct 6, 2014 at 11:20 AM, Mads Kiilerich <mads@kiilerich.com> wrote:
> On 10/05/2014 08:08 AM, Martin von Zweigbergk wrote:
>>
>> @@ -26,6 +26,49 @@
>>       def join(self, obj, fname):
>>           return obj._join(fname)
>>   +class status(tuple):
>> +    '''Named tuple with a list of files per status.
>> +    '''
>
>
> Hmm. There are several things I would like to point out ... but they seem to
> all be inherited from the Python namedtuple. It might make sense to stay as
> similar as possible but slightly different.

Several things you would like to point out in the docstring? I know
the docstring is very short, but I also don't know what else to put
there. I considered moving the descriptions for the different statuses
from the dirstate.status() method. I might do that in v3.

> Why not use a generic named tuple implementation ... especially on Python
> 2.6+ where it is built-in? There might be good reasons, but they should
> documented.

We have to support Python 2.4-2.5 too, right? Would I have to check
the version (or existence of named_tuple)? Or back-port it? It just
sounds like extra code for little benefit to me, so please explain to
a poor Python newbie.

> I guess one reason is that the standard implementation creates the class as
> source code and execs it. -1 for elegance, +1 for getting the work done.
>
> Another reason is that this implementation defaults to having unspecified
> parameters as empty lists ... but is that really relevant / necessary?
>
>> +
>> +    __slots__ = ()
>> +
>> +    def __new__(cls, modified=[], added=[], removed=[], deleted=[],
>> unknown=[],
>> +                ignored=[], clean=[]):
>
>
> http://docs.python-guide.org/en/latest/writing/gotchas/#mutable-default-arguments
> - this is the only significant issue I see with this series.

Wow, that's fun. Will use None as default instead.

>> +        return tuple.__new__(cls, (modified, added, removed, deleted,
>> unknown,
>> +                ignored, clean))
>> +
>> +    @property
>> +    def modified(self):
>> +        return self[0]
>
>
> I think I would prefer to store the members by name (in slots?) and avoid
> doing the array index lookup. That would make it more like an object that
> behaved like a tuple than the opposite.
>
> But these status objects should however never be created in tight loops
> anyway, so these (and other) optimizations might be premature.

That's what I think, as I said in reply to Yuya's email. Since I heard
no complaints, I'll leave it as is until we've seen that it does
matter.

>> +    def __repr__(self, *args, **kwargs):
>> +        return (('status(modified=%r, added=%r, removed=%r, deleted=%r, '
>> +
>> +                 'unknown=%r, ignored=%r, clean=%r)') % self)
>
>
> repr usually looks like '<status yada yada>'  - I would expect this to
> follow the same pattern. But ok, this is how Python does it ...

Will change.
Mads Kiilerich - Oct. 10, 2014, 5:27 p.m.
On 10/10/2014 07:21 PM, Martin von Zweigbergk wrote:
> On Mon, Oct 6, 2014 at 11:20 AM, Mads Kiilerich <mads@kiilerich.com> wrote:
>> On 10/05/2014 08:08 AM, Martin von Zweigbergk wrote:
>>> @@ -26,6 +26,49 @@
>>>        def join(self, obj, fname):
>>>            return obj._join(fname)
>>>    +class status(tuple):
>>> +    '''Named tuple with a list of files per status.
>>> +    '''
>>
>> Hmm. There are several things I would like to point out ... but they seem to
>> all be inherited from the Python namedtuple. It might make sense to stay as
>> similar as possible but slightly different.
> Several things you would like to point out in the docstring?

No - about the whole use of named tuple. As you can see, I don't know 
exactly what my opinion is and my attempts at making an opinion are 
inconsistent. But I hope it adds value to raise the questions ;-)


>> Why not use a generic named tuple implementation ... especially on Python
>> 2.6+ where it is built-in? There might be good reasons, but they should
>> documented.
> We have to support Python 2.4-2.5 too, right? Would I have to check
> the version (or existence of named_tuple)? Or back-port it? It just
> sounds like extra code for little benefit to me, so please explain to
> a poor Python newbie.

It would perhaps be nice if we automatically could start using the 
standard named tuple _when_ we drop Python 2.4-2.5. But I realize the 
2.6 named tuple not is so elegant. Having it explicit might be more 
elegant. But as mentioned on IRC, I wonder if something simpler would be 
"better".

/Mads

Patch

diff --git a/hgext/largefiles/overrides.py b/hgext/largefiles/overrides.py
--- a/hgext/largefiles/overrides.py
+++ b/hgext/largefiles/overrides.py
@@ -11,6 +11,7 @@ 
 import os
 import copy
 
+from mercurial import dirstate
 from mercurial import hg, commands, util, cmdutil, scmutil, match as match_, \
         archival, pathutil, revset
 from mercurial.i18n import _
@@ -1149,7 +1150,8 @@ 
         modified, added, removed, deleted, unknown, ignored, clean = r
         unknown = [f for f in unknown if lfdirstate[f] == '?']
         ignored = [f for f in ignored if lfdirstate[f] == '?']
-        return modified, added, removed, deleted, unknown, ignored, clean
+        return dirstate.status(modified, added, removed, deleted, unknown,
+                               ignored, clean)
     repo.status = overridestatus
     orig(ui, repo, *dirs, **opts)
     repo.status = oldstatus
diff --git a/hgext/largefiles/reposetup.py b/hgext/largefiles/reposetup.py
--- a/hgext/largefiles/reposetup.py
+++ b/hgext/largefiles/reposetup.py
@@ -10,6 +10,7 @@ 
 import copy
 import os
 
+from mercurial import dirstate
 from mercurial import error, manifest, match as match_, util
 from mercurial.i18n import _
 from mercurial import localrepo
@@ -242,7 +243,7 @@ 
                     wlock.release()
 
             self.lfstatus = True
-            return result
+            return dirstate.status(*result)
 
         # As part of committing, copy all of the largefiles into the
         # cache.
diff --git a/mercurial/context.py b/mercurial/context.py
--- a/mercurial/context.py
+++ b/mercurial/context.py
@@ -7,6 +7,7 @@ 
 
 from node import nullid, nullrev, short, hex, bin
 from i18n import _
+import dirstate
 import mdiff, error, util, scmutil, subrepo, patch, encoding, phases
 import match as matchmod
 import os, errno, stat
@@ -340,8 +341,7 @@ 
         for l in r:
             l.sort()
 
-        # we return a tuple to signify that this list isn't changing
-        return tuple(r)
+        return dirstate.status(*r)
 
 
 def makememctx(repo, parents, text, user, date, branch, files, store,
@@ -1482,7 +1482,7 @@ 
         # (s[1] is 'added' and s[2] is 'removed')
         s = list(s)
         s[1], s[2] = s[2], s[1]
-        return tuple(s)
+        return dirstate.status(*s)
 
 class committablefilectx(basefilectx):
     """A committablefilectx provides common functionality for a file context
diff --git a/mercurial/dirstate.py b/mercurial/dirstate.py
--- a/mercurial/dirstate.py
+++ b/mercurial/dirstate.py
@@ -26,6 +26,49 @@ 
     def join(self, obj, fname):
         return obj._join(fname)
 
+class status(tuple):
+    '''Named tuple with a list of files per status.
+    '''
+
+    __slots__ = ()
+
+    def __new__(cls, modified=[], added=[], removed=[], deleted=[], unknown=[],
+                ignored=[], clean=[]):
+        return tuple.__new__(cls, (modified, added, removed, deleted, unknown,
+                ignored, clean))
+
+    @property
+    def modified(self):
+        return self[0]
+
+    @property
+    def added(self):
+        return self[1]
+
+    @property
+    def removed(self):
+        return self[2]
+
+    @property
+    def deleted(self):
+        return self[3]
+
+    @property
+    def unknown(self):
+        return self[4]
+
+    @property
+    def ignored(self):
+        return self[5]
+
+    @property
+    def clean(self):
+        return self[6]
+
+    def __repr__(self, *args, **kwargs):
+        return (('status(modified=%r, added=%r, removed=%r, deleted=%r, ' +
+                 'unknown=%r, ignored=%r, clean=%r)') % self)
+
 class dirstate(object):
 
     def __init__(self, opener, ui, root, validate):
@@ -900,8 +943,8 @@ 
             elif state == 'r':
                 radd(fn)
 
-        return (lookup, (modified, added, removed, deleted, unknown, ignored,
-                         clean))
+        return (lookup, status(modified, added, removed, deleted, unknown,
+                               ignored, clean))
 
     def matches(self, match):
         '''
diff --git a/mercurial/subrepo.py b/mercurial/subrepo.py
--- a/mercurial/subrepo.py
+++ b/mercurial/subrepo.py
@@ -5,6 +5,7 @@ 
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
+import dirstate
 import errno, os, re, shutil, posixpath, sys
 import xml.dom.minidom
 import stat, subprocess, tarfile
@@ -448,7 +449,7 @@ 
         return 1
 
     def status(self, rev2, **opts):
-        return [], [], [], [], [], [], []
+        return dirstate.status()
 
     def diff(self, ui, diffopts, node2, match, prefix, **opts):
         pass
@@ -650,7 +651,7 @@ 
         except error.RepoLookupError, inst:
             self._repo.ui.warn(_('warning: error "%s" in subrepository "%s"\n')
                                % (inst, subrelpath(self)))
-            return [], [], [], [], [], [], []
+            return dirstate.status()
 
     @annotatesubrepoerror
     def diff(self, ui, diffopts, node2, match, prefix, **opts):
@@ -1562,7 +1563,7 @@ 
         rev1 = self._state[1]
         if self._gitmissing() or not rev1:
             # if the repo is missing, return no results
-            return [], [], [], [], [], [], []
+            return dirstate.status()
         modified, added, removed = [], [], []
         self._gitupdatestat()
         if rev2:
@@ -1582,8 +1583,7 @@ 
             elif status == 'D':
                 removed.append(f)
 
-        deleted = unknown = ignored = clean = []
-        return modified, added, removed, deleted, unknown, ignored, clean
+        return dirstate.status(modified, added, removed)
 
     def shortid(self, revid):
         return revid[:7]