Patchwork [01 of 10] largefiles: centralize the logic to get outgoing largefiles

Submitter Katsunori FUJIWARA
Date March 5, 2014, 12:12 p.m.
Message ID <655cacfc3311c174c995.1394021526@juju>
Permalink /patch/3848/
State Superseded

Comments

Katsunori FUJIWARA - March 5, 2014, 12:12 p.m.
# HG changeset patch
# User FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
# Date 1394019743 -32400
#      Wed Mar 05 20:42:23 2014 +0900
# Node ID 655cacfc3311c174c995e4bab13755b498cb43dc
# Parent  779ceb84f4f782d32dfe47f6684107c08d2f6142
largefiles: centralize the logic to get outgoing largefiles

Before this patch, "overrides.getoutgoinglfiles()" (called by
"overrideoutgoing()" and "overridesummary()") and "lfilesrepo.push()"
implement similar logic to get outgoing largefiles separately.

This patch centralizes the logic to get outgoing largefiles in
"lfutil.getlfilestoupload()".
Mads Kiilerich - March 6, 2014, 2:48 p.m.
Thanks for these patches. They are a nice improvement.

On 03/05/2014 01:12 PM, FUJIWARA Katsunori wrote:
> # HG changeset patch
> # User FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
> # Date 1394019743 -32400
> #      Wed Mar 05 20:42:23 2014 +0900
> # Node ID 655cacfc3311c174c995e4bab13755b498cb43dc
> # Parent  779ceb84f4f782d32dfe47f6684107c08d2f6142
> largefiles: centralize the logic to get outgoing largefiles
>
> Before this patch, "overrides.getoutgoinglfiles()" (called by
> "overrideoutgoing()" and "overridesummary()") and "lfilesrepo.push()"
> implement similar logic to get outgoing largefiles separately.

Please elaborate a bit. It took me some time to figure the following out 
(feel free to use it):

The difference between these two was that one returned the names of the 
standins across all revisions, while the other returned the largefile 
hashes across all standins.
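
(Roughly, and just for illustration -- the values below are made up:

    # overrides.getoutgoinglfiles(): standin names
    set(['.hglf/big.dat', '.hglf/data/huge.bin'])

    # lfilesrepo.push(): largefile hashes read from the standins
    set(['e3b0c44298fc...', '7d865e959b24...'])
)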

> This patch centralizes the logic to get outgoing largefiles in
> "lfutil.getlfilestoupload()".
>
> diff --git a/hgext/largefiles/lfutil.py b/hgext/largefiles/lfutil.py
> --- a/hgext/largefiles/lfutil.py
> +++ b/hgext/largefiles/lfutil.py
> @@ -15,6 +15,7 @@
>   
>   from mercurial import dirstate, httpconnection, match as match_, util, scmutil
>   from mercurial.i18n import _
> +from mercurial import node

Please extend the existing 'from mercurial import' list ... or at least 
group them together.

>   shortname = '.hglf'
>   shortnameslash = shortname + '/'
> @@ -365,3 +366,31 @@
>           if f[0] not in filelist:
>               filelist.append(f[0])
>       return filelist
> +
> +def getlfilestoupload(repo, missing, mapfunc=None):
> +    toupload = set()
> +    if not missing:
> +        return toupload

Does this check make any difference? Looping over an empty list is free.

> +    if not mapfunc:
> +        mapfunc = lambda f, ctx: f

I don't think this mapfunc thing is pretty.

In the end, it would be nice to tell in the UI which largefile we are 
pulling/pushing. Not only the hash, but also the largefile name (or one 
of them).

Currently outgoing and summary will list all largefiles in outgoing 
changesets, no matter if they really need pushing or not. Arguably it 
should only list the largefiles that really are outgoing. For that we 
need the hashes.

So how about letting this function return a dict mapping from hash to a 
filename that uses it? For now the two users of this function can just 
look at either keys or values.
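
Something like this rough, untested sketch (merge-parent handling elided,
same helpers as in the patch):

    def getlfilestoupload(repo, missing):
        # {largefile hash: a standin name that refers to it}
        toupload = {}
        for n in missing:
            ctx = repo[n]
            files = set(ctx.files())  # plus the merge handling as before
            for f in files:
                if isstandin(f) and f in ctx:
                    toupload[ctx[f].data().strip()] = f
        return toupload

Then overrides can keep using the values (standin names) and push can use
the keys (hashes).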

/Mads

> +    for n in missing:
> +        parents = [p for p in repo.changelog.parents(n) if p != node.nullid]
> +        ctx = repo[n]
> +        files = set(ctx.files())
> +        if len(parents) == 2:
> +            mc = ctx.manifest()
> +            mp1 = ctx.parents()[0].manifest()
> +            mp2 = ctx.parents()[1].manifest()
> +            for f in mp1:
> +                if f not in mc:
> +                    files.add(f)
> +            for f in mp2:
> +                if f not in mc:
> +                    files.add(f)
> +            for f in mc:
> +                if mc[f] != mp1.get(f, None) or mc[f] != mp2.get(f, None):
> +                    files.add(f)
> +        toupload = toupload.union(set([mapfunc(f, ctx)
> +                                       for f in files
> +                                       if isstandin(f) and f in ctx]))
> +    return toupload
> diff --git a/hgext/largefiles/overrides.py b/hgext/largefiles/overrides.py
> --- a/hgext/largefiles/overrides.py
> +++ b/hgext/largefiles/overrides.py
> @@ -12,7 +12,7 @@
>   import copy
>   
>   from mercurial import hg, commands, util, cmdutil, scmutil, match as match_, \
> -        node, archival, error, merge, discovery, pathutil, revset
> +        archival, error, merge, discovery, pathutil, revset
>   from mercurial.i18n import _
>   from mercurial.node import hex
>   from hgext import rebase
> @@ -999,27 +999,7 @@
>       if opts.get('newest_first'):
>           o.reverse()
>   
> -    toupload = set()
> -    for n in o:
> -        parents = [p for p in repo.changelog.parents(n) if p != node.nullid]
> -        ctx = repo[n]
> -        files = set(ctx.files())
> -        if len(parents) == 2:
> -            mc = ctx.manifest()
> -            mp1 = ctx.parents()[0].manifest()
> -            mp2 = ctx.parents()[1].manifest()
> -            for f in mp1:
> -                if f not in mc:
> -                        files.add(f)
> -            for f in mp2:
> -                if f not in mc:
> -                    files.add(f)
> -            for f in mc:
> -                if mc[f] != mp1.get(f, None) or mc[f] != mp2.get(f, None):
> -                    files.add(f)
> -        toupload = toupload.union(
> -            set([f for f in files if lfutil.isstandin(f) and f in ctx]))
> -    return sorted(toupload)
> +    return sorted(lfutil.getlfilestoupload(repo, o))
>   
>   def overrideoutgoing(orig, ui, repo, dest=None, **opts):
>       result = orig(ui, repo, dest, **opts)
> diff --git a/hgext/largefiles/reposetup.py b/hgext/largefiles/reposetup.py
> --- a/hgext/largefiles/reposetup.py
> +++ b/hgext/largefiles/reposetup.py
> @@ -11,7 +11,6 @@
>   import os
>   
>   from mercurial import error, manifest, match as match_, util, discovery
> -from mercurial import node as node_
>   from mercurial.i18n import _
>   from mercurial import localrepo
>   
> @@ -418,32 +417,9 @@
>               outgoing = discovery.findcommonoutgoing(repo, remote.peer(),
>                                                       force=force)
>               if outgoing.missing:
> -                toupload = set()
>                   o = self.changelog.nodesbetween(outgoing.missing, revs)[0]
> -                for n in o:
> -                    parents = [p for p in self.changelog.parents(n)
> -                               if p != node_.nullid]
> -                    ctx = self[n]
> -                    files = set(ctx.files())
> -                    if len(parents) == 2:
> -                        mc = ctx.manifest()
> -                        mp1 = ctx.parents()[0].manifest()
> -                        mp2 = ctx.parents()[1].manifest()
> -                        for f in mp1:
> -                            if f not in mc:
> -                                files.add(f)
> -                        for f in mp2:
> -                            if f not in mc:
> -                                files.add(f)
> -                        for f in mc:
> -                            if mc[f] != mp1.get(f, None) or mc[f] != mp2.get(f,
> -                                    None):
> -                                files.add(f)
> -
> -                    toupload = toupload.union(
> -                        set([ctx[f].data().strip()
> -                             for f in files
> -                             if lfutil.isstandin(f) and f in ctx]))
> +                mapfunc = lambda f, ctx: ctx[f].data().strip()
> +                toupload = lfutil.getlfilestoupload(self, o, mapfunc)
>                   lfcommands.uploadlfiles(ui, self, remote, toupload)
>               return super(lfilesrepo, self).push(remote, force=force, revs=revs,
>                   newbranch=newbranch)
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
Katsunori FUJIWARA - March 7, 2014, 2:47 p.m.
At Thu, 06 Mar 2014 15:48:20 +0100,
Mads Kiilerich wrote:
> 
> Thanks for these patches. They are a nice improvement.
> 
> On 03/05/2014 01:12 PM, FUJIWARA Katsunori wrote:
> > # HG changeset patch
> > # User FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
> > # Date 1394019743 -32400
> > #      Wed Mar 05 20:42:23 2014 +0900
> > # Node ID 655cacfc3311c174c995e4bab13755b498cb43dc
> > # Parent  779ceb84f4f782d32dfe47f6684107c08d2f6142
> > largefiles: centralize the logic to get outgoing largefiles
> >
> > Before this patch, "overrides.getoutgoinglfiles()" (called by
> > "overrideoutgoing()" and "overridesummary()") and "lfilesrepo.push()"
> > implement similar logic to get outgoing largefiles separately.
> 
> Please elaborate a bit. It took me some time to figure the following out 
> (feel free to use it):
> 
> The difference between these two was that one returned the names of the 
> standins across all revisions, while the other returned the largefile 
> hashes across all standins.

I'll try to explain more clearly.


> > This patch centralizes the logic to get outgoing largefiles in
> > "lfutil.getlfilestoupload()".
> >
> > diff --git a/hgext/largefiles/lfutil.py b/hgext/largefiles/lfutil.py
> > --- a/hgext/largefiles/lfutil.py
> > +++ b/hgext/largefiles/lfutil.py
> > @@ -15,6 +15,7 @@
> >   
> >   from mercurial import dirstate, httpconnection, match as match_, util, scmutil
> >   from mercurial.i18n import _
> > +from mercurial import node
> 
> Please extend the existing 'from mercurial import' list ... or at least 
> group them together.

OK, I'll insert the new import line before "from mercurial.i18n import _",
because the existing "from mercurial import" line seems too long to be
extended.
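
So the top of the import block in lfutil.py would look something like:

    from mercurial import dirstate, httpconnection, match as match_, util, scmutil
    from mercurial import node
    from mercurial.i18n import _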


> >   shortname = '.hglf'
> >   shortnameslash = shortname + '/'
> > @@ -365,3 +366,31 @@
> >           if f[0] not in filelist:
> >               filelist.append(f[0])
> >       return filelist
> > +
> > +def getlfilestoupload(repo, missing, mapfunc=None):
> > +    toupload = set()
> > +    if not missing:
> > +        return toupload
> 
> Does this check make any difference? Looping over an empty list is free.

"missing" will be bound to "None" if there are no outgoing revisions,
and it causes exception raising at looping.

Should I explain it in a comment?
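
(Without the check, the "None" case would fail like:

    >>> for n in None:
    ...     pass
    TypeError: 'NoneType' object is not iterable
)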


> > +    if not mapfunc:
> > +        mapfunc = lambda f, ctx: f
> 
> I don't think this mapfunc thing is pretty.
> 
> In the end, it would be nice to tell in the UI which largefile we are 
> pulling/pushing. Not only the hash, but also the largefile name (or one 
> of them).
> 
> Currently outgoing and summary will list all largefiles in outgoing 
> changesets, no matter if they really need pushing or not. Arguably it 
> should only list the largefiles that really are outgoing. For that we 
> need the hashes.

Do you mean that "getlfilestoupload()" should also return hashes, so that
outgoing/summary can be improved in the future to list or count only the
largefiles that are really outgoing?


> So how about letting this function return a dict mapping from hash to a 
> filename that uses it? For now the two users of this function can just 
> look at either keys or values.

I chose this implementation (= passing a map function, if needed) to avoid
the performance impacts below when there are many outgoing largefiles:

  - always returning hashes may slow down outgoing/summary, because the
    "ctx[f].data().strip()" calls are meaningless for them

  - mapping "filename/context => hash" over the result of
    "getlfilestoupload()" means scanning a (maybe large) list again

If these are not so serious, and/or you intend to improve outgoing/summary
in the future as I guessed above, I'll change "getlfilestoupload()" as you
suggested.

 
> /Mads
> 
> > +    for n in missing:
> > +        parents = [p for p in repo.changelog.parents(n) if p != node.nullid]
> > +        ctx = repo[n]
> > +        files = set(ctx.files())
> > +        if len(parents) == 2:
> > +            mc = ctx.manifest()
> > +            mp1 = ctx.parents()[0].manifest()
> > +            mp2 = ctx.parents()[1].manifest()
> > +            for f in mp1:
> > +                if f not in mc:
> > +                    files.add(f)
> > +            for f in mp2:
> > +                if f not in mc:
> > +                    files.add(f)
> > +            for f in mc:
> > +                if mc[f] != mp1.get(f, None) or mc[f] != mp2.get(f, None):
> > +                    files.add(f)
> > +        toupload = toupload.union(set([mapfunc(f, ctx)
> > +                                       for f in files
> > +                                       if isstandin(f) and f in ctx]))
> > +    return toupload
> > diff --git a/hgext/largefiles/overrides.py b/hgext/largefiles/overrides.py
> > --- a/hgext/largefiles/overrides.py
> > +++ b/hgext/largefiles/overrides.py
> > @@ -12,7 +12,7 @@
> >   import copy
> >   
> >   from mercurial import hg, commands, util, cmdutil, scmutil, match as match_, \
> > -        node, archival, error, merge, discovery, pathutil, revset
> > +        archival, error, merge, discovery, pathutil, revset
> >   from mercurial.i18n import _
> >   from mercurial.node import hex
> >   from hgext import rebase
> > @@ -999,27 +999,7 @@
> >       if opts.get('newest_first'):
> >           o.reverse()
> >   
> > -    toupload = set()
> > -    for n in o:
> > -        parents = [p for p in repo.changelog.parents(n) if p != node.nullid]
> > -        ctx = repo[n]
> > -        files = set(ctx.files())
> > -        if len(parents) == 2:
> > -            mc = ctx.manifest()
> > -            mp1 = ctx.parents()[0].manifest()
> > -            mp2 = ctx.parents()[1].manifest()
> > -            for f in mp1:
> > -                if f not in mc:
> > -                        files.add(f)
> > -            for f in mp2:
> > -                if f not in mc:
> > -                    files.add(f)
> > -            for f in mc:
> > -                if mc[f] != mp1.get(f, None) or mc[f] != mp2.get(f, None):
> > -                    files.add(f)
> > -        toupload = toupload.union(
> > -            set([f for f in files if lfutil.isstandin(f) and f in ctx]))
> > -    return sorted(toupload)
> > +    return sorted(lfutil.getlfilestoupload(repo, o))
> >   
> >   def overrideoutgoing(orig, ui, repo, dest=None, **opts):
> >       result = orig(ui, repo, dest, **opts)
> > diff --git a/hgext/largefiles/reposetup.py b/hgext/largefiles/reposetup.py
> > --- a/hgext/largefiles/reposetup.py
> > +++ b/hgext/largefiles/reposetup.py
> > @@ -11,7 +11,6 @@
> >   import os
> >   
> >   from mercurial import error, manifest, match as match_, util, discovery
> > -from mercurial import node as node_
> >   from mercurial.i18n import _
> >   from mercurial import localrepo
> >   
> > @@ -418,32 +417,9 @@
> >               outgoing = discovery.findcommonoutgoing(repo, remote.peer(),
> >                                                       force=force)
> >               if outgoing.missing:
> > -                toupload = set()
> >                   o = self.changelog.nodesbetween(outgoing.missing, revs)[0]
> > -                for n in o:
> > -                    parents = [p for p in self.changelog.parents(n)
> > -                               if p != node_.nullid]
> > -                    ctx = self[n]
> > -                    files = set(ctx.files())
> > -                    if len(parents) == 2:
> > -                        mc = ctx.manifest()
> > -                        mp1 = ctx.parents()[0].manifest()
> > -                        mp2 = ctx.parents()[1].manifest()
> > -                        for f in mp1:
> > -                            if f not in mc:
> > -                                files.add(f)
> > -                        for f in mp2:
> > -                            if f not in mc:
> > -                                files.add(f)
> > -                        for f in mc:
> > -                            if mc[f] != mp1.get(f, None) or mc[f] != mp2.get(f,
> > -                                    None):
> > -                                files.add(f)
> > -
> > -                    toupload = toupload.union(
> > -                        set([ctx[f].data().strip()
> > -                             for f in files
> > -                             if lfutil.isstandin(f) and f in ctx]))
> > +                mapfunc = lambda f, ctx: ctx[f].data().strip()
> > +                toupload = lfutil.getlfilestoupload(self, o, mapfunc)
> >                   lfcommands.uploadlfiles(ui, self, remote, toupload)
> >               return super(lfilesrepo, self).push(remote, force=force, revs=revs,
> >                   newbranch=newbranch)
> > _______________________________________________
> > Mercurial-devel mailing list
> > Mercurial-devel@selenic.com
> > http://selenic.com/mailman/listinfo/mercurial-devel
> 
> 


----------------------------------------------------------------------
[FUJIWARA Katsunori]                             foozy@lares.dti.ne.jp
Mads Kiilerich - March 7, 2014, 7:53 p.m.
On 03/07/2014 03:47 PM, FUJIWARA Katsunori wrote:
> At Thu, 06 Mar 2014 15:48:20 +0100, Mads Kiilerich wrote:
>>>    shortname = '.hglf'
>>>    shortnameslash = shortname + '/'
>>> @@ -365,3 +366,31 @@
>>>            if f[0] not in filelist:
>>>                filelist.append(f[0])
>>>        return filelist
>>> +
>>> +def getlfilestoupload(repo, missing, mapfunc=None):
>>> +    toupload = set()
>>> +    if not missing:
>>> +        return toupload
>> Does this check make any difference? Looping over an empty list is free.
> "missing" will be bound to "None" if there are no outgoing revisions,
> and it causes exception raising at looping.

Really? It seems like this function in both places replaces a 'for n in 
o' loop?

(Btw: it would be nice to have a docstring that says what 'missing' is and 
that the function returns a set of mapfunc results.)
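
Something like this, for example:

    def getlfilestoupload(repo, missing, mapfunc=None):
        '''Return the set of mapfunc(standin, ctx) results for all standins
        touched by the changesets whose nodes are listed in missing.'''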

>>> +    if not mapfunc:
>>> +        mapfunc = lambda f, ctx: f
>> I don't think this mapfunc thing is pretty.
>>
>> In the end, it would be nice to tell in the UI which largefile we are
>> pulling/pushing. Not only the hash, but also the largefile name (or one
>> of them).
>>
>> Currently outgoing and summary will list all largefiles in outgoing
>> changesets, no matter if they really need pushing or not. Arguably it
>> should only list the largefiles that really are outgoing. For that we
>> need the hashes.
> Do you mean that "getlfilestoupload()" should also return hashes, so that
> outgoing/summary can be improved in the future to list or count only the
> largefiles that are really outgoing?

Yes, I think so. Outgoing/summary do not show all revisions - they only 
show the revisions that are outgoing. Currently outgoing/summary show 
all largefiles in these revisions, no matter if they are outgoing or not 
(if I understand it correctly) - that seems like a bug. Instead they 
should only show the largefiles that actually will be pushed. There 
could be different ways of indicating that. Showing the involved 
largefile names could be one. Showing the actual number of largefiles 
(unique hashes) could be another, and yet another could be to show the 
total amount of bytes.

What do you think about this?

>> So how about letting this function return a dict mapping from hash to a
>> filename that uses it? For now the two users of this function can just
>> look at either keys or values.
> I chose this (= passing map function, if needed) implementation to
> avoid performance impact below when there are many outgoing
> largefiles:
>
>    - returning hashes always may slow down outgoing/summary by
>      meaningless "ctx[f].data().strip()" for them

A largefile is still a big thing and expensive to shuffle around due to 
its size (and the general design of what it is). My gut feeling is that 
an extra call to .data() will be a small extra cost in the greater 
picture ... especially considering that it (and an lstat call to the 
remote location) is essential for actually giving a useful result for 
outgoing/summary.

>    - trying "filename/context => hash" on the result of
>      "getlfilestoupload()" causes scanning (maybe large) list again

Sorry, I don't understand that.

My thought would be that the function should return a "hash => 
one-of-the-filenames" mapping. I don't think there would be a need for 
any scanning ... or how it would be a problem.

> If these are not so serious and/or you intend improvement of
> outgoing/summary in the future as I guess above, I'll change
> "getlfilestoupload()" as you mentioned.

I don't have any particular plans for that. There are lots of bigger 
problems with largefiles - this nice series is addressing one of them.


Anyway: These were just my thoughts. If you still prefer the way you 
proposed to do these changes then it is fine with me.

/Mads
Katsunori FUJIWARA - March 11, 2014, 10:35 a.m.
At Fri, 07 Mar 2014 20:53:26 +0100,
Mads Kiilerich wrote:
> 
> On 03/07/2014 03:47 PM, FUJIWARA Katsunori wrote:
> > At Thu, 06 Mar 2014 15:48:20 +0100, Mads Kiilerich wrote:
> >>>    shortname = '.hglf'
> >>>    shortnameslash = shortname + '/'
> >>> @@ -365,3 +366,31 @@
> >>>            if f[0] not in filelist:
> >>>                filelist.append(f[0])
> >>>        return filelist
> >>> +
> >>> +def getlfilestoupload(repo, missing, mapfunc=None):
> >>> +    toupload = set()
> >>> +    if not missing:
> >>> +        return toupload
> >> Does this check make any difference? Looping over an empty list is free.
> > "missing" will be bound to "None" if there are no outgoing revisions,
> > and it causes exception raising at looping.
> 
> Really? It seems like this function in both places replaces a 'for n in 
> o' loop?

More exactly, "missing" will be bound to "None" if (1) there are no
outgoing revisions, and (2) "getlfilestoupload()" is invoked with the
value returned by "hg._outgoing()".

But it seems that there is no specific reason for "hg._outgoing()" to
return "None" instead of an empty list (returning "None" seems to come
from 8888e56ac417).

So, I'll clean these code paths up in the revised series.
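
(A hypothetical caller-side sketch -- the actual cleanup may instead make
"hg._outgoing()" itself return an empty list:

    o = hg._outgoing(ui, repo, dest, opts)
    if o is None:
        o = []   # downstream loops then need no special None check
)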


> (Btw: it would be nice with a docstring that says what 'missing' is and 
> that the function returns a set of mapfunc results.)
> 
> >>> +    if not mapfunc:
> >>> +        mapfunc = lambda f, ctx: f
> >> I don't think this mapfunc thing is pretty.
> >>
> >> In the end, it would be nice to tell in the UI which largefile we are
> >> pulling/pushing. Not only the hash, but also the largefile name (or one
> >> of them).
> >>
> >> Currently outgoing and summary will list all largefiles in outgoing
> >> changesets, no matter if they really need pushing or not. Arguably it
> >> should only list the largefiles that really are outgoing. For that we
> >> need the hashes.
> > Do you mean that "getlfilestoupload()" should also return hashes, so that
> > outgoing/summary can be improved in the future to list or count only the
> > largefiles that are really outgoing?
> 
> Yes, I think so. Outgoing/summary do not show all revisions - it only 
> shows the revisions that are outgoing. Currently outgoing/summary shows 
> all largefiles in these revisions, no matter if they are outgoing or not 
> (if I understand it correctly) - that seems like a bug. Instead it 
> should only show the largefiles that actually will be pushed. There 
> could be different ways of indicating that. Showing the involved 
> largefile names could be one. Showing the actual number of largefiles 
> (unique hashes) could be another, and yet another could be to show the 
> total amount of bytes.
> 
> What do you think about this?

It sounds good. How about the messages below?

    largefiles: NN entities to upload for MM largefiles
    path/to/file .....  (only for "hg outgoing")
    ....

and the one below for "hg outgoing --large --debug"?

    largefiles: NN entities to upload for MM largefiles
    path/to/file (hash)
    path/to/file (hash1 hash2 hash3) # for multiple changes
    ....

"NN" is number of unique hashes, and "MM" is number of paths of
largefiles, to be uploaded.
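
(Assuming "getlfilestoupload()" eventually returns a mapping like
{hash: set of largefile names} -- a hypothetical shape, with repo/o/ui
taken from the surrounding outgoing/summary code -- the numbers could be
computed roughly as:

    lfhashes = lfutil.getlfilestoupload(repo, o)
    nn = len(lfhashes)                           # unique hashes
    mm = len(set().union(*lfhashes.values()))    # largefile paths
    ui.status(_('largefiles: %d entities to upload for %d largefiles\n')
              % (nn, mm))
)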


To keep the patches small and simple, I plan to achieve it in the steps
below.

  (1) refactor to avoid redundant discovery

      it just centralizes the logic to check, like this series does.

  (2) check entities which may be uploaded

      get unique hashes of largefiles which may be uploaded to the
      remote, and show detailed messages like the ones above.

      in this step, the existence of largefiles on the remote side is
      still not checked: the messages above may show greater NN/MM than
      the exact ones.

  (3) check entities which should be uploaded exactly

      in addition to (2), check the existence of each largefile on the
      remote side to show exact numbers in outgoing/summary messages.

      this involves "exists" wire access (sketched below) and increases
      the remote access cost.

The patch for (3) can be dropped individually if the wire access turns
out not to be reasonable performance-wise.
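
(A hypothetical sketch of the wire check in (3); "remotehas()" only stands
in for whatever "exists"/statlfile-style call ends up being used:

    # keep only the entities the remote does not already have
    missinghashes = set(h for h in lfhashes if not remotehas(h))
)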


> >> So how about letting this function return a dict mapping from hash to a
> >> filename that uses it? For now the two users of this function can just
> >> look at either keys or values.
> > I chose this (= passing map function, if needed) implementation to
> > avoid performance impact below when there are many outgoing
> > largefiles:
> >
> >    - returning hashes always may slow down outgoing/summary by
> >      meaningless "ctx[f].data().strip()" for them
> 
> A largefile is still a big thing and expensive to shuffle around due to 
> its size (and the general design of what it is). My gut feeling is that 
> an extra call to .data() will be a small extra cost in the greater 
> picture ... especially considering that it (and a lstat call to the 
> remote location) is essential for actually giving a useful result for 
> outgoing/summary.
> 
> >    - trying "filename/context => hash" on the result of
> >      "getlfilestoupload()" causes scanning (maybe large) list again
> 
> Sorry, I don't understand that.
> 
> My thought would be that the function should return a "hash => 
> one-of-the-filenames" mapping. I don't think there would be a need for 
> any scanning ... or how it would be a problem.
> 
> > If these are not so serious and/or you intend improvement of
> > outgoing/summary in the future as I guess above, I'll change
> > "getlfilestoupload()" as you mentioned.
> 
> I don't have any particular plans for that. There are lots of bigger 
> problems with largefiles - this nice series is addressing one of them.
> 
> 
> Anyway: These were just my thoughts. If you still prefer the way you 
> proposed to do these changes then it is fine with me.
> 
> /Mads
> 

----------------------------------------------------------------------
[FUJIWARA Katsunori]                             foozy@lares.dti.ne.jp
Mads Kiilerich - April 14, 2014, 1:39 p.m.
On 03/11/2014 11:35 AM, FUJIWARA Katsunori wrote:
> So, I'll clean these code paths up in revised series.

Hi - any news on this one? I would love to get it in before the next freeze.

/Mads

Patch

diff --git a/hgext/largefiles/lfutil.py b/hgext/largefiles/lfutil.py
--- a/hgext/largefiles/lfutil.py
+++ b/hgext/largefiles/lfutil.py
@@ -15,6 +15,7 @@ 
 
 from mercurial import dirstate, httpconnection, match as match_, util, scmutil
 from mercurial.i18n import _
+from mercurial import node
 
 shortname = '.hglf'
 shortnameslash = shortname + '/'
@@ -365,3 +366,31 @@ 
         if f[0] not in filelist:
             filelist.append(f[0])
     return filelist
+
+def getlfilestoupload(repo, missing, mapfunc=None):
+    toupload = set()
+    if not missing:
+        return toupload
+    if not mapfunc:
+        mapfunc = lambda f, ctx: f
+    for n in missing:
+        parents = [p for p in repo.changelog.parents(n) if p != node.nullid]
+        ctx = repo[n]
+        files = set(ctx.files())
+        if len(parents) == 2:
+            mc = ctx.manifest()
+            mp1 = ctx.parents()[0].manifest()
+            mp2 = ctx.parents()[1].manifest()
+            for f in mp1:
+                if f not in mc:
+                    files.add(f)
+            for f in mp2:
+                if f not in mc:
+                    files.add(f)
+            for f in mc:
+                if mc[f] != mp1.get(f, None) or mc[f] != mp2.get(f, None):
+                    files.add(f)
+        toupload = toupload.union(set([mapfunc(f, ctx)
+                                       for f in files
+                                       if isstandin(f) and f in ctx]))
+    return toupload
diff --git a/hgext/largefiles/overrides.py b/hgext/largefiles/overrides.py
--- a/hgext/largefiles/overrides.py
+++ b/hgext/largefiles/overrides.py
@@ -12,7 +12,7 @@ 
 import copy
 
 from mercurial import hg, commands, util, cmdutil, scmutil, match as match_, \
-        node, archival, error, merge, discovery, pathutil, revset
+        archival, error, merge, discovery, pathutil, revset
 from mercurial.i18n import _
 from mercurial.node import hex
 from hgext import rebase
@@ -999,27 +999,7 @@ 
     if opts.get('newest_first'):
         o.reverse()
 
-    toupload = set()
-    for n in o:
-        parents = [p for p in repo.changelog.parents(n) if p != node.nullid]
-        ctx = repo[n]
-        files = set(ctx.files())
-        if len(parents) == 2:
-            mc = ctx.manifest()
-            mp1 = ctx.parents()[0].manifest()
-            mp2 = ctx.parents()[1].manifest()
-            for f in mp1:
-                if f not in mc:
-                        files.add(f)
-            for f in mp2:
-                if f not in mc:
-                    files.add(f)
-            for f in mc:
-                if mc[f] != mp1.get(f, None) or mc[f] != mp2.get(f, None):
-                    files.add(f)
-        toupload = toupload.union(
-            set([f for f in files if lfutil.isstandin(f) and f in ctx]))
-    return sorted(toupload)
+    return sorted(lfutil.getlfilestoupload(repo, o))
 
 def overrideoutgoing(orig, ui, repo, dest=None, **opts):
     result = orig(ui, repo, dest, **opts)
diff --git a/hgext/largefiles/reposetup.py b/hgext/largefiles/reposetup.py
--- a/hgext/largefiles/reposetup.py
+++ b/hgext/largefiles/reposetup.py
@@ -11,7 +11,6 @@ 
 import os
 
 from mercurial import error, manifest, match as match_, util, discovery
-from mercurial import node as node_
 from mercurial.i18n import _
 from mercurial import localrepo
 
@@ -418,32 +417,9 @@ 
             outgoing = discovery.findcommonoutgoing(repo, remote.peer(),
                                                     force=force)
             if outgoing.missing:
-                toupload = set()
                 o = self.changelog.nodesbetween(outgoing.missing, revs)[0]
-                for n in o:
-                    parents = [p for p in self.changelog.parents(n)
-                               if p != node_.nullid]
-                    ctx = self[n]
-                    files = set(ctx.files())
-                    if len(parents) == 2:
-                        mc = ctx.manifest()
-                        mp1 = ctx.parents()[0].manifest()
-                        mp2 = ctx.parents()[1].manifest()
-                        for f in mp1:
-                            if f not in mc:
-                                files.add(f)
-                        for f in mp2:
-                            if f not in mc:
-                                files.add(f)
-                        for f in mc:
-                            if mc[f] != mp1.get(f, None) or mc[f] != mp2.get(f,
-                                    None):
-                                files.add(f)
-
-                    toupload = toupload.union(
-                        set([ctx[f].data().strip()
-                             for f in files
-                             if lfutil.isstandin(f) and f in ctx]))
+                mapfunc = lambda f, ctx: ctx[f].data().strip()
+                toupload = lfutil.getlfilestoupload(self, o, mapfunc)
                 lfcommands.uploadlfiles(ui, self, remote, toupload)
             return super(lfilesrepo, self).push(remote, force=force, revs=revs,
                 newbranch=newbranch)