Patchwork [15,of,15] speedy: add support for arbitrary file patterns in path query

login
register
mail settings
Submitter Tomasz Kleczek
Date Dec. 11, 2012, 6:38 p.m.
Message ID <88deb68f287f6ee05ed8.1355251110@dev408.prn1.facebook.com>
Download mbox | patch
Permalink /patch/62/
State Superseded
Headers show

Comments

Tomasz Kleczek - Dec. 11, 2012, 6:38 p.m.
# HG changeset patch
# User Tomasz Kleczek <tkleczek at fb.com>
# Date 1355207317 28800
# Branch stable
# Node ID 88deb68f287f6ee05ed8e066fc52d01a5a218fe4
# Parent  4b29d21f6585f07c8a55c64cb93ce6178be4e3eb
speedy: add support for arbitrary file patterns in path query

This change speeds up commands such as:

log "glob:*"
log "relglob:*.py"
hg log d2 --include "**.py"

<placeholder: performance info>

Only the fileset patterns are still not supported server-side.
Augie Fackler - Dec. 13, 2012, 3:39 a.m.
Overall, I think I like this. I'm excited that the super-simple implementation offers such great performance benefits. One thing I wonder about is if the local implementation (I'm assuming y'all are planning a large-scale logindex-as-a-service thing at fb) should require that the client have all the indexed changes, and make it possible to do reindexing after pulls.

Another thing is that maybe there could be support for multiple repositories somehow in the protocol, but I'm not sure that really makes sense on the other hand.

On Dec 11, 2012, at 12:38 PM, Tomasz Kleczek <tkleczek at fb.com> wrote:

> # HG changeset patch
> # User Tomasz Kleczek <tkleczek at fb.com>
> # Date 1355207317 28800
> # Branch stable
> # Node ID 88deb68f287f6ee05ed8e066fc52d01a5a218fe4
> # Parent  4b29d21f6585f07c8a55c64cb93ce6178be4e3eb
> speedy: add support for arbitrary file patterns in path query
> 
> This change speeds up commands such as:
> 
> log "glob:*"
> log "relglob:*.py"
> hg log d2 --include "**.py"
> 
> <placeholder: performance info>
> 
> Only the fileset patterns are still not supported server-side.
> 
> diff --git a/hgext/speedy/client.py b/hgext/speedy/client.py
> --- a/hgext/speedy/client.py
> +++ b/hgext/speedy/client.py
> @@ -91,18 +91,26 @@
>         `rev` is in fncache then fncache[rev] is a list with filenames
>         this rev modifies that are matching according to `match`.
> 
> -        If match containts only 'path' or 'relpath' patterns the query is sent
> +        If match doesn't contain any fileset expressions, the query is sent
>         to the server, otherwise it is performed locally as server doesn't
> -        support arbitrary patterns just yet.
> +        support fileset expressions.
>         """
> -        anynonpaths = bool(filter(lambda (k, v): k != 'path', match._pats))
> -        if match._includepats or match._excludepats or anynonpaths:
> -            # For now, server supports only literal paths and not arbitrary
> -            # patterns, fall back to the local query
> +        allpats = match._pats + match._includepats + match._excludepats
> +        anyfilesetexp = bool(filter(lambda (k, v): k == 'set', allpats))
> +        if anyfilesetexp:
> +            # Server doesn't support filesets, fall back to the local
> +            # fileset query
>             return cmdutil.filterrevs(self._repo, list(self._repo), match)
>         else:
> -            paths = [v for k, v in match._pats]
> -            wanted = self._proxy.request('path', (paths,))
> +            # Would like to send the match object over the network, but
> +            # it is not possible as it contains functions and working
> +            # dir context object. Send all parameters used to init the
> +            # object in a dict instead and create the object server-side.
> +            # match._ctx field is used only if one of the patterns to match is
> +            # a 'set:' pattern. Therefore we do not loose any information here.
> +            matchdict = dict(patterns=match._pats, include=match._includepats,
> +                    exclude=match._excludepats)
> +            wanted = self._proxy.request('path', (matchdict,))
>             return set(nodestorevs(self._repo, wanted)), {}
> 
> def _patchedauthor(metapeer, repo, subset, pats):
> diff --git a/hgext/speedy/index.py b/hgext/speedy/index.py
> --- a/hgext/speedy/index.py
> +++ b/hgext/speedy/index.py
> @@ -58,3 +58,16 @@
>         for path in paths:
>             filechgs.setdefault(path, []).append(ctx.node())
>     return filechgs
> +
> +def makefiles(ctxs):
> +    """Return the `files` index in the form of a dict.
> +
> +    `files` is keyed by file name, with each value being an empty string
> +
> +    ctxs: an iterable with change contexts for changes from which the index
> +        is to be created.
> +    """
> +    files = set()
> +    for ctx in ctxs:
> +        files.update(ctx.files())
> +    return dict([(fn, '') for fn in files])
> diff --git a/hgext/speedy/server.py b/hgext/speedy/server.py
> --- a/hgext/speedy/server.py
> +++ b/hgext/speedy/server.py
> @@ -14,6 +14,7 @@
> from mercurial import cmdutil
> from mercurial.i18n import _
> from mercurial import util
> +from mercurial import match as matchmod
> import index
> import protocol
> import tcptransport
> @@ -59,10 +60,44 @@
>         return [node for node, date in self.chgdate.iteritems()
>             if matcher(date)]
> 
> -    def path(self, paths):
> -        """Return a list of changesets that modify any of the paths.
> +    def path(self, matchargs):
> +        """Return a list of changesets that modify the specified paths.
> 
> -        Only the changes present in `subset` are returned.
> +        matchargs: a dict that may contain the following parameters:
> +            'patterns' - a pattern list
> +            'include' - a pattern list describing additional paths
> +                to include
> +            'exclude' - a pattern list describing additional paths
> +                to exclude
> +
> +        A pattern list is a an iterable with pairs of (kind, pattern),
> +                kind may be any pattern kind recognized by match.match
> +                constructor except for 'relpath' and 'set'.
> +
> +        If all patterns are literal paths, we can compute answer very
> +        fast using `filechgs` index (see _literalpath method). Otherwise,
> +        we have to fall back to an exhaustive search (see _patternpath
> +        method).
> +        """
> +        pats = matchargs.get('patterns', [])
> +        include = matchargs.get('include', [])
> +        exclude = matchargs.get('exclude', [])
> +        kinds = [k for k, v in pats]
> +        if not include and not exclude and kinds == ['path'] * len(kinds):
> +            paths = [ v for k, v in pats ]
> +            wanted = self._literalpath(paths)
> +        else:
> +            def patsconvert(pats):
> +                return[':'.join(p) for p in pats]
> +            patterns = patsconvert(pats)
> +            match = matchmod.match(self.repo.root, self.repo.root,
> +                    patterns, include=patsconvert(include),
> +                    exclude=patsconvert(exclude))
> +            wanted = self._patternpath(match)
> +        return wanted
> +
> +    def _literalpath(self, paths):
> +        """Return a list of changesets touching any of the paths.
> 
>         Uses `filechgs` index which provides the mapping from paths
>         (files and directories) to a list of changes touching this path.
> @@ -74,10 +109,23 @@
>             nodes.update(newnodes)
>         return list(nodes)
> 
> +    def _patternpath(self, match):
> +        """Return a list of changesets matching given files.
> +
> +        match: a callable that defines which files are relevant.
> +               File is relevant if match(filename) == True.
> +
> +        Slow compared to _literalpath since it iterates through all filenames
> +        in the repository history.
> +        """
> +        matchingfiles = filter(match, self.files.keys())
> +        return self._literalpath(matchingfiles)
> +
> indicecfg = {
>     'userchgs': index.makeuserchgs,
>     'chgdate': index.makechgdate,
>     'filechgs': index.makefilechgs,
> +    'files': index.makefiles,
> }
> 
> def makeserver(repo):
> diff --git a/tests/test-speedy.t b/tests/test-speedy.t
> --- a/tests/test-speedy.t
> +++ b/tests/test-speedy.t
> @@ -273,10 +273,10 @@
>   $ hg log --rev "date(10/20/2012) & user(testuser2)"
>   chg1
> 
> -  $ cat >> $TESTTMP/localrepo/.hg/hgrc <<EOF_END
> -  > [speedy]
> -  > client = False
> -  > EOF_END
> +  $ hg log "glob:d2/*" --exclude "**.py"
> +  chgx
> +  chgpushed
> +  chg4
> 
>   $ cd $TESTTMP/serverrepo
> 
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
Tomasz Kleczek - Dec. 13, 2012, 7:46 a.m.
In the next couple of patches, that I will send tomorrow (hopefully) I
added persistent storage for indices with leveldb.
For our repository all the indices combined are only 1% of a size of the
whole repo. So I would not be concerned about
the disk usage overhead. The extension is going to update the indices to
the tip of the repo each time log command is being run.
Later we might consider doing it in some pull/push hooks.. (?)

What do you mean by 'support for multiple repositories'?


On Wed, Dec 12, 2012 at 7:39 PM, Augie Fackler <raf at durin42.com> wrote:

> Overall, I think I like this. I'm excited that the super-simple
> implementation offers such great performance benefits. One thing I wonder
> about is if the local implementation (I'm assuming y'all are planning a
> large-scale logindex-as-a-service thing at fb) should require that the
> client have all the indexed changes, and make it possible to do reindexing
> after pulls.
>
> Another thing is that maybe there could be support for multiple
> repositories somehow in the protocol, but I'm not sure that really makes
> sense on the other hand.
>
> On Dec 11, 2012, at 12:38 PM, Tomasz Kleczek <tkleczek at fb.com> wrote:
>
> > # HG changeset patch
> > # User Tomasz Kleczek <tkleczek at fb.com>
> > # Date 1355207317 28800
> > # Branch stable
> > # Node ID 88deb68f287f6ee05ed8e066fc52d01a5a218fe4
> > # Parent  4b29d21f6585f07c8a55c64cb93ce6178be4e3eb
> > speedy: add support for arbitrary file patterns in path query
> >
> > This change speeds up commands such as:
> >
> > log "glob:*"
> > log "relglob:*.py"
> > hg log d2 --include "**.py"
> >
> > <placeholder: performance info>
> >
> > Only the fileset patterns are still not supported server-side.
> >
> > diff --git a/hgext/speedy/client.py b/hgext/speedy/client.py
> > --- a/hgext/speedy/client.py
> > +++ b/hgext/speedy/client.py
> > @@ -91,18 +91,26 @@
> >         `rev` is in fncache then fncache[rev] is a list with filenames
> >         this rev modifies that are matching according to `match`.
> >
> > -        If match containts only 'path' or 'relpath' patterns the query
> is sent
> > +        If match doesn't contain any fileset expressions, the query is
> sent
> >         to the server, otherwise it is performed locally as server
> doesn't
> > -        support arbitrary patterns just yet.
> > +        support fileset expressions.
> >         """
> > -        anynonpaths = bool(filter(lambda (k, v): k != 'path',
> match._pats))
> > -        if match._includepats or match._excludepats or anynonpaths:
> > -            # For now, server supports only literal paths and not
> arbitrary
> > -            # patterns, fall back to the local query
> > +        allpats = match._pats + match._includepats + match._excludepats
> > +        anyfilesetexp = bool(filter(lambda (k, v): k == 'set', allpats))
> > +        if anyfilesetexp:
> > +            # Server doesn't support filesets, fall back to the local
> > +            # fileset query
> >             return cmdutil.filterrevs(self._repo, list(self._repo),
> match)
> >         else:
> > -            paths = [v for k, v in match._pats]
> > -            wanted = self._proxy.request('path', (paths,))
> > +            # Would like to send the match object over the network, but
> > +            # it is not possible as it contains functions and working
> > +            # dir context object. Send all parameters used to init the
> > +            # object in a dict instead and create the object
> server-side.
> > +            # match._ctx field is used only if one of the patterns to
> match is
> > +            # a 'set:' pattern. Therefore we do not loose any
> information here.
> > +            matchdict = dict(patterns=match._pats,
> include=match._includepats,
> > +                    exclude=match._excludepats)
> > +            wanted = self._proxy.request('path', (matchdict,))
> >             return set(nodestorevs(self._repo, wanted)), {}
> >
> > def _patchedauthor(metapeer, repo, subset, pats):
> > diff --git a/hgext/speedy/index.py b/hgext/speedy/index.py
> > --- a/hgext/speedy/index.py
> > +++ b/hgext/speedy/index.py
> > @@ -58,3 +58,16 @@
> >         for path in paths:
> >             filechgs.setdefault(path, []).append(ctx.node())
> >     return filechgs
> > +
> > +def makefiles(ctxs):
> > +    """Return the `files` index in the form of a dict.
> > +
> > +    `files` is keyed by file name, with each value being an empty string
> > +
> > +    ctxs: an iterable with change contexts for changes from which the
> index
> > +        is to be created.
> > +    """
> > +    files = set()
> > +    for ctx in ctxs:
> > +        files.update(ctx.files())
> > +    return dict([(fn, '') for fn in files])
> > diff --git a/hgext/speedy/server.py b/hgext/speedy/server.py
> > --- a/hgext/speedy/server.py
> > +++ b/hgext/speedy/server.py
> > @@ -14,6 +14,7 @@
> > from mercurial import cmdutil
> > from mercurial.i18n import _
> > from mercurial import util
> > +from mercurial import match as matchmod
> > import index
> > import protocol
> > import tcptransport
> > @@ -59,10 +60,44 @@
> >         return [node for node, date in self.chgdate.iteritems()
> >             if matcher(date)]
> >
> > -    def path(self, paths):
> > -        """Return a list of changesets that modify any of the paths.
> > +    def path(self, matchargs):
> > +        """Return a list of changesets that modify the specified paths.
> >
> > -        Only the changes present in `subset` are returned.
> > +        matchargs: a dict that may contain the following parameters:
> > +            'patterns' - a pattern list
> > +            'include' - a pattern list describing additional paths
> > +                to include
> > +            'exclude' - a pattern list describing additional paths
> > +                to exclude
> > +
> > +        A pattern list is a an iterable with pairs of (kind, pattern),
> > +                kind may be any pattern kind recognized by match.match
> > +                constructor except for 'relpath' and 'set'.
> > +
> > +        If all patterns are literal paths, we can compute answer very
> > +        fast using `filechgs` index (see _literalpath method).
> Otherwise,
> > +        we have to fall back to an exhaustive search (see _patternpath
> > +        method).
> > +        """
> > +        pats = matchargs.get('patterns', [])
> > +        include = matchargs.get('include', [])
> > +        exclude = matchargs.get('exclude', [])
> > +        kinds = [k for k, v in pats]
> > +        if not include and not exclude and kinds == ['path'] *
> len(kinds):
> > +            paths = [ v for k, v in pats ]
> > +            wanted = self._literalpath(paths)
> > +        else:
> > +            def patsconvert(pats):
> > +                return[':'.join(p) for p in pats]
> > +            patterns = patsconvert(pats)
> > +            match = matchmod.match(self.repo.root, self.repo.root,
> > +                    patterns, include=patsconvert(include),
> > +                    exclude=patsconvert(exclude))
> > +            wanted = self._patternpath(match)
> > +        return wanted
> > +
> > +    def _literalpath(self, paths):
> > +        """Return a list of changesets touching any of the paths.
> >
> >         Uses `filechgs` index which provides the mapping from paths
> >         (files and directories) to a list of changes touching this path.
> > @@ -74,10 +109,23 @@
> >             nodes.update(newnodes)
> >         return list(nodes)
> >
> > +    def _patternpath(self, match):
> > +        """Return a list of changesets matching given files.
> > +
> > +        match: a callable that defines which files are relevant.
> > +               File is relevant if match(filename) == True.
> > +
> > +        Slow compared to _literalpath since it iterates through all
> filenames
> > +        in the repository history.
> > +        """
> > +        matchingfiles = filter(match, self.files.keys())
> > +        return self._literalpath(matchingfiles)
> > +
> > indicecfg = {
> >     'userchgs': index.makeuserchgs,
> >     'chgdate': index.makechgdate,
> >     'filechgs': index.makefilechgs,
> > +    'files': index.makefiles,
> > }
> >
> > def makeserver(repo):
> > diff --git a/tests/test-speedy.t b/tests/test-speedy.t
> > --- a/tests/test-speedy.t
> > +++ b/tests/test-speedy.t
> > @@ -273,10 +273,10 @@
> >   $ hg log --rev "date(10/20/2012) & user(testuser2)"
> >   chg1
> >
> > -  $ cat >> $TESTTMP/localrepo/.hg/hgrc <<EOF_END
> > -  > [speedy]
> > -  > client = False
> > -  > EOF_END
> > +  $ hg log "glob:d2/*" --exclude "**.py"
> > +  chgx
> > +  chgpushed
> > +  chg4
> >
> >   $ cd $TESTTMP/serverrepo
> >
> > _______________________________________________
> > Mercurial-devel mailing list
> > Mercurial-devel at selenic.com
> > http://selenic.com/mailman/listinfo/mercurial-devel
>
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://selenic.com/pipermail/mercurial-devel/attachments/20121212/59be59c2/attachment.html>
Augie Fackler - Dec. 13, 2012, 2:46 p.m.
On Dec 13, 2012, at 1:46 AM, Tomasz K?eczek <tomasz.kleczek at gmail.com> wrote:

> In the next couple of patches, that I will send tomorrow (hopefully) I
> added persistent storage for indices with leveldb.
> For our repository all the indices combined are only 1% of a size of the
> whole repo. So I would not be concerned about
> the disk usage overhead. The extension is going to update the indices to
> the tip of the repo each time log command is being run.
> Later we might consider doing it in some pull/push hooks.. (?)
> 
> What do you mean by 'support for multiple repositories'?

Ah, it sounds like you expect users to run this on their local machine, rather than as a shared service in a production datacenter?

> 
> 
> On Wed, Dec 12, 2012 at 7:39 PM, Augie Fackler <raf at durin42.com> wrote:
> 
>> Overall, I think I like this. I'm excited that the super-simple
>> implementation offers such great performance benefits. One thing I wonder
>> about is if the local implementation (I'm assuming y'all are planning a
>> large-scale logindex-as-a-service thing at fb) should require that the
>> client have all the indexed changes, and make it possible to do reindexing
>> after pulls.
>> 
>> Another thing is that maybe there could be support for multiple
>> repositories somehow in the protocol, but I'm not sure that really makes
>> sense on the other hand.
>> 
>> On Dec 11, 2012, at 12:38 PM, Tomasz Kleczek <tkleczek at fb.com> wrote:
>> 
>>> # HG changeset patch
>>> # User Tomasz Kleczek <tkleczek at fb.com>
>>> # Date 1355207317 28800
>>> # Branch stable
>>> # Node ID 88deb68f287f6ee05ed8e066fc52d01a5a218fe4
>>> # Parent  4b29d21f6585f07c8a55c64cb93ce6178be4e3eb
>>> speedy: add support for arbitrary file patterns in path query
>>> 
>>> This change speeds up commands such as:
>>> 
>>> log "glob:*"
>>> log "relglob:*.py"
>>> hg log d2 --include "**.py"
>>> 
>>> <placeholder: performance info>
>>> 
>>> Only the fileset patterns are still not supported server-side.
>>> 
>>> diff --git a/hgext/speedy/client.py b/hgext/speedy/client.py
>>> --- a/hgext/speedy/client.py
>>> +++ b/hgext/speedy/client.py
>>> @@ -91,18 +91,26 @@
>>>        `rev` is in fncache then fncache[rev] is a list with filenames
>>>        this rev modifies that are matching according to `match`.
>>> 
>>> -        If match containts only 'path' or 'relpath' patterns the query
>> is sent
>>> +        If match doesn't contain any fileset expressions, the query is
>> sent
>>>        to the server, otherwise it is performed locally as server
>> doesn't
>>> -        support arbitrary patterns just yet.
>>> +        support fileset expressions.
>>>        """
>>> -        anynonpaths = bool(filter(lambda (k, v): k != 'path',
>> match._pats))
>>> -        if match._includepats or match._excludepats or anynonpaths:
>>> -            # For now, server supports only literal paths and not
>> arbitrary
>>> -            # patterns, fall back to the local query
>>> +        allpats = match._pats + match._includepats + match._excludepats
>>> +        anyfilesetexp = bool(filter(lambda (k, v): k == 'set', allpats))
>>> +        if anyfilesetexp:
>>> +            # Server doesn't support filesets, fall back to the local
>>> +            # fileset query
>>>            return cmdutil.filterrevs(self._repo, list(self._repo),
>> match)
>>>        else:
>>> -            paths = [v for k, v in match._pats]
>>> -            wanted = self._proxy.request('path', (paths,))
>>> +            # Would like to send the match object over the network, but
>>> +            # it is not possible as it contains functions and working
>>> +            # dir context object. Send all parameters used to init the
>>> +            # object in a dict instead and create the object
>> server-side.
>>> +            # match._ctx field is used only if one of the patterns to
>> match is
>>> +            # a 'set:' pattern. Therefore we do not loose any
>> information here.
>>> +            matchdict = dict(patterns=match._pats,
>> include=match._includepats,
>>> +                    exclude=match._excludepats)
>>> +            wanted = self._proxy.request('path', (matchdict,))
>>>            return set(nodestorevs(self._repo, wanted)), {}
>>> 
>>> def _patchedauthor(metapeer, repo, subset, pats):
>>> diff --git a/hgext/speedy/index.py b/hgext/speedy/index.py
>>> --- a/hgext/speedy/index.py
>>> +++ b/hgext/speedy/index.py
>>> @@ -58,3 +58,16 @@
>>>        for path in paths:
>>>            filechgs.setdefault(path, []).append(ctx.node())
>>>    return filechgs
>>> +
>>> +def makefiles(ctxs):
>>> +    """Return the `files` index in the form of a dict.
>>> +
>>> +    `files` is keyed by file name, with each value being an empty string
>>> +
>>> +    ctxs: an iterable with change contexts for changes from which the
>> index
>>> +        is to be created.
>>> +    """
>>> +    files = set()
>>> +    for ctx in ctxs:
>>> +        files.update(ctx.files())
>>> +    return dict([(fn, '') for fn in files])
>>> diff --git a/hgext/speedy/server.py b/hgext/speedy/server.py
>>> --- a/hgext/speedy/server.py
>>> +++ b/hgext/speedy/server.py
>>> @@ -14,6 +14,7 @@
>>> from mercurial import cmdutil
>>> from mercurial.i18n import _
>>> from mercurial import util
>>> +from mercurial import match as matchmod
>>> import index
>>> import protocol
>>> import tcptransport
>>> @@ -59,10 +60,44 @@
>>>        return [node for node, date in self.chgdate.iteritems()
>>>            if matcher(date)]
>>> 
>>> -    def path(self, paths):
>>> -        """Return a list of changesets that modify any of the paths.
>>> +    def path(self, matchargs):
>>> +        """Return a list of changesets that modify the specified paths.
>>> 
>>> -        Only the changes present in `subset` are returned.
>>> +        matchargs: a dict that may contain the following parameters:
>>> +            'patterns' - a pattern list
>>> +            'include' - a pattern list describing additional paths
>>> +                to include
>>> +            'exclude' - a pattern list describing additional paths
>>> +                to exclude
>>> +
>>> +        A pattern list is a an iterable with pairs of (kind, pattern),
>>> +                kind may be any pattern kind recognized by match.match
>>> +                constructor except for 'relpath' and 'set'.
>>> +
>>> +        If all patterns are literal paths, we can compute answer very
>>> +        fast using `filechgs` index (see _literalpath method).
>> Otherwise,
>>> +        we have to fall back to an exhaustive search (see _patternpath
>>> +        method).
>>> +        """
>>> +        pats = matchargs.get('patterns', [])
>>> +        include = matchargs.get('include', [])
>>> +        exclude = matchargs.get('exclude', [])
>>> +        kinds = [k for k, v in pats]
>>> +        if not include and not exclude and kinds == ['path'] *
>> len(kinds):
>>> +            paths = [ v for k, v in pats ]
>>> +            wanted = self._literalpath(paths)
>>> +        else:
>>> +            def patsconvert(pats):
>>> +                return[':'.join(p) for p in pats]
>>> +            patterns = patsconvert(pats)
>>> +            match = matchmod.match(self.repo.root, self.repo.root,
>>> +                    patterns, include=patsconvert(include),
>>> +                    exclude=patsconvert(exclude))
>>> +            wanted = self._patternpath(match)
>>> +        return wanted
>>> +
>>> +    def _literalpath(self, paths):
>>> +        """Return a list of changesets touching any of the paths.
>>> 
>>>        Uses `filechgs` index which provides the mapping from paths
>>>        (files and directories) to a list of changes touching this path.
>>> @@ -74,10 +109,23 @@
>>>            nodes.update(newnodes)
>>>        return list(nodes)
>>> 
>>> +    def _patternpath(self, match):
>>> +        """Return a list of changesets matching given files.
>>> +
>>> +        match: a callable that defines which files are relevant.
>>> +               File is relevant if match(filename) == True.
>>> +
>>> +        Slow compared to _literalpath since it iterates through all
>> filenames
>>> +        in the repository history.
>>> +        """
>>> +        matchingfiles = filter(match, self.files.keys())
>>> +        return self._literalpath(matchingfiles)
>>> +
>>> indicecfg = {
>>>    'userchgs': index.makeuserchgs,
>>>    'chgdate': index.makechgdate,
>>>    'filechgs': index.makefilechgs,
>>> +    'files': index.makefiles,
>>> }
>>> 
>>> def makeserver(repo):
>>> diff --git a/tests/test-speedy.t b/tests/test-speedy.t
>>> --- a/tests/test-speedy.t
>>> +++ b/tests/test-speedy.t
>>> @@ -273,10 +273,10 @@
>>>  $ hg log --rev "date(10/20/2012) & user(testuser2)"
>>>  chg1
>>> 
>>> -  $ cat >> $TESTTMP/localrepo/.hg/hgrc <<EOF_END
>>> -  > [speedy]
>>> -  > client = False
>>> -  > EOF_END
>>> +  $ hg log "glob:d2/*" --exclude "**.py"
>>> +  chgx
>>> +  chgpushed
>>> +  chg4
>>> 
>>>  $ cd $TESTTMP/serverrepo
>>> 
>>> _______________________________________________
>>> Mercurial-devel mailing list
>>> Mercurial-devel at selenic.com
>>> http://selenic.com/mailman/listinfo/mercurial-devel
>> 
>> _______________________________________________
>> Mercurial-devel mailing list
>> Mercurial-devel at selenic.com
>> http://selenic.com/mailman/listinfo/mercurial-devel
>>
Tomasz Kleczek - Dec. 13, 2012, 4:24 p.m.
It is certainly an option.

For now, running it locally takes slight more time for (2.5 sec vs 1.3 for
some random log command)  because of database initialization. There is also
an overhead after pull/push. Also, if the update is interrupted, the
database might end up in an inconsistent state, so it has to be recreated
on the next log run. But this may be expensive for big repos (it takes ~ 3
mins on ours). And on the server you can also consider keeping all the
indices in memory to reduce latency.

But running the extension in local mode still offers great performance
improvement, is easier to configure and required almost non overhead in
code (we wanted the server to persistently store indices anyway) so I saw
no reason not to expose it as a feature.


On Thu, Dec 13, 2012 at 6:46 AM, Augie Fackler <raf at durin42.com> wrote:

>
> On Dec 13, 2012, at 1:46 AM, Tomasz K?eczek <tomasz.kleczek at gmail.com>
> wrote:
>
> > In the next couple of patches, that I will send tomorrow (hopefully) I
> > added persistent storage for indices with leveldb.
> > For our repository all the indices combined are only 1% of a size of the
> > whole repo. So I would not be concerned about
> > the disk usage overhead. The extension is going to update the indices to
> > the tip of the repo each time log command is being run.
> > Later we might consider doing it in some pull/push hooks.. (?)
> >
> > What do you mean by 'support for multiple repositories'?
>
> Ah, it sounds like you expect users to run this on their local machine,
> rather than as a shared service in a production datacenter?
>
> >
> >
> > On Wed, Dec 12, 2012 at 7:39 PM, Augie Fackler <raf at durin42.com> wrote:
> >
> >> Overall, I think I like this. I'm excited that the super-simple
> >> implementation offers such great performance benefits. One thing I
> wonder
> >> about is if the local implementation (I'm assuming y'all are planning a
> >> large-scale logindex-as-a-service thing at fb) should require that the
> >> client have all the indexed changes, and make it possible to do
> reindexing
> >> after pulls.
> >>
> >> Another thing is that maybe there could be support for multiple
> >> repositories somehow in the protocol, but I'm not sure that really makes
> >> sense on the other hand.
> >>
> >> On Dec 11, 2012, at 12:38 PM, Tomasz Kleczek <tkleczek at fb.com> wrote:
> >>
> >>> # HG changeset patch
> >>> # User Tomasz Kleczek <tkleczek at fb.com>
> >>> # Date 1355207317 28800
> >>> # Branch stable
> >>> # Node ID 88deb68f287f6ee05ed8e066fc52d01a5a218fe4
> >>> # Parent  4b29d21f6585f07c8a55c64cb93ce6178be4e3eb
> >>> speedy: add support for arbitrary file patterns in path query
> >>>
> >>> This change speeds up commands such as:
> >>>
> >>> log "glob:*"
> >>> log "relglob:*.py"
> >>> hg log d2 --include "**.py"
> >>>
> >>> <placeholder: performance info>
> >>>
> >>> Only the fileset patterns are still not supported server-side.
> >>>
> >>> diff --git a/hgext/speedy/client.py b/hgext/speedy/client.py
> >>> --- a/hgext/speedy/client.py
> >>> +++ b/hgext/speedy/client.py
> >>> @@ -91,18 +91,26 @@
> >>>        `rev` is in fncache then fncache[rev] is a list with filenames
> >>>        this rev modifies that are matching according to `match`.
> >>>
> >>> -        If match containts only 'path' or 'relpath' patterns the query
> >> is sent
> >>> +        If match doesn't contain any fileset expressions, the query is
> >> sent
> >>>        to the server, otherwise it is performed locally as server
> >> doesn't
> >>> -        support arbitrary patterns just yet.
> >>> +        support fileset expressions.
> >>>        """
> >>> -        anynonpaths = bool(filter(lambda (k, v): k != 'path',
> >> match._pats))
> >>> -        if match._includepats or match._excludepats or anynonpaths:
> >>> -            # For now, server supports only literal paths and not
> >> arbitrary
> >>> -            # patterns, fall back to the local query
> >>> +        allpats = match._pats + match._includepats +
> match._excludepats
> >>> +        anyfilesetexp = bool(filter(lambda (k, v): k == 'set',
> allpats))
> >>> +        if anyfilesetexp:
> >>> +            # Server doesn't support filesets, fall back to the local
> >>> +            # fileset query
> >>>            return cmdutil.filterrevs(self._repo, list(self._repo),
> >> match)
> >>>        else:
> >>> -            paths = [v for k, v in match._pats]
> >>> -            wanted = self._proxy.request('path', (paths,))
> >>> +            # Would like to send the match object over the network,
> but
> >>> +            # it is not possible as it contains functions and working
> >>> +            # dir context object. Send all parameters used to init the
> >>> +            # object in a dict instead and create the object
> >> server-side.
> >>> +            # match._ctx field is used only if one of the patterns to
> >> match is
> >>> +            # a 'set:' pattern. Therefore we do not loose any
> >> information here.
> >>> +            matchdict = dict(patterns=match._pats,
> >> include=match._includepats,
> >>> +                    exclude=match._excludepats)
> >>> +            wanted = self._proxy.request('path', (matchdict,))
> >>>            return set(nodestorevs(self._repo, wanted)), {}
> >>>
> >>> def _patchedauthor(metapeer, repo, subset, pats):
> >>> diff --git a/hgext/speedy/index.py b/hgext/speedy/index.py
> >>> --- a/hgext/speedy/index.py
> >>> +++ b/hgext/speedy/index.py
> >>> @@ -58,3 +58,16 @@
> >>>        for path in paths:
> >>>            filechgs.setdefault(path, []).append(ctx.node())
> >>>    return filechgs
> >>> +
> >>> +def makefiles(ctxs):
> >>> +    """Return the `files` index in the form of a dict.
> >>> +
> >>> +    `files` is keyed by file name, with each value being an empty
> string
> >>> +
> >>> +    ctxs: an iterable with change contexts for changes from which the
> >> index
> >>> +        is to be created.
> >>> +    """
> >>> +    files = set()
> >>> +    for ctx in ctxs:
> >>> +        files.update(ctx.files())
> >>> +    return dict([(fn, '') for fn in files])
> >>> diff --git a/hgext/speedy/server.py b/hgext/speedy/server.py
> >>> --- a/hgext/speedy/server.py
> >>> +++ b/hgext/speedy/server.py
> >>> @@ -14,6 +14,7 @@
> >>> from mercurial import cmdutil
> >>> from mercurial.i18n import _
> >>> from mercurial import util
> >>> +from mercurial import match as matchmod
> >>> import index
> >>> import protocol
> >>> import tcptransport
> >>> @@ -59,10 +60,44 @@
> >>>        return [node for node, date in self.chgdate.iteritems()
> >>>            if matcher(date)]
> >>>
> >>> -    def path(self, paths):
> >>> -        """Return a list of changesets that modify any of the paths.
> >>> +    def path(self, matchargs):
> >>> +        """Return a list of changesets that modify the specified
> paths.
> >>>
> >>> -        Only the changes present in `subset` are returned.
> >>> +        matchargs: a dict that may contain the following parameters:
> >>> +            'patterns' - a pattern list
> >>> +            'include' - a pattern list describing additional paths
> >>> +                to include
> >>> +            'exclude' - a pattern list describing additional paths
> >>> +                to exclude
> >>> +
> >>> +        A pattern list is a an iterable with pairs of (kind, pattern),
> >>> +                kind may be any pattern kind recognized by match.match
> >>> +                constructor except for 'relpath' and 'set'.
> >>> +
> >>> +        If all patterns are literal paths, we can compute answer very
> >>> +        fast using `filechgs` index (see _literalpath method).
> >> Otherwise,
> >>> +        we have to fall back to an exhaustive search (see _patternpath
> >>> +        method).
> >>> +        """
> >>> +        pats = matchargs.get('patterns', [])
> >>> +        include = matchargs.get('include', [])
> >>> +        exclude = matchargs.get('exclude', [])
> >>> +        kinds = [k for k, v in pats]
> >>> +        if not include and not exclude and kinds == ['path'] *
> >> len(kinds):
> >>> +            paths = [ v for k, v in pats ]
> >>> +            wanted = self._literalpath(paths)
> >>> +        else:
> >>> +            def patsconvert(pats):
> >>> +                return[':'.join(p) for p in pats]
> >>> +            patterns = patsconvert(pats)
> >>> +            match = matchmod.match(self.repo.root, self.repo.root,
> >>> +                    patterns, include=patsconvert(include),
> >>> +                    exclude=patsconvert(exclude))
> >>> +            wanted = self._patternpath(match)
> >>> +        return wanted
> >>> +
> >>> +    def _literalpath(self, paths):
> >>> +        """Return a list of changesets touching any of the paths.
> >>>
> >>>        Uses `filechgs` index which provides the mapping from paths
> >>>        (files and directories) to a list of changes touching this path.
> >>> @@ -74,10 +109,23 @@
> >>>            nodes.update(newnodes)
> >>>        return list(nodes)
> >>>
> >>> +    def _patternpath(self, match):
> >>> +        """Return a list of changesets matching given files.
> >>> +
> >>> +        match: a callable that defines which files are relevant.
> >>> +               File is relevant if match(filename) == True.
> >>> +
> >>> +        Slow compared to _literalpath since it iterates through all
> >> filenames
> >>> +        in the repository history.
> >>> +        """
> >>> +        matchingfiles = filter(match, self.files.keys())
> >>> +        return self._literalpath(matchingfiles)
> >>> +
> >>> indicecfg = {
> >>>    'userchgs': index.makeuserchgs,
> >>>    'chgdate': index.makechgdate,
> >>>    'filechgs': index.makefilechgs,
> >>> +    'files': index.makefiles,
> >>> }
> >>>
> >>> def makeserver(repo):
> >>> diff --git a/tests/test-speedy.t b/tests/test-speedy.t
> >>> --- a/tests/test-speedy.t
> >>> +++ b/tests/test-speedy.t
> >>> @@ -273,10 +273,10 @@
> >>>  $ hg log --rev "date(10/20/2012) & user(testuser2)"
> >>>  chg1
> >>>
> >>> -  $ cat >> $TESTTMP/localrepo/.hg/hgrc <<EOF_END
> >>> -  > [speedy]
> >>> -  > client = False
> >>> -  > EOF_END
> >>> +  $ hg log "glob:d2/*" --exclude "**.py"
> >>> +  chgx
> >>> +  chgpushed
> >>> +  chg4
> >>>
> >>>  $ cd $TESTTMP/serverrepo
> >>>
> >>> _______________________________________________
> >>> Mercurial-devel mailing list
> >>> Mercurial-devel at selenic.com
> >>> http://selenic.com/mailman/listinfo/mercurial-devel
> >>
> >> _______________________________________________
> >> Mercurial-devel mailing list
> >> Mercurial-devel at selenic.com
> >> http://selenic.com/mailman/listinfo/mercurial-devel
> >>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://selenic.com/pipermail/mercurial-devel/attachments/20121213/0cc2ae56/attachment.html>

Patch

diff --git a/hgext/speedy/client.py b/hgext/speedy/client.py
--- a/hgext/speedy/client.py
+++ b/hgext/speedy/client.py
@@ -91,18 +91,26 @@ 
         `rev` is in fncache then fncache[rev] is a list with filenames
         this rev modifies that are matching according to `match`.
 
-        If match containts only 'path' or 'relpath' patterns the query is sent
+        If match doesn't contain any fileset expressions, the query is sent
         to the server, otherwise it is performed locally as server doesn't
-        support arbitrary patterns just yet.
+        support fileset expressions.
         """
-        anynonpaths = bool(filter(lambda (k, v): k != 'path', match._pats))
-        if match._includepats or match._excludepats or anynonpaths:
-            # For now, server supports only literal paths and not arbitrary
-            # patterns, fall back to the local query
+        allpats = match._pats + match._includepats + match._excludepats
+        anyfilesetexp = bool(filter(lambda (k, v): k == 'set', allpats))
+        if anyfilesetexp:
+            # Server doesn't support filesets, fall back to the local
+            # fileset query
             return cmdutil.filterrevs(self._repo, list(self._repo), match)
         else:
-            paths = [v for k, v in match._pats]
-            wanted = self._proxy.request('path', (paths,))
+            # Would like to send the match object over the network, but
+            # it is not possible as it contains functions and working
+            # dir context object. Send all parameters used to init the
+            # object in a dict instead and create the object server-side.
+            # match._ctx field is used only if one of the patterns to match is
+            # a 'set:' pattern. Therefore we do not loose any information here.
+            matchdict = dict(patterns=match._pats, include=match._includepats,
+                    exclude=match._excludepats)
+            wanted = self._proxy.request('path', (matchdict,))
             return set(nodestorevs(self._repo, wanted)), {}
 
 def _patchedauthor(metapeer, repo, subset, pats):
diff --git a/hgext/speedy/index.py b/hgext/speedy/index.py
--- a/hgext/speedy/index.py
+++ b/hgext/speedy/index.py
@@ -58,3 +58,16 @@ 
         for path in paths:
             filechgs.setdefault(path, []).append(ctx.node())
     return filechgs
+
+def makefiles(ctxs):
+    """Return the `files` index in the form of a dict.
+
+    `files` is keyed by file name, with each value being an empty string
+
+    ctxs: an iterable with change contexts for changes from which the index
+        is to be created.
+    """
+    files = set()
+    for ctx in ctxs:
+        files.update(ctx.files())
+    return dict([(fn, '') for fn in files])
diff --git a/hgext/speedy/server.py b/hgext/speedy/server.py
--- a/hgext/speedy/server.py
+++ b/hgext/speedy/server.py
@@ -14,6 +14,7 @@ 
 from mercurial import cmdutil
 from mercurial.i18n import _
 from mercurial import util
+from mercurial import match as matchmod
 import index
 import protocol
 import tcptransport
@@ -59,10 +60,44 @@ 
         return [node for node, date in self.chgdate.iteritems()
             if matcher(date)]
 
-    def path(self, paths):
-        """Return a list of changesets that modify any of the paths.
+    def path(self, matchargs):
+        """Return a list of changesets that modify the specified paths.
 
-        Only the changes present in `subset` are returned.
+        matchargs: a dict that may contain the following parameters:
+            'patterns' - a pattern list
+            'include' - a pattern list describing additional paths
+                to include
+            'exclude' - a pattern list describing additional paths
+                to exclude
+
+        A pattern list is a an iterable with pairs of (kind, pattern),
+                kind may be any pattern kind recognized by match.match
+                constructor except for 'relpath' and 'set'.
+
+        If all patterns are literal paths, we can compute answer very
+        fast using `filechgs` index (see _literalpath method). Otherwise,
+        we have to fall back to an exhaustive search (see _patternpath
+        method).
+        """
+        pats = matchargs.get('patterns', [])
+        include = matchargs.get('include', [])
+        exclude = matchargs.get('exclude', [])
+        kinds = [k for k, v in pats]
+        if not include and not exclude and kinds == ['path'] * len(kinds):
+            paths = [ v for k, v in pats ]
+            wanted = self._literalpath(paths)
+        else:
+            def patsconvert(pats):
+                return[':'.join(p) for p in pats]
+            patterns = patsconvert(pats)
+            match = matchmod.match(self.repo.root, self.repo.root,
+                    patterns, include=patsconvert(include),
+                    exclude=patsconvert(exclude))
+            wanted = self._patternpath(match)
+        return wanted
+
+    def _literalpath(self, paths):
+        """Return a list of changesets touching any of the paths.
 
         Uses `filechgs` index which provides the mapping from paths
         (files and directories) to a list of changes touching this path.
@@ -74,10 +109,23 @@ 
             nodes.update(newnodes)
         return list(nodes)
 
+    def _patternpath(self, match):
+        """Return a list of changesets matching given files.
+
+        match: a callable that defines which files are relevant.
+               File is relevant if match(filename) == True.
+
+        Slow compared to _literalpath since it iterates through all filenames
+        in the repository history.
+        """
+        matchingfiles = filter(match, self.files.keys())
+        return self._literalpath(matchingfiles)
+
 indicecfg = {
     'userchgs': index.makeuserchgs,
     'chgdate': index.makechgdate,
     'filechgs': index.makefilechgs,
+    'files': index.makefiles,
 }
 
 def makeserver(repo):
diff --git a/tests/test-speedy.t b/tests/test-speedy.t
--- a/tests/test-speedy.t
+++ b/tests/test-speedy.t
@@ -273,10 +273,10 @@ 
   $ hg log --rev "date(10/20/2012) & user(testuser2)"
   chg1
 
-  $ cat >> $TESTTMP/localrepo/.hg/hgrc <<EOF_END
-  > [speedy]
-  > client = False
-  > EOF_END
+  $ hg log "glob:d2/*" --exclude "**.py"
+  chgx
+  chgpushed
+  chg4
 
   $ cd $TESTTMP/serverrepo