Patchwork [7,of,8] ignore: add support for including subdir .hgignores

login
register
mail settings
Submitter Durham Goode
Date May 13, 2015, 3:13 p.m.
Message ID <42bafc92b436181ba5d6.1431530001@dev2000.prn2.facebook.com>
Download mbox | patch
Permalink /patch/9041/
State Changes Requested
Headers show

Comments

Durham Goode - May 13, 2015, 3:13 p.m.
# HG changeset patch
# User Durham Goode <durham@fb.com>
# Date 1431485142 25200
#      Tue May 12 19:45:42 2015 -0700
# Node ID 42bafc92b436181ba5d6568967577a214198648d
# Parent  f058e14998c947d86ccbdd17edaf2392b4190cce
ignore: add support for including subdir .hgignores

This adds support for including sub-hgignore files in a .hgignore. The syntax is
'%include path/to/.hgignore' and is relative to the current .hgignore. The rules
inside the sub-ignore file only apply to files in that subdirectory.

At the moment we only support globs in sub-ignore files. regexs will cause an
exception. This is because we can't reliabily modify a regex to have a prefix
(ex: adding a prefix to '^foo|^bar' would require parsing the regex). In a
subsequent patch we will add a config option to allow users to opt-in to regexs
in sub-ignores if they're certain their regexs are sane.
Siddharth Agarwal - May 14, 2015, 3:03 a.m.
On 05/13/2015 08:13 AM, Durham Goode wrote:
> # HG changeset patch
> # User Durham Goode <durham@fb.com>
> # Date 1431485142 25200
> #      Tue May 12 19:45:42 2015 -0700
> # Node ID 42bafc92b436181ba5d6568967577a214198648d
> # Parent  f058e14998c947d86ccbdd17edaf2392b4190cce
> ignore: add support for including subdir .hgignores

This should have a (BC) note.

> This adds support for including sub-hgignore files in a .hgignore. The syntax is
> '%include path/to/.hgignore' and is relative to the current .hgignore.

I actually suspect a better syntax might '#include path/to/.hgignore',
so that older Mercurial doesn't try and interpret the ignore rules as a
pattern. I know this makes it different from config rules, though.

>  The rules
> inside the sub-ignore file only apply to files in that subdirectory.
>
> At the moment we only support globs in sub-ignore files. regexs will cause an
> exception. This is because we can't reliabily
(sp)
>  modify a regex to have a prefix
> (ex: adding a prefix to '^foo|^bar' would require parsing the regex). In a
> subsequent patch we will add a config option to allow users to opt-in to regexs
> in sub-ignores if they're certain their regexs are sane.
>
> diff --git a/mercurial/ignore.py b/mercurial/ignore.py
> --- a/mercurial/ignore.py
> +++ b/mercurial/ignore.py
> @@ -19,6 +19,13 @@ def ignorepats(ui, root, filepath):
>      syntax = 'relre:'
>      patterns = []
>      warnings = []
> +    subpaths = set()
> +
> +    common = os.path.commonprefix([filepath, root])
> +    subdir = None
> +    if root and common == root:
> +        # Find the prefix that should be applied to these patterns
> +        subdir = os.path.dirname(filepath[len(root) + 1:])
>  
>      fp = open(filepath)
>  
> @@ -35,6 +42,18 @@ def ignorepats(ui, root, filepath):
>          if not line:
>              continue
>  
> +        if line.startswith('%include '):
> +            subpath = line[9:]
> +            fullpath = os.path.join(os.path.dirname(filepath), subpath)
> +            fullpath = os.path.normpath(fullpath)
> +            subpats, subsubpaths = readignorefile(ui, root, fullpath)

'subpats' is close enough to 'subpath' that my eyes crossed over while
reading this. Could we change one of the names?

> +
> +            subpaths.add(fullpath)
> +            subpaths.update(subsubpaths)
> +
> +            patterns.extend(subpats)
> +            continue
> +
>          if line.startswith('syntax:'):
>              s = line[7:].strip()
>              try:
> @@ -53,22 +72,33 @@ def ignorepats(ui, root, filepath):
>                  linesyntax = rels
>                  line = line[len(s) + 1:]
>                  break
> +
> +        if subdir:
> +            # The ignore file is inside the repo, so make its patterns
> +            # relative to its location.
> +            if linesyntax == 'relre:':
> +                message = _('cannot use sub-ignore files that use regular '
> +                            'expressions')
> +                raise util.Abort(message)
> +            elif linesyntax == 'relglob:':
> +                line = '%s/*%s' % (subdir, line)

What happens if linesyntax is something else?

>          patterns.append(linesyntax + line)
>  
>      fp.close()
> -    return patterns, warnings
> +    return patterns, warnings, subpaths
>  
>  def readignorefile(ui, root, filepath, skipwarning=False):
>      try:
>          pats = []
> -        pats, warnings = ignorepats(ui, root, filepath)
> +        subpaths = set()
> +        pats, warnings, subpaths = ignorepats(ui, root, filepath)
>          for warning in warnings:
>              ui.warn("%s: %s\n" % (filepath, warning))
>      except IOError, inst:
>          if not skipwarning:
>              ui.warn(_("skipping unreadable ignore file '%s': %s\n") %
>                   (filepath, inst.strerror))
> -    return pats
> +    return pats, subpaths
>  
>  def readpats(ui, root, files):
>      '''return a dict mapping ignore-file-name to list-of-patterns'''
> @@ -78,10 +108,16 @@ def readpats(ui, root, files):
>      for f in files:
>          if f in pats:
>              continue
> -        skipwarning = f == files[0]
>          fullpath = os.path.normpath(os.path.join(root, f))
>          paths.add(fullpath)
> -        pats[f] = readignorefile(ui, root, fullpath, skipwarning=skipwarning)
> +        relativeroot = ''
> +        skipwarning = False
> +        if f == files[0]:
> +            relativeroot = root
> +            skipwarning = True
> +        pats[f], subpaths = readignorefile(ui, relativeroot, fullpath,
> +                                           skipwarning=skipwarning)
> +        paths.update(subpaths)
>  
>      return [(f, pats[f]) for f in files if f in pats], paths
>  
> diff --git a/tests/test-hgignore.t b/tests/test-hgignore.t
> --- a/tests/test-hgignore.t
> +++ b/tests/test-hgignore.t
> @@ -167,3 +167,48 @@ Check recursive glob pattern matches no 
>    ? a.c
>    ? a.o
>    ? syntax
> +
> +Check including sub-ignores
> +
> +  $ echo "" > .hgignore
> +  $ hg revert -q --all
> +  $ hg purge --all --config extensions.purge=
> +  $ mkdir dir1 dir2
> +  $ touch dir1/file1 dir1/file2 dir2/file1 dir2/file2
> +  $ echo "%include dir1/.hgignore" >> .hgignore
> +  $ echo "%include dir2/.hgignore" >> .hgignore
> +  $ echo "glob:file*2" > dir2/.hgignore
> +  $ hg status | grep -v hgignore
> +  skipping unreadable ignore file '$TESTTMP/dir1/.hgignore': No such file or directory
> +  ? dir1/file1
> +  ? dir1/file2
> +  ? dir2/file1
> +
> +Check including sub-ignores with regexs
> +
> +  $ echo "regexp:f.le1" > dir1/.hgignore
> +  $ hg debugignore
> +  abort: cannot use sub-ignore files that use regular expressions
> +  [255]
> +
> +  $ echo "" > dir1/.hgignore
> +
> +  $ hg debugignore
> +  (?:(?:|.*/)dir2\/[^/]*file[^/]*2(?:/|$))
> +
> +Check multiple levels of sub-ignores
> +
> +  $ mkdir dir1/subdir
> +  $ touch dir1/subdir/subfile1 dir1/subdir/subfile3 dir1/subdir/subfile4
> +  $ echo "%include subdir/.hgignore" >> dir1/.hgignore
> +  $ echo "glob:fil*3" >> dir1/subdir/.hgignore
> +
> +  $ hg debugignore
> +  (?:(?:|.*/)dir1\/subdir\/[^/]*fil[^/]*3(?:/|$)|(?:|.*/)dir2\/[^/]*file[^/]*2(?:/|$))
> +
> +  $ hg status | grep -v hgignore
> +  ? dir1/file1
> +  ? dir1/file2
> +  ? dir1/subdir/subfile1
> +  ? dir1/subdir/subfile4
> +  ? dir2/file1
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
Durham Goode - May 14, 2015, 8:57 p.m.
On 5/13/15, 8:03 PM, "Siddharth Agarwal" <sid@less-broken.com> wrote:

>On 05/13/2015 08:13 AM, Durham Goode wrote:
>> # HG changeset patch
>> # User Durham Goode <durham@fb.com>
>> # Date 1431485142 25200
>> #      Tue May 12 19:45:42 2015 -0700
>> # Node ID 42bafc92b436181ba5d6568967577a214198648d
>> # Parent  f058e14998c947d86ccbdd17edaf2392b4190cce
>> ignore: add support for including subdir .hgignores
>
>This should have a (BC) note.
>
>> This adds support for including sub-hgignore files in a .hgignore. The
>>syntax is
>> '%include path/to/.hgignore' and is relative to the current .hgignore.
>
>I actually suspect a better syntax might '#include path/to/.hgignore',
>so that older Mercurial doesn't try and interpret the ignore rules as a
>pattern. I know this makes it different from config rules, though.

Talked a bit in IRC about this.  Moving to a
'subinclude:path/to/.hgignore' style syntax might be nice, since then
users could use a similar pattern of 'include:path/to/.hgignore' with -I
and -X and stuff.  I'll see if I can make that happen, then resend.

>
>>  The rules
>> inside the sub-ignore file only apply to files in that subdirectory.
>>
>> At the moment we only support globs in sub-ignore files. regexs will
>>cause an
>> exception. This is because we can't reliabily
>(sp)
>>  modify a regex to have a prefix
>> (ex: adding a prefix to '^foo|^bar' would require parsing the regex).
>>In a
>> subsequent patch we will add a config option to allow users to opt-in
>>to regexs
>> in sub-ignores if they're certain their regexs are sane.
>>
>> diff --git a/mercurial/ignore.py b/mercurial/ignore.py
>> --- a/mercurial/ignore.py
>> +++ b/mercurial/ignore.py
>> @@ -19,6 +19,13 @@ def ignorepats(ui, root, filepath):
>>      syntax = 'relre:'
>>      patterns = []
>>      warnings = []
>> +    subpaths = set()
>> +
>> +    common = os.path.commonprefix([filepath, root])
>> +    subdir = None
>> +    if root and common == root:
>> +        # Find the prefix that should be applied to these patterns
>> +        subdir = os.path.dirname(filepath[len(root) + 1:])
>>  
>>      fp = open(filepath)
>>  
>> @@ -35,6 +42,18 @@ def ignorepats(ui, root, filepath):
>>          if not line:
>>              continue
>>  
>> +        if line.startswith('%include '):
>> +            subpath = line[9:]
>> +            fullpath = os.path.join(os.path.dirname(filepath), subpath)
>> +            fullpath = os.path.normpath(fullpath)
>> +            subpats, subsubpaths = readignorefile(ui, root, fullpath)
>
>'subpats' is close enough to 'subpath' that my eyes crossed over while
>reading this. Could we change one of the names?
>
>> +
>> +            subpaths.add(fullpath)
>> +            subpaths.update(subsubpaths)
>> +
>> +            patterns.extend(subpats)
>> +            continue
>> +
>>          if line.startswith('syntax:'):
>>              s = line[7:].strip()
>>              try:
>> @@ -53,22 +72,33 @@ def ignorepats(ui, root, filepath):
>>                  linesyntax = rels
>>                  line = line[len(s) + 1:]
>>                  break
>> +
>> +        if subdir:
>> +            # The ignore file is inside the repo, so make its patterns
>> +            # relative to its location.
>> +            if linesyntax == 'relre:':
>> +                message = _('cannot use sub-ignore files that use
>>regular '
>> +                            'expressions')
>> +                raise util.Abort(message)
>> +            elif linesyntax == 'relglob:':
>> +                line = '%s/*%s' % (subdir, line)
>
>What happens if linesyntax is something else?

I don't think ignore files ever worked with anything besides glob and
regex. But if it's something else, we'll just not prefix it.
Martin Geisler - May 15, 2015, 10:22 a.m.
Durham Goode <durham@fb.com> writes:

> At the moment we only support globs in sub-ignore files. regexs will
> cause an exception. This is because we can't reliabily modify a regex
> to have a prefix (ex: adding a prefix to '^foo|^bar' would require
> parsing the regex).

That is surprising from a high-level since regular languages are closed
under concatenation.

However, I see what you're saying: blindly adding a prefix to a regex
doesn't do what you expect. I didn't look at the mechanics of the code,
but if you could strip off the path elements as you descend down the
directory tree, you should get the right behavior.

So if we have

  root/sub-dir/foo/
               bar/
               .hgignore <- '^foo|^bar'
               baz.txt
       .hgignore <- '#include sub-dir/.hgignore'

then don't match

  sub-dir/foo/
  sub-dir/bar/

against

  sub-dir/(^foo|^bar)

Instead match

  foo/
  bar/

against

  ^foo|^bar

to conclude the directories should be ignored.

Maybe that's not what you want if you intend to make one big regular
expression upfront and run all paths through that.
Martin von Zweigbergk - May 15, 2015, 1:09 p.m.
I agree with the other Martin. I'm not at a computer now, so it's hard to
check, but where are the ignore patterns used? I know it's used while
walking the working copy in dirstate.py. I would think that that piece of
code would not mind if it had to create a new union matcher every time it
visited a subdirectory mentioned in an include. Would applying the matcher
to only the path within the subdirectory be a problem performance-wise?

Are the .hgignore files always read from the working copy? If not, it would
be nice to not have to read all the submanifests (when using such) for
operations that care only about some subdirectory.

Also, does the subdirectory .hgignore have to be called exactly .hgignore?
I'm just wondering at this point; I haven't decided whether I think it
should be required or not.

On Fri, May 15, 2015, 03:23 Martin Geisler <martin@geisler.net> wrote:

> Durham Goode <durham@fb.com> writes:
>
> > At the moment we only support globs in sub-ignore files. regexs will
> > cause an exception. This is because we can't reliabily modify a regex
> > to have a prefix (ex: adding a prefix to '^foo|^bar' would require
> > parsing the regex).
>
> That is surprising from a high-level since regular languages are closed
> under concatenation.
>
> However, I see what you're saying: blindly adding a prefix to a regex
> doesn't do what you expect. I didn't look at the mechanics of the code,
> but if you could strip off the path elements as you descend down the
> directory tree, you should get the right behavior.
>
> So if we have
>
>   root/sub-dir/foo/
>                bar/
>                .hgignore <- '^foo|^bar'
>                baz.txt
>        .hgignore <- '#include sub-dir/.hgignore'
>
> then don't match
>
>   sub-dir/foo/
>   sub-dir/bar/
>
> against
>
>   sub-dir/(^foo|^bar)
>
> Instead match
>
>   foo/
>   bar/
>
> against
>
>   ^foo|^bar
>
> to conclude the directories should be ignored.
>
> Maybe that's not what you want if you intend to make one big regular
> expression upfront and run all paths through that.
>
> --
> Martin Geisler
>
> http://google.com/+MartinGeisler
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
>
Durham Goode - May 15, 2015, 6:22 p.m.
On 5/15/15 6:09 AM, Martin von Zweigbergk wrote:
>
> I agree with the other Martin. I'm not at a computer now, so it's hard 
> to check, but where are the ignore patterns used? I know it's used 
> while walking the working copy in dirstate.py. I would think that that 
> piece of code would not mind if it had to create a new union matcher 
> every time it visited a subdirectory mentioned in an include. Would 
> applying the matcher to only the path within the subdirectory be a 
> problem performance-wise?
>
The matcher is a bit of a hotpath (our working copies can have 500,000+ 
files) and right now every match is done entirely in the native re2 
code.  I worry that adding additional string and python logic to every 
path match would have a perf impact.  I guess I can hack something up to 
test it.

 From a simple grepping, ignores seem to only be used for dirstate.walk.
>
> Are the .hgignore files always read from the working copy? If not, it 
> would be nice to not have to read all the submanifests (when using 
> such) for operations that care only about some subdirectory.
>
 From what I can tell, it's always read from the working copy.
>
> Also, does the subdirectory .hgignore have to be called exactly 
> .hgignore? I'm just wondering at this point; I haven't decided whether 
> I think it should be required or not.
>
Nope, it can be called whatever you want.
>
>
> On Fri, May 15, 2015, 03:23 Martin Geisler <martin@geisler.net 
> <mailto:martin@geisler.net>> wrote:
>
>     Durham Goode <durham@fb.com <mailto:durham@fb.com>> writes:
>
>     > At the moment we only support globs in sub-ignore files. regexs will
>     > cause an exception. This is because we can't reliabily modify a
>     regex
>     > to have a prefix (ex: adding a prefix to '^foo|^bar' would require
>     > parsing the regex).
>
>     That is surprising from a high-level since regular languages are
>     closed
>     under concatenation.
>
>     However, I see what you're saying: blindly adding a prefix to a regex
>     doesn't do what you expect. I didn't look at the mechanics of the
>     code,
>     but if you could strip off the path elements as you descend down the
>     directory tree, you should get the right behavior.
>
>     So if we have
>
>       root/sub-dir/foo/
>                    bar/
>                    .hgignore <- '^foo|^bar'
>                    baz.txt
>            .hgignore <- '#include sub-dir/.hgignore'
>
>     then don't match
>
>       sub-dir/foo/
>       sub-dir/bar/
>
>     against
>
>       sub-dir/(^foo|^bar)
>
>     Instead match
>
>       foo/
>       bar/
>
>     against
>
>       ^foo|^bar
>
>     to conclude the directories should be ignored.
>
>     Maybe that's not what you want if you intend to make one big regular
>     expression upfront and run all paths through that.
>
>     --
>     Martin Geisler
>
>     http://google.com/+MartinGeisler
>     <https://urldefense.proofpoint.com/v1/url?u=http://google.com/%2BMartinGeisler&k=ZVNjlDMF0FElm4dQtryO4A%3D%3D%0A&r=pHOG6Hz51SkYmYr%2FxoTFzw%3D%3D%0A&m=OSwxzUTuacvvTTlpHg4tkWDQsomewE5wIJjBQqpEva0%3D%0A&s=0ac94e73b8a813d7fb289cad58251579cae1285ec40b11360fb9c4500c352fa7>
>     _______________________________________________
>     Mercurial-devel mailing list
>     Mercurial-devel@selenic.com <mailto:Mercurial-devel@selenic.com>
>     http://selenic.com/mailman/listinfo/mercurial-devel
>

Patch

diff --git a/mercurial/ignore.py b/mercurial/ignore.py
--- a/mercurial/ignore.py
+++ b/mercurial/ignore.py
@@ -19,6 +19,13 @@  def ignorepats(ui, root, filepath):
     syntax = 'relre:'
     patterns = []
     warnings = []
+    subpaths = set()
+
+    common = os.path.commonprefix([filepath, root])
+    subdir = None
+    if root and common == root:
+        # Find the prefix that should be applied to these patterns
+        subdir = os.path.dirname(filepath[len(root) + 1:])
 
     fp = open(filepath)
 
@@ -35,6 +42,18 @@  def ignorepats(ui, root, filepath):
         if not line:
             continue
 
+        if line.startswith('%include '):
+            subpath = line[9:]
+            fullpath = os.path.join(os.path.dirname(filepath), subpath)
+            fullpath = os.path.normpath(fullpath)
+            subpats, subsubpaths = readignorefile(ui, root, fullpath)
+
+            subpaths.add(fullpath)
+            subpaths.update(subsubpaths)
+
+            patterns.extend(subpats)
+            continue
+
         if line.startswith('syntax:'):
             s = line[7:].strip()
             try:
@@ -53,22 +72,33 @@  def ignorepats(ui, root, filepath):
                 linesyntax = rels
                 line = line[len(s) + 1:]
                 break
+
+        if subdir:
+            # The ignore file is inside the repo, so make its patterns
+            # relative to its location.
+            if linesyntax == 'relre:':
+                message = _('cannot use sub-ignore files that use regular '
+                            'expressions')
+                raise util.Abort(message)
+            elif linesyntax == 'relglob:':
+                line = '%s/*%s' % (subdir, line)
         patterns.append(linesyntax + line)
 
     fp.close()
-    return patterns, warnings
+    return patterns, warnings, subpaths
 
 def readignorefile(ui, root, filepath, skipwarning=False):
     try:
         pats = []
-        pats, warnings = ignorepats(ui, root, filepath)
+        subpaths = set()
+        pats, warnings, subpaths = ignorepats(ui, root, filepath)
         for warning in warnings:
             ui.warn("%s: %s\n" % (filepath, warning))
     except IOError, inst:
         if not skipwarning:
             ui.warn(_("skipping unreadable ignore file '%s': %s\n") %
                  (filepath, inst.strerror))
-    return pats
+    return pats, subpaths
 
 def readpats(ui, root, files):
     '''return a dict mapping ignore-file-name to list-of-patterns'''
@@ -78,10 +108,16 @@  def readpats(ui, root, files):
     for f in files:
         if f in pats:
             continue
-        skipwarning = f == files[0]
         fullpath = os.path.normpath(os.path.join(root, f))
         paths.add(fullpath)
-        pats[f] = readignorefile(ui, root, fullpath, skipwarning=skipwarning)
+        relativeroot = ''
+        skipwarning = False
+        if f == files[0]:
+            relativeroot = root
+            skipwarning = True
+        pats[f], subpaths = readignorefile(ui, relativeroot, fullpath,
+                                           skipwarning=skipwarning)
+        paths.update(subpaths)
 
     return [(f, pats[f]) for f in files if f in pats], paths
 
diff --git a/tests/test-hgignore.t b/tests/test-hgignore.t
--- a/tests/test-hgignore.t
+++ b/tests/test-hgignore.t
@@ -167,3 +167,48 @@  Check recursive glob pattern matches no 
   ? a.c
   ? a.o
   ? syntax
+
+Check including sub-ignores
+
+  $ echo "" > .hgignore
+  $ hg revert -q --all
+  $ hg purge --all --config extensions.purge=
+  $ mkdir dir1 dir2
+  $ touch dir1/file1 dir1/file2 dir2/file1 dir2/file2
+  $ echo "%include dir1/.hgignore" >> .hgignore
+  $ echo "%include dir2/.hgignore" >> .hgignore
+  $ echo "glob:file*2" > dir2/.hgignore
+  $ hg status | grep -v hgignore
+  skipping unreadable ignore file '$TESTTMP/dir1/.hgignore': No such file or directory
+  ? dir1/file1
+  ? dir1/file2
+  ? dir2/file1
+
+Check including sub-ignores with regexs
+
+  $ echo "regexp:f.le1" > dir1/.hgignore
+  $ hg debugignore
+  abort: cannot use sub-ignore files that use regular expressions
+  [255]
+
+  $ echo "" > dir1/.hgignore
+
+  $ hg debugignore
+  (?:(?:|.*/)dir2\/[^/]*file[^/]*2(?:/|$))
+
+Check multiple levels of sub-ignores
+
+  $ mkdir dir1/subdir
+  $ touch dir1/subdir/subfile1 dir1/subdir/subfile3 dir1/subdir/subfile4
+  $ echo "%include subdir/.hgignore" >> dir1/.hgignore
+  $ echo "glob:fil*3" >> dir1/subdir/.hgignore
+
+  $ hg debugignore
+  (?:(?:|.*/)dir1\/subdir\/[^/]*fil[^/]*3(?:/|$)|(?:|.*/)dir2\/[^/]*file[^/]*2(?:/|$))
+
+  $ hg status | grep -v hgignore
+  ? dir1/file1
+  ? dir1/file2
+  ? dir1/subdir/subfile1
+  ? dir1/subdir/subfile4
+  ? dir2/file1