Submitter | Alastair Houghton |
---|---|
Date | Dec. 6, 2012, 2:44 p.m. |
Message ID | <2436a24abdb9c4f6cc88.1354805050@lithium.coriolis-systems.com> |
Download | mbox | patch |
Permalink | /patch/26/ |
State | Superseded |
Headers | show |
Comments
On Thu, 2012-12-06 at 14:44 +0000, Alastair Houghton wrote: > # HG changeset patch > # User Alastair Houghton <alastair at coriolis-systems.com> > # Date 1354804714 0 > # Node ID 2436a24abdb9c4f6cc880dd1194d6f235b82f28c > # Parent 9b05b31b413c55d750ca8be41330ff65fa9ed8c1 > patch: improve handling of filenames containing spaces on import (issue3723) > Due to the lack of quoting or escaping in typical diff output, it is Between your bug report and this description, there's lots of confusion about whether you're talking about git diffs, standard diffs, or diffs in general. I've been assuming the latter up to now. Can I please get you to rephrase this description to be precise about its scope (and not muddy the waters with anything outside that scope)? In particular, this doesn't mention git in the summary, and mentions 'typical diffs', but apparently only touches git diffs. One is left unclear on whether spaces in filenames are an issue for diffs generally (and thus whether 3723 is addressed, due to same ambiguity). > difficult to parse certain patches properly, especially when the > filename(s) involved contain spaces. This becomes especially problematic > with git-format diffs if one of the filenames contains a path ending in " b/", > as that causes lines like the following to appear in the patch: > > diff --git a/foo b/foo.txt b/foo b/foo.txt > > Mercurial already copes with renames and copies by examining the additional > headers that are part of the git diff output. This patch fixes it to work > with other kinds of git diff, by looking to see whether or not the patch > being examined is a rename or a copy and if it is not, re-parsing the diff > line on the assumption that the name must be the same in both places. > > In order to do this properly, the iterhunks() function has been modified > to defer git patch handling until it hits the "---" or "GIT binary patch" > line---that way, it will know whether or not the diff line it was looking > at is a rename or a copy. > > diff --git a/mercurial/patch.py b/mercurial/patch.py > --- a/mercurial/patch.py > +++ b/mercurial/patch.py > @@ -15,6 +15,7 @@ > import context > > gitre = re.compile('diff --git a/(.*) b/(.*)') > +gitre_no_namechange = re.compile(r'diff --git a/(.*) b/\1$') > > class PatchError(Exception): > pass > @@ -270,7 +271,8 @@ > 'islink' is True if the file is a symlink and 'isexec' is True if > the file is executable. Otherwise, 'mode' is None. > """ > - def __init__(self, path): > + def __init__(self, path, diffline=None): > + self.diffline = diffline > self.path = path > self.oldpath = None > self.mode = None > @@ -283,13 +285,26 @@ > self.mode = (islink, isexec) > > def copy(self): > - other = patchmeta(self.path) > + other = patchmeta(self.path, diffline=self.diffline) > other.oldpath = self.oldpath > other.mode = self.mode > other.op = self.op > other.binary = self.binary > return other > > + def reparse(self): > + if self.op == 'RENAME' or self.op == 'COPY': > + return True > + > + # reparse the diffline with a stricter regex > + m = gitre_no_namechange.match(self.diffline) > + if not m: > + return False > + > + self.path = m.group(1) > + > + return True > + > def _ispatchinga(self, afile): > if afile == '/dev/null': > return self.op == 'ADD' > @@ -317,16 +332,16 @@ > if line.startswith('diff --git'): > m = gitre.match(line) > if m: > - if gp: > + if gp and gp.reparse(): > gitpatches.append(gp) > dst = m.group(2) > - gp = patchmeta(dst) > + gp = patchmeta(dst, diffline=line) > elif gp: > if line.startswith('--- '): > - gitpatches.append(gp) > + if gp.reparse(): > + gitpatches.append(gp) > gp = None > - continue > - if line.startswith('rename from '): > + elif line.startswith('rename from '): > gp.op = 'RENAME' > gp.oldpath = line[12:] > elif line.startswith('rename to '): > @@ -345,7 +360,11 @@ > gp.setmode(int(line[-6:], 8)) > elif line.startswith('GIT binary patch'): > gp.binary = True > - if gp: > + if gp.reparse(): > + gitpatches.append(gp) > + gp = None > + > + if gp and gp.reparse(): > gitpatches.append(gp) > > return gitpatches > @@ -1189,6 +1208,11 @@ > > # our states > BFILE = 1 > + > + scanninggit = False > + diffline = None > + gitop = None > + > context = None > lr = linereader(fp) > > @@ -1196,6 +1220,51 @@ > x = lr.readline() > if not x: > break > + > + if scanninggit: > + # git format patch headers end at --- or GIT binary patch > + if x.startswith('---') or x.startswith('GIT binary patch'): > + scanninggit = False > + > + ok = True > + if gitop != 'RENAME' and gitop != 'COPY': > + # use the stricter regex > + m = gitre_no_namechange.match(diffline) > + if m: > + afile = 'a/' + m.group(1) > + bfile = 'b/' + m.group(1) > + else: > + ok = False > + > + if ok: > + while gitpatches \ > + and not gitpatches[-1].ispatching(afile, bfile): > + gp = gitpatches.pop() > + yield 'file', ('a/' + gp.path, 'b/' + gp.path, None, > + gp.copy()) > + if not gitpatches: > + raise PatchError(_('failed to synchronize metadata ' > + 'for "%s"') > + % afile[2:]) > + gp = gitpatches[-1] > + newfile = True > + elif x.startswith('rename from '): > + gitop = 'RENAME' > + afile = 'a/' + x[12:].rstrip(' \r\n') > + elif x.startswith('rename to '): > + bfile = 'b/' + x[10:].rstrip(' \r\n') > + elif x.startswith('copy from '): > + gitop = 'COPY' > + afile = 'a/' + x[10:].rstrip(' \r\n') > + elif x.startswith('copy to '): > + bfile = 'b/' + x[8:].rstrip(' \r\n') > + > + if newfile: > + newfile = False > + emitfile = True > + state = BFILE > + hunknum = 0 > + > if state == BFILE and ( > (not context and x[0] == '@') > or (context is not False and x.startswith('***************')) > @@ -1216,25 +1285,26 @@ > yield 'file', (afile, bfile, h, gp and gp.copy() or None) > yield 'hunk', h > elif x.startswith('diff --git'): > - m = gitre.match(x.rstrip(' \r\n')) > + xs = x.rstrip(' \r\n') > + m = gitre.match(xs) > if not m: > continue > + > if gitpatches is None: > # scan whole input for git metadata > gitpatches = scangitpatch(lr, x) > yield 'git', [g.copy() for g in gitpatches > if g.op in ('COPY', 'RENAME')] > gitpatches.reverse() > + > + # defer processing of git metadata until we know for certain > + # the names of the files (we need to see the metadata for this > + # patch before we really know what they are) > + scanninggit = True > + diffline = xs > afile = 'a/' + m.group(1) > bfile = 'b/' + m.group(2) > - while gitpatches and not gitpatches[-1].ispatching(afile, bfile): > - gp = gitpatches.pop() > - yield 'file', ('a/' + gp.path, 'b/' + gp.path, None, gp.copy()) > - if not gitpatches: > - raise PatchError(_('failed to synchronize metadata for "%s"') > - % afile[2:]) > - gp = gitpatches[-1] > - newfile = True > + gitop = 'MODIFY' > elif x.startswith('---'): > # check for a unified diff > l2 = lr.readline() > @@ -1261,11 +1331,6 @@ > afile = parsefilename(x) > bfile = parsefilename(l2) > > - if newfile: > - newfile = False > - emitfile = True > - state = BFILE > - hunknum = 0 > > while gitpatches: > gp = gitpatches.pop()
On 6 Dec 2012, at 18:55, Matt Mackall <mpm at selenic.com> wrote: > Between your bug report and this description, there's lots of confusion > about whether you're talking about git diffs, standard diffs, or diffs > in general. I've been assuming the latter up to now. Well, it is a general problem, but I certainly hadn't intended to try to fix it for the general case because there isn't a good general solution. It seems to me that you can only reliably solve it for the git format; you *might* get away with a fix for non-git-format unified/contextual diffs that parsed the --- and +++ lines to grab the filename(s) from there, though I imagine there might be problems separating the name and the date/time and/or extra information that some tools put there. Anyway, I didn't specifically set out to fix any of this; I was just looking at the Mercurial code in this area to see how Mercurial went about parsing patches and generating diffs, mainly because I want to parse some diffs myself and wanted to know what formats I was likely to encounter. Sadly there doesn't seem to be any written specification that I could find that actually defines the diff formats in any detail---the most detailed thing I can find is the Single Unix Specification, which appears in some respects to be inaccurate. I should also add that, contrary to the impression that the few people who have commented on the bug report appear to have, I was not and have never been wedded to the idea of changing Mercurial's output. It just seemed logical that git format should generally behave the same way git does, and since that fixes the problem it seemed a sensible idea. Perhaps I hadn't appreciated it would be such a contentious topic. Oh, one other thing I realised about my patch after sending it---it currently doesn't fix the diffstat functions. Kind regards, Alastair. -- http://alastairs-place.net
Patch
diff --git a/mercurial/patch.py b/mercurial/patch.py --- a/mercurial/patch.py +++ b/mercurial/patch.py @@ -15,6 +15,7 @@ import context gitre = re.compile('diff --git a/(.*) b/(.*)') +gitre_no_namechange = re.compile(r'diff --git a/(.*) b/\1$') class PatchError(Exception): pass @@ -270,7 +271,8 @@ 'islink' is True if the file is a symlink and 'isexec' is True if the file is executable. Otherwise, 'mode' is None. """ - def __init__(self, path): + def __init__(self, path, diffline=None): + self.diffline = diffline self.path = path self.oldpath = None self.mode = None @@ -283,13 +285,26 @@ self.mode = (islink, isexec) def copy(self): - other = patchmeta(self.path) + other = patchmeta(self.path, diffline=self.diffline) other.oldpath = self.oldpath other.mode = self.mode other.op = self.op other.binary = self.binary return other + def reparse(self): + if self.op == 'RENAME' or self.op == 'COPY': + return True + + # reparse the diffline with a stricter regex + m = gitre_no_namechange.match(self.diffline) + if not m: + return False + + self.path = m.group(1) + + return True + def _ispatchinga(self, afile): if afile == '/dev/null': return self.op == 'ADD' @@ -317,16 +332,16 @@ if line.startswith('diff --git'): m = gitre.match(line) if m: - if gp: + if gp and gp.reparse(): gitpatches.append(gp) dst = m.group(2) - gp = patchmeta(dst) + gp = patchmeta(dst, diffline=line) elif gp: if line.startswith('--- '): - gitpatches.append(gp) + if gp.reparse(): + gitpatches.append(gp) gp = None - continue - if line.startswith('rename from '): + elif line.startswith('rename from '): gp.op = 'RENAME' gp.oldpath = line[12:] elif line.startswith('rename to '): @@ -345,7 +360,11 @@ gp.setmode(int(line[-6:], 8)) elif line.startswith('GIT binary patch'): gp.binary = True - if gp: + if gp.reparse(): + gitpatches.append(gp) + gp = None + + if gp and gp.reparse(): gitpatches.append(gp) return gitpatches @@ -1189,6 +1208,11 @@ # our states BFILE = 1 + + scanninggit = False + diffline = None + gitop = None + context = None lr = linereader(fp) @@ -1196,6 +1220,51 @@ x = lr.readline() if not x: break + + if scanninggit: + # git format patch headers end at --- or GIT binary patch + if x.startswith('---') or x.startswith('GIT binary patch'): + scanninggit = False + + ok = True + if gitop != 'RENAME' and gitop != 'COPY': + # use the stricter regex + m = gitre_no_namechange.match(diffline) + if m: + afile = 'a/' + m.group(1) + bfile = 'b/' + m.group(1) + else: + ok = False + + if ok: + while gitpatches \ + and not gitpatches[-1].ispatching(afile, bfile): + gp = gitpatches.pop() + yield 'file', ('a/' + gp.path, 'b/' + gp.path, None, + gp.copy()) + if not gitpatches: + raise PatchError(_('failed to synchronize metadata ' + 'for "%s"') + % afile[2:]) + gp = gitpatches[-1] + newfile = True + elif x.startswith('rename from '): + gitop = 'RENAME' + afile = 'a/' + x[12:].rstrip(' \r\n') + elif x.startswith('rename to '): + bfile = 'b/' + x[10:].rstrip(' \r\n') + elif x.startswith('copy from '): + gitop = 'COPY' + afile = 'a/' + x[10:].rstrip(' \r\n') + elif x.startswith('copy to '): + bfile = 'b/' + x[8:].rstrip(' \r\n') + + if newfile: + newfile = False + emitfile = True + state = BFILE + hunknum = 0 + if state == BFILE and ( (not context and x[0] == '@') or (context is not False and x.startswith('***************')) @@ -1216,25 +1285,26 @@ yield 'file', (afile, bfile, h, gp and gp.copy() or None) yield 'hunk', h elif x.startswith('diff --git'): - m = gitre.match(x.rstrip(' \r\n')) + xs = x.rstrip(' \r\n') + m = gitre.match(xs) if not m: continue + if gitpatches is None: # scan whole input for git metadata gitpatches = scangitpatch(lr, x) yield 'git', [g.copy() for g in gitpatches if g.op in ('COPY', 'RENAME')] gitpatches.reverse() + + # defer processing of git metadata until we know for certain + # the names of the files (we need to see the metadata for this + # patch before we really know what they are) + scanninggit = True + diffline = xs afile = 'a/' + m.group(1) bfile = 'b/' + m.group(2) - while gitpatches and not gitpatches[-1].ispatching(afile, bfile): - gp = gitpatches.pop() - yield 'file', ('a/' + gp.path, 'b/' + gp.path, None, gp.copy()) - if not gitpatches: - raise PatchError(_('failed to synchronize metadata for "%s"') - % afile[2:]) - gp = gitpatches[-1] - newfile = True + gitop = 'MODIFY' elif x.startswith('---'): # check for a unified diff l2 = lr.readline() @@ -1261,11 +1331,6 @@ afile = parsefilename(x) bfile = parsefilename(l2) - if newfile: - newfile = False - emitfile = True - state = BFILE - hunknum = 0 while gitpatches: gp = gitpatches.pop()