Submitter | Durham Goode |
---|---|
Date | Nov. 14, 2016, 11:27 p.m. |
Message ID | <27209d52a5865422c5ef.1479166029@dev111.prn1.facebook.com> |
Download | mbox | patch |
Permalink | /patch/17566/ |
State | Accepted |
Headers | show |
Comments
On Mon, Nov 14, 2016 at 3:27 PM, Durham Goode <durham@fb.com> wrote: > # HG changeset patch > # User Durham Goode <durham@fb.com> > # Date 1479165447 28800 > # Mon Nov 14 15:17:27 2016 -0800 > # Node ID 27209d52a5865422c5ef4ba05cedb28ce32919ed > # Parent 046a7e828ea63ec940ffae1089a33fae7954da2e > manifest: make revlog verification optional > > This patches adds an parameter to manifestlog.get() to disable hash checking. > This will be used in an upcoming patch to support treemanifestctx reading > sub-trees without loading them from the revlog. (This is already supported but > does not go through the manifestlog.get() code path) > > diff --git a/mercurial/manifest.py b/mercurial/manifest.py > --- a/mercurial/manifest.py > +++ b/mercurial/manifest.py > @@ -1278,9 +1278,12 @@ class manifestlog(object): > """ > return self.get('', node) > > - def get(self, dir, node): > + def get(self, dir, node, verify=True): > """Retrieves the manifest instance for the given node. Throws a > LookupError if not found. > + > + `verify` - if True an exception will be thrown if the node is not in > + the revlog Patch 2/2 says "Set verify to False since we need to be able to create subtrees for trees that don't exist on disk yet.", which makes it sounds like it's only about reading empty revlogs, which should be safe anyway. What am I missing? > """ > if node in self._dirmancache.get(dir, ()): > cachemf = self._dirmancache[dir][node] > @@ -1292,19 +1295,21 @@ class manifestlog(object): > > if dir: > if self._revlog._treeondisk: > - dirlog = self._revlog.dirlog(dir) > - if node not in dirlog.nodemap: > - raise LookupError(node, dirlog.indexfile, > - _('no node')) > + if verify: > + dirlog = self._revlog.dirlog(dir) > + if node not in dirlog.nodemap: > + raise LookupError(node, dirlog.indexfile, > + _('no node')) > m = treemanifestctx(self._repo, dir, node) > else: > raise error.Abort( > _("cannot ask for manifest directory '%s' in a flat " > "manifest") % dir) > else: > - if node not in self._revlog.nodemap: > - raise LookupError(node, self._revlog.indexfile, > - _('no node')) > + if verify: > + if node not in self._revlog.nodemap: > + raise LookupError(node, self._revlog.indexfile, > + _('no node')) > if self._treeinmem: > m = treemanifestctx(self._repo, '', node) > else: > _______________________________________________ > Mercurial-devel mailing list > Mercurial-devel@mercurial-scm.org > https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
On 11/14/16 4:23 PM, Martin von Zweigbergk wrote: > On Mon, Nov 14, 2016 at 3:27 PM, Durham Goode <durham@fb.com> wrote: >> # HG changeset patch >> # User Durham Goode <durham@fb.com> >> # Date 1479165447 28800 >> # Mon Nov 14 15:17:27 2016 -0800 >> # Node ID 27209d52a5865422c5ef4ba05cedb28ce32919ed >> # Parent 046a7e828ea63ec940ffae1089a33fae7954da2e >> manifest: make revlog verification optional >> >> This patches adds an parameter to manifestlog.get() to disable hash checking. >> This will be used in an upcoming patch to support treemanifestctx reading >> sub-trees without loading them from the revlog. (This is already supported but >> does not go through the manifestlog.get() code path) >> >> diff --git a/mercurial/manifest.py b/mercurial/manifest.py >> --- a/mercurial/manifest.py >> +++ b/mercurial/manifest.py >> @@ -1278,9 +1278,12 @@ class manifestlog(object): >> """ >> return self.get('', node) >> >> - def get(self, dir, node): >> + def get(self, dir, node, verify=True): >> """Retrieves the manifest instance for the given node. Throws a >> LookupError if not found. >> + >> + `verify` - if True an exception will be thrown if the node is not in >> + the revlog > Patch 2/2 says "Set verify to False since we need to be able to create > subtrees for trees that don't exist on disk yet.", which makes it > sounds like it's only about reading empty revlogs, which should be > safe anyway. What am I missing? "yet" was probably the wrong word there. This logic is to support the test case where the test moves some of the revlogs out of the way and expects the treemanifest to still work because those revlogs are outside the narrow spec. So this verification needs to be turned off so the ctx can be created (and it just so happens that the contents of the ctx are never accessed). I think it's a bit weird that readsubtree() is called for trees outside the narrow spec (which is why we need to handle this case), but that would require a deeper refactoring.
On Mon, Nov 14, 2016 at 4:26 PM, Durham Goode <durham@fb.com> wrote: > On 11/14/16 4:23 PM, Martin von Zweigbergk wrote: >> >> On Mon, Nov 14, 2016 at 3:27 PM, Durham Goode <durham@fb.com> wrote: >>> >>> # HG changeset patch >>> # User Durham Goode <durham@fb.com> >>> # Date 1479165447 28800 >>> # Mon Nov 14 15:17:27 2016 -0800 >>> # Node ID 27209d52a5865422c5ef4ba05cedb28ce32919ed >>> # Parent 046a7e828ea63ec940ffae1089a33fae7954da2e >>> manifest: make revlog verification optional >>> >>> This patches adds an parameter to manifestlog.get() to disable hash >>> checking. >>> This will be used in an upcoming patch to support treemanifestctx reading >>> sub-trees without loading them from the revlog. (This is already >>> supported but >>> does not go through the manifestlog.get() code path) >>> >>> diff --git a/mercurial/manifest.py b/mercurial/manifest.py >>> --- a/mercurial/manifest.py >>> +++ b/mercurial/manifest.py >>> @@ -1278,9 +1278,12 @@ class manifestlog(object): >>> """ >>> return self.get('', node) >>> >>> - def get(self, dir, node): >>> + def get(self, dir, node, verify=True): >>> """Retrieves the manifest instance for the given node. Throws a >>> LookupError if not found. >>> + >>> + `verify` - if True an exception will be thrown if the node is >>> not in >>> + the revlog >> >> Patch 2/2 says "Set verify to False since we need to be able to create >> subtrees for trees that don't exist on disk yet.", which makes it >> sounds like it's only about reading empty revlogs, which should be >> safe anyway. What am I missing? > > "yet" was probably the wrong word there. This logic is to support the test > case where the test moves some of the revlogs out of the way and expects the > treemanifest to still work because those revlogs are outside the narrow > spec. So this verification needs to be turned off so the ctx can be created > (and it just so happens that the contents of the ctx are never accessed). Ah, I see. Another question: When do we want to verify? Tests seem to pass even if I always turn of verification (defaulted it to False). > > I think it's a bit weird that readsubtree() is called for trees outside the > narrow spec (which is why we need to handle this case), but that would > require a deeper refactoring.
On 11/14/16 4:48 PM, Martin von Zweigbergk wrote: > On Mon, Nov 14, 2016 at 4:26 PM, Durham Goode <durham@fb.com> wrote: >> On 11/14/16 4:23 PM, Martin von Zweigbergk wrote: >>> On Mon, Nov 14, 2016 at 3:27 PM, Durham Goode <durham@fb.com> wrote: >>>> # HG changeset patch >>>> # User Durham Goode <durham@fb.com> >>>> # Date 1479165447 28800 >>>> # Mon Nov 14 15:17:27 2016 -0800 >>>> # Node ID 27209d52a5865422c5ef4ba05cedb28ce32919ed >>>> # Parent 046a7e828ea63ec940ffae1089a33fae7954da2e >>>> manifest: make revlog verification optional >>>> >>>> This patches adds an parameter to manifestlog.get() to disable hash >>>> checking. >>>> This will be used in an upcoming patch to support treemanifestctx reading >>>> sub-trees without loading them from the revlog. (This is already >>>> supported but >>>> does not go through the manifestlog.get() code path) >>>> >>>> diff --git a/mercurial/manifest.py b/mercurial/manifest.py >>>> --- a/mercurial/manifest.py >>>> +++ b/mercurial/manifest.py >>>> @@ -1278,9 +1278,12 @@ class manifestlog(object): >>>> """ >>>> return self.get('', node) >>>> >>>> - def get(self, dir, node): >>>> + def get(self, dir, node, verify=True): >>>> """Retrieves the manifest instance for the given node. Throws a >>>> LookupError if not found. >>>> + >>>> + `verify` - if True an exception will be thrown if the node is >>>> not in >>>> + the revlog >>> Patch 2/2 says "Set verify to False since we need to be able to create >>> subtrees for trees that don't exist on disk yet.", which makes it >>> sounds like it's only about reading empty revlogs, which should be >>> safe anyway. What am I missing? >> "yet" was probably the wrong word there. This logic is to support the test >> case where the test moves some of the revlogs out of the way and expects the >> treemanifest to still work because those revlogs are outside the narrow >> spec. So this verification needs to be turned off so the ctx can be created >> (and it just so happens that the contents of the ctx are never accessed). > Ah, I see. Another question: When do we want to verify? Tests seem to > pass even if I always turn of verification (defaulted it to False). > I think we want to verify all the time, since it's good to fail early if someone requests a manifest that doesn't exist. The tests don't fail because the only time verification should fail is if there's a logic error in the code.
On Mon, Nov 14, 2016 at 5:21 PM, Durham Goode <durham@fb.com> wrote: > > > On 11/14/16 4:48 PM, Martin von Zweigbergk wrote: >> >> On Mon, Nov 14, 2016 at 4:26 PM, Durham Goode <durham@fb.com> wrote: >>> >>> On 11/14/16 4:23 PM, Martin von Zweigbergk wrote: >>>> >>>> On Mon, Nov 14, 2016 at 3:27 PM, Durham Goode <durham@fb.com> wrote: >>>>> >>>>> # HG changeset patch >>>>> # User Durham Goode <durham@fb.com> >>>>> # Date 1479165447 28800 >>>>> # Mon Nov 14 15:17:27 2016 -0800 >>>>> # Node ID 27209d52a5865422c5ef4ba05cedb28ce32919ed >>>>> # Parent 046a7e828ea63ec940ffae1089a33fae7954da2e >>>>> manifest: make revlog verification optional >>>>> >>>>> This patches adds an parameter to manifestlog.get() to disable hash >>>>> checking. >>>>> This will be used in an upcoming patch to support treemanifestctx >>>>> reading >>>>> sub-trees without loading them from the revlog. (This is already >>>>> supported but >>>>> does not go through the manifestlog.get() code path) >>>>> >>>>> diff --git a/mercurial/manifest.py b/mercurial/manifest.py >>>>> --- a/mercurial/manifest.py >>>>> +++ b/mercurial/manifest.py >>>>> @@ -1278,9 +1278,12 @@ class manifestlog(object): >>>>> """ >>>>> return self.get('', node) >>>>> >>>>> - def get(self, dir, node): >>>>> + def get(self, dir, node, verify=True): >>>>> """Retrieves the manifest instance for the given node. >>>>> Throws a >>>>> LookupError if not found. >>>>> + >>>>> + `verify` - if True an exception will be thrown if the node is >>>>> not in >>>>> + the revlog >>>> >>>> Patch 2/2 says "Set verify to False since we need to be able to create >>>> subtrees for trees that don't exist on disk yet.", which makes it >>>> sounds like it's only about reading empty revlogs, which should be >>>> safe anyway. What am I missing? >>> >>> "yet" was probably the wrong word there. This logic is to support the >>> test >>> case where the test moves some of the revlogs out of the way and expects >>> the >>> treemanifest to still work because those revlogs are outside the narrow >>> spec. So this verification needs to be turned off so the ctx can be >>> created >>> (and it just so happens that the contents of the ctx are never accessed). >> >> Ah, I see. Another question: When do we want to verify? Tests seem to >> pass even if I always turn of verification (defaulted it to False). >> > I think we want to verify all the time, since it's good to fail early if > someone requests a manifest that doesn't exist. The tests don't fail > because the only time verification should fail is if there's a logic error > in the code. As we talked about on IRC, we could also never verify. If we did, it would fail at .read() time instead. I'm fine with either, so I've queued this. I was initially concerned that it would cause reading of one level of dirlogs too much, but that was before I understood what your call with verify=False was about. I now understand that this is about verifying only the root manifest. Thanks for fixing narrowhg for us!
On Mon, Nov 14, 2016 at 3:27 PM, Durham Goode <durham@fb.com> wrote: > # HG changeset patch > # User Durham Goode <durham@fb.com> > # Date 1479165447 28800 > # Mon Nov 14 15:17:27 2016 -0800 > # Node ID 27209d52a5865422c5ef4ba05cedb28ce32919ed > # Parent 046a7e828ea63ec940ffae1089a33fae7954da2e > manifest: make revlog verification optional > > This patches adds an parameter to manifestlog.get() to disable hash > checking. > This will be used in an upcoming patch to support treemanifestctx reading > sub-trees without loading them from the revlog. (This is already supported > but > does not go through the manifestlog.get() code path) > I could leverage this on the base revlog class because `hg debugupgraderepo` can spend >50% of its CPU time doing SHA-1 verification (most of that when converting manifests - there are tens of gigabytes of raw manifest text that needs to be hashed when converting the revlog). That could be a follow-up, of course. > > diff --git a/mercurial/manifest.py b/mercurial/manifest.py > --- a/mercurial/manifest.py > +++ b/mercurial/manifest.py > @@ -1278,9 +1278,12 @@ class manifestlog(object): > """ > return self.get('', node) > > - def get(self, dir, node): > + def get(self, dir, node, verify=True): > """Retrieves the manifest instance for the given node. Throws a > LookupError if not found. > + > + `verify` - if True an exception will be thrown if the node is not > in > + the revlog > """ > if node in self._dirmancache.get(dir, ()): > cachemf = self._dirmancache[dir][node] > @@ -1292,19 +1295,21 @@ class manifestlog(object): > > if dir: > if self._revlog._treeondisk: > - dirlog = self._revlog.dirlog(dir) > - if node not in dirlog.nodemap: > - raise LookupError(node, dirlog.indexfile, > - _('no node')) > + if verify: > + dirlog = self._revlog.dirlog(dir) > + if node not in dirlog.nodemap: > + raise LookupError(node, dirlog.indexfile, > + _('no node')) > m = treemanifestctx(self._repo, dir, node) > else: > raise error.Abort( > _("cannot ask for manifest directory '%s' in a > flat " > "manifest") % dir) > else: > - if node not in self._revlog.nodemap: > - raise LookupError(node, self._revlog.indexfile, > - _('no node')) > + if verify: > + if node not in self._revlog.nodemap: > + raise LookupError(node, self._revlog.indexfile, > + _('no node')) > if self._treeinmem: > m = treemanifestctx(self._repo, '', node) > else: > _______________________________________________ > Mercurial-devel mailing list > Mercurial-devel@mercurial-scm.org > https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel >
On Tue, Nov 15, 2016 at 9:59 AM, Gregory Szorc <gregory.szorc@gmail.com> wrote: > On Mon, Nov 14, 2016 at 3:27 PM, Durham Goode <durham@fb.com> wrote: >> >> # HG changeset patch >> # User Durham Goode <durham@fb.com> >> # Date 1479165447 28800 >> # Mon Nov 14 15:17:27 2016 -0800 >> # Node ID 27209d52a5865422c5ef4ba05cedb28ce32919ed >> # Parent 046a7e828ea63ec940ffae1089a33fae7954da2e >> manifest: make revlog verification optional >> >> This patches adds an parameter to manifestlog.get() to disable hash >> checking. >> This will be used in an upcoming patch to support treemanifestctx reading >> sub-trees without loading them from the revlog. (This is already supported >> but >> does not go through the manifestlog.get() code path) > > > I could leverage this on the base revlog class because `hg debugupgraderepo` > can spend >50% of its CPU time doing SHA-1 verification (most of that when > converting manifests - there are tens of gigabytes of raw manifest text that > needs to be hashed when converting the revlog). That could be a follow-up, > of course. I suspect you thought the "revlog verification" was about hash verification. So did I when read the subject line. I was confused why that would be relevant, but then I looked at the patch and forgot about that. It's queued already, so it doesn't seem worth updating it at this point. > >> >> >> diff --git a/mercurial/manifest.py b/mercurial/manifest.py >> --- a/mercurial/manifest.py >> +++ b/mercurial/manifest.py >> @@ -1278,9 +1278,12 @@ class manifestlog(object): >> """ >> return self.get('', node) >> >> - def get(self, dir, node): >> + def get(self, dir, node, verify=True): >> """Retrieves the manifest instance for the given node. Throws a >> LookupError if not found. >> + >> + `verify` - if True an exception will be thrown if the node is not >> in >> + the revlog >> """ >> if node in self._dirmancache.get(dir, ()): >> cachemf = self._dirmancache[dir][node] >> @@ -1292,19 +1295,21 @@ class manifestlog(object): >> >> if dir: >> if self._revlog._treeondisk: >> - dirlog = self._revlog.dirlog(dir) >> - if node not in dirlog.nodemap: >> - raise LookupError(node, dirlog.indexfile, >> - _('no node')) >> + if verify: >> + dirlog = self._revlog.dirlog(dir) >> + if node not in dirlog.nodemap: >> + raise LookupError(node, dirlog.indexfile, >> + _('no node')) >> m = treemanifestctx(self._repo, dir, node) >> else: >> raise error.Abort( >> _("cannot ask for manifest directory '%s' in a >> flat " >> "manifest") % dir) >> else: >> - if node not in self._revlog.nodemap: >> - raise LookupError(node, self._revlog.indexfile, >> - _('no node')) >> + if verify: >> + if node not in self._revlog.nodemap: >> + raise LookupError(node, self._revlog.indexfile, >> + _('no node')) >> if self._treeinmem: >> m = treemanifestctx(self._repo, '', node) >> else: >> _______________________________________________ >> Mercurial-devel mailing list >> Mercurial-devel@mercurial-scm.org >> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel > > > > _______________________________________________ > Mercurial-devel mailing list > Mercurial-devel@mercurial-scm.org > https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel >
Patch
diff --git a/mercurial/manifest.py b/mercurial/manifest.py --- a/mercurial/manifest.py +++ b/mercurial/manifest.py @@ -1278,9 +1278,12 @@ class manifestlog(object): """ return self.get('', node) - def get(self, dir, node): + def get(self, dir, node, verify=True): """Retrieves the manifest instance for the given node. Throws a LookupError if not found. + + `verify` - if True an exception will be thrown if the node is not in + the revlog """ if node in self._dirmancache.get(dir, ()): cachemf = self._dirmancache[dir][node] @@ -1292,19 +1295,21 @@ class manifestlog(object): if dir: if self._revlog._treeondisk: - dirlog = self._revlog.dirlog(dir) - if node not in dirlog.nodemap: - raise LookupError(node, dirlog.indexfile, - _('no node')) + if verify: + dirlog = self._revlog.dirlog(dir) + if node not in dirlog.nodemap: + raise LookupError(node, dirlog.indexfile, + _('no node')) m = treemanifestctx(self._repo, dir, node) else: raise error.Abort( _("cannot ask for manifest directory '%s' in a flat " "manifest") % dir) else: - if node not in self._revlog.nodemap: - raise LookupError(node, self._revlog.indexfile, - _('no node')) + if verify: + if node not in self._revlog.nodemap: + raise LookupError(node, self._revlog.indexfile, + _('no node')) if self._treeinmem: m = treemanifestctx(self._repo, '', node) else: