Patchwork D11248: store: return just one filename in walk functions

login
register
mail settings
Submitter phabricator
Date Aug. 4, 2021, 8:43 p.m.
Message ID <differential-rev-PHID-DREV-ixici5elkqmixkkvdwvq-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/49566/
State Superseded
Headers show

Comments

phabricator - Aug. 4, 2021, 8:43 p.m.
valentin.gatienbaron created this revision.
Herald added a reviewer: durin42.
Herald added a reviewer: martinvonz.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  Various walk functions return `(revlog_type, decoded, encoded)` where
  decoded could be None. But no-one cares about `encoded` and expects
  `unencoded` to be present, except verify (because this can only happen
  with old repo formats).
  
  Simplify all this by either failing outright if a decoding a filename
  fails (instead of almost certainly failing with a type error due to
  treating None as a bytes), or skipping the filename but providing in
  an out argument for hg verify.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D11248

AFFECTED FILES
  hgext/narrow/narrowcommands.py
  hgext/remotefilelog/contentstore.py
  hgext/remotefilelog/remotefilelogserver.py
  mercurial/repair.py
  mercurial/store.py
  mercurial/streamclone.py
  mercurial/upgrade_utils/engine.py
  mercurial/verify.py
  mercurial/wireprotov2server.py
  tests/simplestorerepo.py

CHANGE DETAILS




To: valentin.gatienbaron, durin42, martinvonz, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/tests/simplestorerepo.py b/tests/simplestorerepo.py
--- a/tests/simplestorerepo.py
+++ b/tests/simplestorerepo.py
@@ -665,20 +665,24 @@ 
 
 
 class simplestore(store.encodedstore):
-    def datafiles(self):
+    def datafiles(self, undecodable=None):
         for x in super(simplestore, self).datafiles():
             yield x
 
         # Supplement with non-revlog files.
         extrafiles = self._walk('data', True, filefilter=issimplestorefile)
 
-        for unencoded, encoded, size in extrafiles:
+        for f1, size in extrafiles:
             try:
-                unencoded = store.decodefilename(unencoded)
+                f2 = store.decodefilename(f1)
             except KeyError:
-                unencoded = None
-
-            yield unencoded, encoded, size
+                if undecodable is None:
+                    raise error.StorageError(b'undecodable revlog name %s' % a1)
+                else:
+                    undecodable.append(a1)
+                    continue
+
+            yield f2, size
 
 
 def reposetup(ui, repo):
diff --git a/mercurial/wireprotov2server.py b/mercurial/wireprotov2server.py
--- a/mercurial/wireprotov2server.py
+++ b/mercurial/wireprotov2server.py
@@ -1579,7 +1579,7 @@ 
 
     # TODO this is a bunch of storage layer interface abstractions because
     # it assumes revlogs.
-    for rl_type, name, encodedname, size in topfiles:
+    for rl_type, name, size in topfiles:
         # XXX use the `rl_type` for that
         if b'changelog' in files and name.startswith(b'00changelog'):
             pass
diff --git a/mercurial/verify.py b/mercurial/verify.py
--- a/mercurial/verify.py
+++ b/mercurial/verify.py
@@ -395,12 +395,13 @@ 
             storefiles = set()
             subdirs = set()
             revlogv1 = self.revlogv1
-            for t, f, f2, size in repo.store.datafiles():
-                if not f:
-                    self._err(None, _(b"cannot decode filename '%s'") % f2)
-                elif (size > 0 or not revlogv1) and f.startswith(b'meta/'):
+            undecodable = []
+            for t, f, size in repo.store.datafiles(undecodable=undecodable):
+                if (size > 0 or not revlogv1) and f.startswith(b'meta/'):
                     storefiles.add(_normpath(f))
                     subdirs.add(os.path.dirname(f))
+            for f in undecodable:
+                self._err(None, _(b"cannot decode filename '%s'") % f)
             subdirprogress = ui.makeprogress(
                 _(b'checking'), unit=_(b'manifests'), total=len(subdirs)
             )
@@ -459,11 +460,12 @@ 
         ui.status(_(b"checking files\n"))
 
         storefiles = set()
-        for rl_type, f, f2, size in repo.store.datafiles():
-            if not f:
-                self._err(None, _(b"cannot decode filename '%s'") % f2)
-            elif (size > 0 or not revlogv1) and f.startswith(b'data/'):
+        undecodable = []
+        for t, f, size in repo.store.datafiles(undecodable=undecodable):
+            if (size > 0 or not revlogv1) and f.startswith(b'data/'):
                 storefiles.add(_normpath(f))
+        for f in undecodable:
+            self._err(None, _(b"cannot decode filename '%s'") % f)
 
         state = {
             # TODO this assumes revlog storage for changelog.
diff --git a/mercurial/upgrade_utils/engine.py b/mercurial/upgrade_utils/engine.py
--- a/mercurial/upgrade_utils/engine.py
+++ b/mercurial/upgrade_utils/engine.py
@@ -201,7 +201,7 @@ 
 
     # Perform a pass to collect metadata. This validates we can open all
     # source files and allows a unified progress bar to be displayed.
-    for rl_type, unencoded, encoded, size in alldatafiles:
+    for rl_type, unencoded, size in alldatafiles:
         if not rl_type & store.FILEFLAGS_REVLOG_MAIN:
             continue
 
diff --git a/mercurial/streamclone.py b/mercurial/streamclone.py
--- a/mercurial/streamclone.py
+++ b/mercurial/streamclone.py
@@ -248,7 +248,7 @@ 
     # Get consistent snapshot of repo, lock during scan.
     with repo.lock():
         repo.ui.debug(b'scanning\n')
-        for file_type, name, ename, size in _walkstreamfiles(repo):
+        for file_type, name, size in _walkstreamfiles(repo):
             if size:
                 entries.append((name, size))
                 total_bytes += size
@@ -640,7 +640,7 @@ 
     if includes or excludes:
         matcher = narrowspec.match(repo.root, includes, excludes)
 
-    for rl_type, name, ename, size in _walkstreamfiles(repo, matcher):
+    for rl_type, name, size in _walkstreamfiles(repo, matcher):
         if size:
             ft = _fileappend
             if rl_type & store.FILEFLAGS_VOLATILE:
diff --git a/mercurial/store.py b/mercurial/store.py
--- a/mercurial/store.py
+++ b/mercurial/store.py
@@ -472,7 +472,7 @@ 
         return self.path + b'/' + encodedir(f)
 
     def _walk(self, relpath, recurse):
-        '''yields (unencoded, encoded, size)'''
+        '''yields (revlog_type, unencoded, size)'''
         path = self.path
         if relpath:
             path += b'/' + relpath
@@ -488,7 +488,7 @@ 
                     rl_type = is_revlog(f, kind, st)
                     if rl_type is not None:
                         n = util.pconvert(fp[striplen:])
-                        l.append((rl_type, decodedir(n), n, st.st_size))
+                        l.append((rl_type, decodedir(n), st.st_size))
                     elif kind == stat.S_IFDIR and recurse:
                         visit.append(fp)
         l.sort()
@@ -505,26 +505,32 @@ 
         rootstore = manifest.manifestrevlog(repo.nodeconstants, self.vfs)
         return manifest.manifestlog(self.vfs, repo, rootstore, storenarrowmatch)
 
-    def datafiles(self, matcher=None):
+    def datafiles(self, matcher=None, undecodable=None):
+        """Like walk, but excluding the changelog and root manifest.
+
+        When [undecodable] is None, revlogs names that can't be
+        decoded cause an exception. When it is provided, it should
+        be a list and the filenames that can't be decoded are added
+        to it instead. This is very rarely needed."""
         files = self._walk(b'data', True) + self._walk(b'meta', True)
-        for (t, u, e, s) in files:
-            yield (FILEFLAGS_FILELOG | t, u, e, s)
+        for (t, u, s) in files:
+            yield (FILEFLAGS_FILELOG | t, u, s)
 
     def topfiles(self):
         # yield manifest before changelog
         files = reversed(self._walk(b'', False))
-        for (t, u, e, s) in files:
+        for (t, u, s) in files:
             if u.startswith(b'00changelog'):
-                yield (FILEFLAGS_CHANGELOG | t, u, e, s)
+                yield (FILEFLAGS_CHANGELOG | t, u, s)
             elif u.startswith(b'00manifest'):
-                yield (FILEFLAGS_MANIFESTLOG | t, u, e, s)
+                yield (FILEFLAGS_MANIFESTLOG | t, u, s)
             else:
-                yield (FILETYPE_OTHER | t, u, e, s)
+                yield (FILETYPE_OTHER | t, u, s)
 
     def walk(self, matcher=None):
         """return file related to data storage (ie: revlogs)
 
-        yields (file_type, unencoded, encoded, size)
+        yields (file_type, unencoded, size)
 
         if a matcher is passed, storage files of only those tracked paths
         are passed with matches the matcher
@@ -569,15 +575,19 @@ 
         self.vfs = vfsmod.filtervfs(vfs, encodefilename)
         self.opener = self.vfs
 
-    def datafiles(self, matcher=None):
-        for t, a, b, size in super(encodedstore, self).datafiles():
+    def datafiles(self, matcher=None, undecodable=None):
+        for t, f1, size in super(encodedstore, self).datafiles():
             try:
-                a = decodefilename(a)
+                f2 = decodefilename(f1)
             except KeyError:
-                a = None
-            if a is not None and not _matchtrackedpath(a, matcher):
+                if undecodable is None:
+                    raise error.StorageError(b'undecodable revlog name %s' % f1)
+                else:
+                    undecodable.append(f1)
+                    continue
+            if not _matchtrackedpath(f2, matcher):
                 continue
-            yield t, a, b, size
+            yield t, f2, size
 
     def join(self, f):
         return self.path + b'/' + encodefilename(f)
@@ -765,7 +775,7 @@ 
     def getsize(self, path):
         return self.rawvfs.stat(path).st_size
 
-    def datafiles(self, matcher=None):
+    def datafiles(self, matcher=None, undecodable=None):
         for f in sorted(self.fncache):
             if not _matchtrackedpath(f, matcher):
                 continue
@@ -774,7 +784,7 @@ 
                 t = revlog_type(f)
                 assert t is not None, f
                 t |= FILEFLAGS_FILELOG
-                yield t, f, ef, self.getsize(ef)
+                yield t, f, self.getsize(ef)
             except OSError as err:
                 if err.errno != errno.ENOENT:
                     raise
diff --git a/mercurial/repair.py b/mercurial/repair.py
--- a/mercurial/repair.py
+++ b/mercurial/repair.py
@@ -433,7 +433,7 @@ 
     if scmutil.istreemanifest(repo):
         # This logic is safe if treemanifest isn't enabled, but also
         # pointless, so we skip it if treemanifest isn't enabled.
-        for t, unencoded, encoded, size in repo.store.datafiles():
+        for t, unencoded, size in repo.store.datafiles():
             if unencoded.startswith(b'meta/') and unencoded.endswith(
                 b'00manifest.i'
             ):
diff --git a/hgext/remotefilelog/remotefilelogserver.py b/hgext/remotefilelog/remotefilelogserver.py
--- a/hgext/remotefilelog/remotefilelogserver.py
+++ b/hgext/remotefilelog/remotefilelogserver.py
@@ -166,24 +166,24 @@ 
                                 n = util.pconvert(fp[striplen:])
                                 d = store.decodedir(n)
                                 t = store.FILETYPE_OTHER
-                                yield (t, d, n, st.st_size)
+                                yield (t, d, st.st_size)
                         if kind == stat.S_IFDIR:
                             visit.append(fp)
 
             if scmutil.istreemanifest(repo):
-                for (t, u, e, s) in repo.store.datafiles():
+                for (t, u, s) in repo.store.datafiles():
                     if u.startswith(b'meta/') and (
                         u.endswith(b'.i') or u.endswith(b'.d')
                     ):
-                        yield (t, u, e, s)
+                        yield (t, u, s)
 
             # Return .d and .i files that do not match the shallow pattern
             match = state.match
             if match and not match.always():
-                for (t, u, e, s) in repo.store.datafiles():
+                for (t, u, s) in repo.store.datafiles():
                     f = u[5:-2]  # trim data/...  and .i/.d
                     if not state.match(f):
-                        yield (t, u, e, s)
+                        yield (t, u, s)
 
             for x in repo.store.topfiles():
                 if state.noflatmf and x[1][:11] == b'00manifest.':
diff --git a/hgext/remotefilelog/contentstore.py b/hgext/remotefilelog/contentstore.py
--- a/hgext/remotefilelog/contentstore.py
+++ b/hgext/remotefilelog/contentstore.py
@@ -378,7 +378,7 @@ 
             ledger.markdataentry(self, treename, node)
             ledger.markhistoryentry(self, treename, node)
 
-        for t, path, encoded, size in self._store.datafiles():
+        for t, path, size in self._store.datafiles():
             if path[:5] != b'meta/' or path[-2:] != b'.i':
                 continue
 
diff --git a/hgext/narrow/narrowcommands.py b/hgext/narrow/narrowcommands.py
--- a/hgext/narrow/narrowcommands.py
+++ b/hgext/narrow/narrowcommands.py
@@ -289,7 +289,7 @@ 
                 repair.strip(ui, unfi, tostrip, topic=b'narrow', backup=backup)
 
         todelete = []
-        for t, f, f2, size in repo.store.datafiles():
+        for t, f, size in repo.store.datafiles():
             if f.startswith(b'data/'):
                 file = f[5:-2]
                 if not newmatch(file):