Patchwork D11794: status: use filesystem time boundary to invalidate racy files

login
register
mail settings
Submitter phabricator
Date Nov. 24, 2021, 11:16 a.m.
Message ID <differential-rev-PHID-DREV-mzrql2ltdemm3csdqgfc-req@mercurial-scm.org>
Download mbox | patch
Permalink /patch/50125/
State New
Headers show

Comments

phabricator - Nov. 24, 2021, 11:16 a.m.
marmoute created this revision.
Herald added a reviewer: hg-reviewers.
Herald added a subscriber: mercurial-patches.

REVISION SUMMARY
  We take the filesystem time at the start of the status walk and use that as
  a boundary to detect files that have been modified in the same second as the
  start or after the start. Nanosecond precision will come in a later patch.
  
  To cope with "broken" time on the file system where file could be in the future,
  we also keep mtime for file over one file in the future. See inline comment for
  details.
  
  Large file tests get a bit more confused as we reduce the odds for race condition
  
  The win32text extension is happy again.

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D11794

AFFECTED FILES
  hgext/largefiles/lfutil.py
  hgext/largefiles/overrides.py
  hgext/largefiles/reposetup.py
  hgext/remotefilelog/__init__.py
  mercurial/context.py
  mercurial/dirstate.py
  mercurial/narrowspec.py
  tests/test-dirstate-race.t

CHANGE DETAILS




To: marmoute, #hg-reviewers
Cc: mercurial-patches, mercurial-devel

Patch

diff --git a/tests/test-dirstate-race.t b/tests/test-dirstate-race.t
--- a/tests/test-dirstate-race.t
+++ b/tests/test-dirstate-race.t
@@ -66,11 +66,11 @@ 
   > )
   > def extsetup(ui):
   >     extensions.wrapfunction(context.workingctx, '_checklookup', overridechecklookup)
-  > def overridechecklookup(orig, self, files):
+  > def overridechecklookup(orig, self, *args, **kwargs):
   >     # make an update that changes the dirstate from underneath
   >     self._repo.ui.system(br"sh '$TESTTMP/dirstaterace.sh'",
   >                          cwd=self._repo.root)
-  >     return orig(self, files)
+  >     return orig(self, *args, **kwargs)
   > EOF
 
   $ hg debugrebuilddirstate
diff --git a/mercurial/narrowspec.py b/mercurial/narrowspec.py
--- a/mercurial/narrowspec.py
+++ b/mercurial/narrowspec.py
@@ -323,7 +323,7 @@ 
     removedmatch = matchmod.differencematcher(oldmatch, newmatch)
 
     ds = repo.dirstate
-    lookup, status = ds.status(
+    lookup, status, _mtime_boundary = ds.status(
         removedmatch, subrepos=[], ignored=True, clean=True, unknown=True
     )
     trackeddirty = status.modified + status.added
diff --git a/mercurial/dirstate.py b/mercurial/dirstate.py
--- a/mercurial/dirstate.py
+++ b/mercurial/dirstate.py
@@ -1308,11 +1308,20 @@ 
             # Some matchers have yet to be implemented
             use_rust = False
 
+        # Get the time from the filesystem so we can disambiguate files that
+        # appear modified in the present or future.
+        try:
+            mtime_boundary = timestamp.get_fs_now(self._opener)
+        except OSError:
+            # In largefiles or readonly context
+            mtime_boundary = None
+
         if use_rust:
             try:
-                return self._rust_status(
+                res = self._rust_status(
                     match, listclean, listignored, listunknown
                 )
+                return res + (mtime_boundary,)
             except rustmod.FallbackError:
                 pass
 
@@ -1402,7 +1411,7 @@ 
         status = scmutil.status(
             modified, added, removed, deleted, unknown, ignored, clean
         )
-        return (lookup, status)
+        return (lookup, status, mtime_boundary)
 
     def matches(self, match):
         """
diff --git a/mercurial/context.py b/mercurial/context.py
--- a/mercurial/context.py
+++ b/mercurial/context.py
@@ -1796,13 +1796,14 @@ 
             sane.append(f)
         return sane
 
-    def _checklookup(self, files):
+    def _checklookup(self, files, mtime_boundary):
         # check for any possibly clean files
         if not files:
-            return [], [], []
+            return [], [], [], []
 
         modified = []
         deleted = []
+        clean = []
         fixup = []
         pctx = self._parents[0]
         # do a full compare of any files that might have changed
@@ -1816,21 +1817,34 @@ 
                     or pctx[f].cmp(self[f])
                 ):
                     modified.append(f)
+                elif mtime_boundary is None:
+                    clean.append(f)
                 else:
-                    # XXX not that we have a race windows here since we gather
-                    # the stats after we compared so the file might have
-                    # changed.
-                    #
-                    # However this have been the case because and the
-                    # refactoring moving the code here is improving the
-                    # situation by narrowing the race and moving the two steps
-                    # (comparison + stat) in the same location. So making this
-                    # "correct" is now possible.
                     s = self[f].lstat()
                     mode = s.st_mode
                     size = s.st_size
-                    mtime = timestamp.mtime_of(s)
-                    fixup.append((f, (mode, size, mtime)))
+                    file_mtime = timestamp.mtime_of(s)
+                    cache_info = (mode, size, file_mtime)
+
+                    file_second = file_mtime[0]
+                    boundary_second = mtime_boundary[0]
+                    # If the mtime of the ambiguous file is younger (or equal)
+                    # to the starting point of the `status` walk, we cannot
+                    # garantee that another, racy, write will happens right
+                    # after with the same mtime and we cannot cache the information.
+                    #
+                    # However is the mtime is far away in the future, this is
+                    # likely some mismatch between the current clock and
+                    # previous file system write. So mtime more than one days
+                    # in the future are considered fine.
+                    if (
+                        boundary_second
+                        <= file_second
+                        < (3600 * 24 + boundary_second)
+                    ):
+                        clean.append(f)
+                    else:
+                        fixup.append((f, cache_info))
             except (IOError, OSError):
                 # A file become inaccessible in between? Mark it as deleted,
                 # matching dirstate behavior (issue5584).
@@ -1840,7 +1854,7 @@ 
                 # it's in the dirstate.
                 deleted.append(f)
 
-        return modified, deleted, fixup
+        return modified, deleted, clean, fixup
 
     def _poststatusfixup(self, status, fixup):
         """update dirstate for files that are actually clean"""
@@ -1894,17 +1908,21 @@ 
         subrepos = []
         if b'.hgsub' in self:
             subrepos = sorted(self.substate)
-        cmp, s = self._repo.dirstate.status(
+        cmp, s, mtime_boundary = self._repo.dirstate.status(
             match, subrepos, ignored=ignored, clean=clean, unknown=unknown
         )
 
         # check for any possibly clean files
         fixup = []
         if cmp:
-            modified2, deleted2, fixup = self._checklookup(cmp)
+            modified2, deleted2, clean_set, fixup = self._checklookup(
+                cmp, mtime_boundary
+            )
             s.modified.extend(modified2)
             s.deleted.extend(deleted2)
 
+            if clean_set and clean:
+                s.clean.extend(clean_set)
             if fixup and clean:
                 s.clean.extend((f for f, _ in fixup))
 
diff --git a/hgext/remotefilelog/__init__.py b/hgext/remotefilelog/__init__.py
--- a/hgext/remotefilelog/__init__.py
+++ b/hgext/remotefilelog/__init__.py
@@ -520,7 +520,7 @@ 
 
 
 # Prefetch files before status attempts to look at their size and contents
-def checklookup(orig, self, files):
+def checklookup(orig, self, files, mtime_boundary):
     repo = self._repo
     if isenabled(repo):
         prefetchfiles = []
@@ -530,7 +530,7 @@ 
                     prefetchfiles.append((f, hex(parent.filenode(f))))
         # batch fetch the needed files from the server
         repo.fileservice.prefetch(prefetchfiles)
-    return orig(self, files)
+    return orig(self, files, mtime_boundary)
 
 
 # Prefetch the logic that compares added and removed files for renames
diff --git a/hgext/largefiles/reposetup.py b/hgext/largefiles/reposetup.py
--- a/hgext/largefiles/reposetup.py
+++ b/hgext/largefiles/reposetup.py
@@ -197,7 +197,7 @@ 
                     match._files = [f for f in match._files if sfindirstate(f)]
                     # Don't waste time getting the ignored and unknown
                     # files from lfdirstate
-                    unsure, s = lfdirstate.status(
+                    unsure, s, mtime_boundary = lfdirstate.status(
                         match,
                         subrepos=[],
                         ignored=False,
@@ -230,6 +230,10 @@ 
                                 size = s.st_size
                                 mtime = timestamp.mtime_of(s)
                                 cache_data = (mode, size, mtime)
+                                # We should consider using the mtime_boundary
+                                # logic here, but largefile never actually had
+                                # ambiguity protection before, so this confuse
+                                # the tests and need more thinking.
                                 lfdirstate.set_clean(lfile, cache_data)
                     else:
                         tocheck = unsure + modified + added + clean
diff --git a/hgext/largefiles/overrides.py b/hgext/largefiles/overrides.py
--- a/hgext/largefiles/overrides.py
+++ b/hgext/largefiles/overrides.py
@@ -1519,7 +1519,7 @@ 
         return orig(repo, matcher, prefix, uipathfn, opts)
     # Get the list of missing largefiles so we can remove them
     lfdirstate = lfutil.openlfdirstate(repo.ui, repo)
-    unsure, s = lfdirstate.status(
+    unsure, s, mtime_boundary = lfdirstate.status(
         matchmod.always(),
         subrepos=[],
         ignored=False,
@@ -1746,7 +1746,7 @@ 
         # (*1) deprecated, but used internally (e.g: "rebase --collapse")
 
         lfdirstate = lfutil.openlfdirstate(repo.ui, repo)
-        unsure, s = lfdirstate.status(
+        unsure, s, mtime_boundary = lfdirstate.status(
             matchmod.always(),
             subrepos=[],
             ignored=False,
diff --git a/hgext/largefiles/lfutil.py b/hgext/largefiles/lfutil.py
--- a/hgext/largefiles/lfutil.py
+++ b/hgext/largefiles/lfutil.py
@@ -244,7 +244,7 @@ 
 def lfdirstatestatus(lfdirstate, repo):
     pctx = repo[b'.']
     match = matchmod.always()
-    unsure, s = lfdirstate.status(
+    unsure, s, mtime_boundary = lfdirstate.status(
         match, subrepos=[], ignored=False, clean=False, unknown=False
     )
     modified, clean = s.modified, s.clean
@@ -263,6 +263,8 @@ 
             size = st.st_size
             mtime = timestamp.mtime_of(st)
             cache_data = (mode, size, mtime)
+            # We should consider using the mtime_boundary logic here,
+            # but largefile never actually had ambiguity protection before.
             lfdirstate.set_clean(lfile, cache_data)
     return s
 
@@ -670,7 +672,7 @@ 
         # large.
         lfdirstate = openlfdirstate(ui, repo)
         dirtymatch = matchmod.always()
-        unsure, s = lfdirstate.status(
+        unsure, s, mtime_boundary = lfdirstate.status(
             dirtymatch, subrepos=[], ignored=False, clean=False, unknown=False
         )
         modifiedfiles = unsure + s.modified + s.added + s.removed