Patchwork [1,of,2] fixed compare filelog rev with file content, besides meta-info content

login
register
mail settings
Submitter alexrayne
Date Sept. 25, 2021, 8:15 p.m.
Message ID <66d9efd861e40c31ca99.1632600931@black2>
Download mbox | patch
Permalink /patch/49817/
State Superseded
Headers show

Comments

alexrayne - Sept. 25, 2021, 8:15 p.m.
# HG changeset patch
# User alexrayne <alexraynepe196@gmail.com>
# Date 1632600142 -10800
#      Sat Sep 25 23:02:22 2021 +0300
# Branch stable
# Node ID 66d9efd861e40c31ca99746ff7d7d432b7d8d1f0
# Parent  faeb0ef5079e5faeac35283873aab02c84b5cdb2
fixed compare filelog rev with file content, besides meta-info content.

* if the meta-info of files can vary, comparison of such revs' text-body becomes non-trivial.
  Status detection of files in the WC relies on this comparison. So here we provide
  a compare that definitely ignores meta and gives the right comparison of the text-body.

* filelog.size() now returns rev textbody len, without meta-info

* provide a storageutil.filerev_content handle intended to speed up access to
  filerev text and meta. filelog caches rev info for size and meta access

* filelog.meta(node) provides easy access to rev meta-info via filerev_content
Joerg Sonnenberger - Sept. 25, 2021, 9:42 p.m.
> * filelog.size() now returns rev textbody len, without meta-info

So, have you made sure this doesn't create a huge performance
regression? The whole point of the original logic is to try to avoid a full
revision resolution as often as possible.

Joerg

Patch

# HG changeset patch
# User alexrayne <alexraynepe196@gmail.com>
# Date 1632600142 -10800
#      Sat Sep 25 23:02:22 2021 +0300
# Branch stable
# Node ID 66d9efd861e40c31ca99746ff7d7d432b7d8d1f0
# Parent  faeb0ef5079e5faeac35283873aab02c84b5cdb2
fixed compare filelog rev with file content, besides meta-info content.

* if the meta-info of files can vary, comparison of such revs' text-body becomes non-trivial.
  Status detection of files in the WC relies on this comparison. So here we provide
  a compare that definitely ignores meta and gives the right comparison of the text-body.

* filelog.size() now returns rev textbody len, without meta-info

* provide a storageutil.filerev_content handle intended to speed up access to
  filerev text and meta. filelog caches rev info for size and meta access

* filelog.meta(node) provides easy access to rev meta-info via filerev_content

diff --git a/mercurial/context.py b/mercurial/context.py
--- a/mercurial/context.py
+++ b/mercurial/context.py
@@ -980,14 +980,9 @@ 
             )
 
         if fctx._filenode is None:
-            if self._repo._encodefilterpats:
-                # can't rely on size() because wdir content may be decoded
-                return self._filelog.cmp(self._filenode, fctx.data())
-            if self.size() - 4 == fctx.size():
-                # size() can match:
-                # if file data starts with '\1\n', empty metadata block is
-                # prepended, which adds 4 bytes to filelog.size().
-                return self._filelog.cmp(self._filenode, fctx.data())
+            # since fctx have no metadata yet we compare file-content only
+            return self._filelog.cmp(self._filenode, fctx.data())
+            
         if self.size() == fctx.size() or self.flags() == b'l':
             # size() matches: need to compare content
             # issue6456: Always compare symlinks because size can represent
diff --git a/mercurial/filelog.py b/mercurial/filelog.py
--- a/mercurial/filelog.py
+++ b/mercurial/filelog.py
@@ -40,12 +40,26 @@ 
         self.nullid = self._revlog.nullid
         opts = opener.options
         self._fix_issue6528 = opts.get(b'issue6528.fix-incoming', True)
+        self._info_cache = {}
 
     def __len__(self):
         return len(self._revlog)
 
     def __iter__(self):
         return self._revlog.__iter__()
+    
+    def _revinfo(self, node):
+        if node in self._info_cache:
+            return self._info_cache[node]
+        info = storageutil.filerev_content(self._revlog, node)
+        self._info_cache[node] = info
+        return info
+    
+    def _revinfo_invalidate(self, node = None):
+        if node:
+            del self._info_cache[node]
+        else:
+            self._info_cache = {}
 
     def hasnode(self, node):
         if node in (self.nullid, nullrev):
@@ -177,16 +191,21 @@ 
         return self._revlog.getstrippoint(minlink)
 
     def strip(self, minlink, transaction):
+        self._revinfo_invalidate()
         return self._revlog.strip(minlink, transaction)
 
     def censorrevision(self, tr, node, tombstone=b''):
+        self._revinfo_invalidate(node)
         return self._revlog.censorrevision(tr, node, tombstone=tombstone)
 
     def files(self):
         return self._revlog.files()
 
     def read(self, node):
-        return storageutil.filtermetadata(self.revision(node))
+        return self._revinfo(node).text()
+    
+    def meta(self, node):
+        return self._revinfo(node).meta
 
     def add(self, text, meta, transaction, link, p1=None, p2=None):
         if meta or text.startswith(b'\1\n'):
@@ -195,27 +214,22 @@ 
         return self.node(rev)
 
     def renamed(self, node):
-        return storageutil.filerevisioncopied(self, node)
+        return self._revinfo(node).is_copied()
 
     def size(self, rev):
         """return the size of a given revision"""
 
-        # for revisions with renames, we have to go the slow way
-        node = self.node(rev)
-        if self.renamed(node):
-            return len(self.read(node))
         if self.iscensored(rev):
             return 0
 
-        # XXX if self.read(node).startswith("\1\n"), this returns (size+4)
-        return self._revlog.size(rev)
+        return self._revinfo(self.node(rev)).textsize();
 
     def cmp(self, node, text):
         """compare text with a given file revision
 
         returns True if text is different than what is stored.
         """
-        return not storageutil.filedataequivalent(self, node, text)
+        return not self._revinfo(node).filedataequivalent(text)
 
     def verifyintegrity(self, state):
         return self._revlog.verifyintegrity(state)
diff --git a/mercurial/utils/storageutil.py b/mercurial/utils/storageutil.py
--- a/mercurial/utils/storageutil.py
+++ b/mercurial/utils/storageutil.py
@@ -105,6 +105,81 @@ 
     return text[offset + 2 :]
 
 
+class filerev_content(object):
+    node = None
+    meta = None
+    
+    # meta_offset = None
+    meta_len    = 0
+    text_offs   = 0
+    text_len    = 0
+    raw = None
+    
+    def __init__(self, store, nodeorrev):
+        self.store = store
+        if isinstance(nodeorrev, int):
+            self.node = store.node(nodeorrev)
+        else:
+            self.node = nodeorrev
+        
+        self.raw = store.revision(nodeorrev)
+        if self.raw.startswith(b'\x01\n'):
+            offset = self.raw.index(b'\x01\n', 2)
+            self.meta_len   = offset+2
+            self.text_offs  = offset+2
+            self.meta = parsemeta(self.raw)[0]
+        self.text_len = len(self.raw) - self.text_offs
+
+    def text(self):
+        return self.raw[self.text_offs :]
+
+    def textsize(self):
+        return self.text_len
+
+    def issame_text(self, data):
+        return self.raw.endswith(data, self.text_offs)
+    
+    def is_copied(self):
+        if not self.meta:
+            return False
+        return (b'copy' in self.meta and b'copyrev' in self.meta)
+
+    def iscensored_text(self):
+        if self.meta:
+            return ( b'censored' in self.meta )
+        return False 
+    
+    def iscensored_node(self):
+        return self.store.iscensored( self.store.rev(self.node) );
+    
+    def filedataequivalent(self, filedata):
+        """Determines whether file data is equivalent to a stored node.
+    
+        Returns True if the passed file data would hash to the same value
+        as a stored revision and False otherwise.
+    
+        When a stored revision is censored, filedata must be empty to have
+        equivalence.
+    
+        When a stored revision has copy metadata, it is ignored as part
+        of the compare.
+        """
+        if self.text_len == len(filedata):
+            # calculating cache too expensive if we alredy have raw data to compare
+            if self.issame_text(filedata): 
+                return True
+
+        # Censored files compare against the empty file.
+        if self.iscensored_text() or self.iscensored_node():
+            return filedata == b''
+    
+        # Renaming a file produces a different hash, even if the data
+        # remains unchanged. Check if that's the case.
+        if self.is_copied():
+            return self.issame_text(filedata)
+    
+        return False
+
 def filerevisioncopied(store, node):
     """Resolve file revision copy metadata.
 
@@ -126,41 +201,8 @@ 
 
 
 def filedataequivalent(store, node, filedata):
-    """Determines whether file data is equivalent to a stored node.
-
-    Returns True if the passed file data would hash to the same value
-    as a stored revision and False otherwise.
-
-    When a stored revision is censored, filedata must be empty to have
-    equivalence.
-
-    When a stored revision has copy metadata, it is ignored as part
-    of the compare.
-    """
-
-    if filedata.startswith(b'\x01\n'):
-        revisiontext = b'\x01\n\x01\n' + filedata
-    else:
-        revisiontext = filedata
-
-    p1, p2 = store.parents(node)
-
-    computednode = hashrevisionsha1(revisiontext, p1, p2)
-
-    if computednode == node:
-        return True
-
-    # Censored files compare against the empty file.
-    if store.iscensored(store.rev(node)):
-        return filedata == b''
-
-    # Renaming a file produces a different hash, even if the data
-    # remains unchanged. Check if that's the case.
-    if store.renamed(node):
-        return store.read(node) == filedata
-
-    return False
-
+    info = filerev_content(store, node)
+    return info.filedataequivalent(filedata)
 
 def iterrevs(storelen, start=0, stop=None):
     """Iterate over revision numbers in a store."""
@@ -233,7 +275,6 @@ 
 
     raise error.LookupError(fileid, identifier, _(b'no match found'))
 
-
 def resolvestripinfo(minlinkrev, tiprev, headrevs, linkrevfn, parentrevsfn):
     """Resolve information needed to strip revisions.