Patchwork [07,of,10] bytearray: replace array.array with bytearray

login
register
mail settings
Submitter timeless
Date May 12, 2016, 1:23 a.m.
Message ID <0032690e7a7869470063.1463016195@gcc2-power8.osuosl.org>
Download mbox | patch
Permalink /patch/15055/
State Changes Requested
Headers show

Comments

timeless - May 12, 2016, 1:23 a.m.
# HG changeset patch
# User timeless <timeless@mozdev.org>
# Date 1461083470 0
#      Tue Apr 19 16:31:10 2016 +0000
# Node ID 0032690e7a78694700632fe1fed541ab2f2ff3b0
# Parent  d0b1768012385bb1ceacace2f6c32ee28a29f46e
# Available At bb://timeless/mercurial-crew
#              hg pull bb://timeless/mercurial-crew -r 0032690e7a78
bytearray: replace array.array with bytearray

The array.array api changes significantly between py2 and py3.
OTOH, bytearray was backported to py2.6+, and thus is supported
everywhere Mercurial runs, and its interface is fairly consistent
between py2 and py3.

Mercurial just wants an array that manages bytes,
so this fits the bill nicely.
Yuya Nishihara - May 18, 2016, 2:21 p.m.
On Thu, 12 May 2016 01:23:15 +0000, timeless wrote:
> # HG changeset patch
> # User timeless <timeless@mozdev.org>
> # Date 1461083470 0
> #      Tue Apr 19 16:31:10 2016 +0000
> # Node ID 0032690e7a78694700632fe1fed541ab2f2ff3b0
> # Parent  d0b1768012385bb1ceacace2f6c32ee28a29f46e
> # Available At bb://timeless/mercurial-crew
> #              hg pull bb://timeless/mercurial-crew -r 0032690e7a78
> bytearray: replace array.array with bytearray
> 
> The array.array api changes significantly between py2 and py3.
> OTOH, bytearray was backported to py2.6+, and thus is supported
> everywhere Mercurial runs, and its interface is fairly consistent
> between py2 and py3.
> 
> Mercurial just wants an array that manages bytes,
> so this fits the bill nicely.

At first glance, this patch looks good to me. But as you know, bytearray()
requires extra str() calls. So I'll have to review this patch more carefully.

If nobody review this, I'll revisit it later.
Martijn Pieters - May 18, 2016, 4:08 p.m.
On 12 May 2016 at 02:23, timeless <timeless@fmr.im> wrote:
> # HG changeset patch
> # User timeless <timeless@mozdev.org>
> # Date 1461083470 0
> #      Tue Apr 19 16:31:10 2016 +0000
> # Node ID 0032690e7a78694700632fe1fed541ab2f2ff3b0
> # Parent  d0b1768012385bb1ceacace2f6c32ee28a29f46e
> # Available At bb://timeless/mercurial-crew
> #              hg pull bb://timeless/mercurial-crew -r 0032690e7a78
> bytearray: replace array.array with bytearray
>
> The array.array api changes significantly between py2 and py3.
> OTOH, bytearray was backported to py2.6+, and thus is supported
> everywhere Mercurial runs, and its interface is fairly consistent
> between py2 and py3.
>
> Mercurial just wants an array that manages bytes,
> so this fits the bill nicely.
>
> diff -r d0b176801238 -r 0032690e7a78 mercurial/branchmap.py
> --- a/mercurial/branchmap.py    Wed May 11 23:27:40 2016 +0000
> +++ b/mercurial/branchmap.py    Tue Apr 19 16:31:10 2016 +0000
> @@ -7,7 +7,6 @@
>
>  from __future__ import absolute_import
>
> -import array
>  import struct
>  import time
>
> @@ -23,7 +22,6 @@
>      scmutil,
>  )
>
> -array = array.array
>  calcsize = struct.calcsize
>  pack = struct.pack
>  unpack = struct.unpack
> @@ -357,7 +355,7 @@
>          assert repo.filtername is None
>          self._repo = repo
>          self._names = [] # branch names in local encoding with static index
> -        self._rbcrevs = array('c') # structs of type _rbcrecfmt
> +        self._rbcrevs = bytearray() # structs of type _rbcrecfmt
>          self._rbcsnameslen = 0
>          try:
>              bndata = repo.vfs.read(_rbcnames)
> @@ -371,7 +369,7 @@
>          if self._names:
>              try:
>                  data = repo.vfs.read(_rbcrevs)
> -                self._rbcrevs.fromstring(data)
> +                self._rbcrevs = bytearray(data)
>              except (IOError, OSError) as inst:
>                  repo.ui.debug("couldn't read revision branch cache: %s\n" %
>                                inst)
> @@ -389,8 +387,7 @@
>          self._rbcnamescount = 0
>          self._namesreverse.clear()
>          self._rbcrevslen = len(self._repo.changelog)
> -        self._rbcrevs = array('c')
> -        self._rbcrevs.fromstring('\0' * (self._rbcrevslen * _rbcrecsize))
> +        self._rbcrevs = bytearray('\0' * (self._rbcrevslen * _rbcrecsize))

Just pass in the integer result:

  self._rbcrevs = bytearray(self._rbcrevslen * _rbcrecsize)

This produces a bytearray of that size, filled with null bytes.

>      def branchinfo(self, rev):
>          """Return branch name and close flag for rev, using and updating
> @@ -450,8 +447,7 @@
>      def _setcachedata(self, rev, node, branchidx):
>          """Writes the node's branch data to the in-memory cache data."""
>          rbcrevidx = rev * _rbcrecsize
> -        rec = array('c')
> -        rec.fromstring(pack(_rbcrecfmt, node, branchidx))
> +        rec = bytearray(pack(_rbcrecfmt, node, branchidx))
>          self._rbcrevs[rbcrevidx:rbcrevidx + _rbcrecsize] = rec
>          self._rbcrevslen = min(self._rbcrevslen, rev)
>
> diff -r d0b176801238 -r 0032690e7a78 mercurial/exchange.py
> --- a/mercurial/exchange.py     Wed May 11 23:27:40 2016 +0000
> +++ b/mercurial/exchange.py     Tue Apr 19 16:31:10 2016 +0000
> @@ -1638,7 +1638,7 @@
>              chunks.extend([node, fnode])
>
>      if chunks:
> -        bundler.newpart('hgtagsfnodes', data=''.join(chunks))
> +        bundler.newpart('hgtagsfnodes', data=''.join([str(a) for a in chunks]))

Would starting chunks as a bytearray, then extending *that* instead of
a list, not work better? That would avoid creating a series of
intermediate strings here.

    chunks = bytearray()

    for node in outgoing.missingheads:
        # Don't compute missing, as this may slow down serving.
        fnode = cache.getfnode(node, computemissing=False)
        if fnode is not None:
            chunks += node
            chunks += fnode

Now you have just one bytestring object to convert to str.

>  def check_heads(repo, their_heads, context):
>      """check if the heads of a repo have been modified
> diff -r d0b176801238 -r 0032690e7a78 mercurial/manifest.py
> --- a/mercurial/manifest.py     Wed May 11 23:27:40 2016 +0000
> +++ b/mercurial/manifest.py     Tue Apr 19 16:31:10 2016 +0000
> @@ -7,7 +7,6 @@
>
>  from __future__ import absolute_import
>
> -import array
>  import heapq
>  import os
>  import struct
> @@ -338,7 +337,7 @@
>              return self._lm.text()
>
>      def fastdelta(self, base, changes):
> -        """Given a base manifest text as an array.array and a list of changes
> +        """Given a base manifest text as a bytearray and a list of changes
>          relative to that text, compute a delta that can be used by revlog.
>          """
>          delta = []
> @@ -384,8 +383,8 @@
>          else:
>              # For large changes, it's much cheaper to just build the text and
>              # diff it.
> -            arraytext = array.array('c', self.text())
> -            deltatext = mdiff.textdiff(base, arraytext)
> +            arraytext = bytearray(self.text())
> +            deltatext = mdiff.textdiff(str(base), str(arraytext))
>
>          return arraytext, deltatext

If self.text() is a string already, why convert to a bytearray then
back to str? Better use

    text = self.text()
    arraytext = bytearray(text)
    deltatext = mdiff.textdiff(str(base), text)

> @@ -443,12 +442,12 @@
>      # for large addlist arrays, building a new array is cheaper
>      # than repeatedly modifying the existing one
>      currentposition = 0
> -    newaddlist = array.array('c')
> +    newaddlist = bytearray()
>
>      for start, end, content in x:
>          newaddlist += addlist[currentposition:start]
>          if content:
> -            newaddlist += array.array('c', content)
> +            newaddlist += bytearray(content)
>
>          currentposition = end
>
> @@ -1015,7 +1014,7 @@
>          else:
>              text = self.revision(node)
>              m = self._newmanifest(text)
> -            arraytext = array.array('c', text)
> +            arraytext = bytearray(text)
>          self._mancache[node] = (m, arraytext)
>          return m
>
> @@ -1065,7 +1064,7 @@
>              else:
>                  text = m.text(self._usemanifestv2)
>                  n = self.addrevision(text, transaction, link, p1, p2)
> -                arraytext = array.array('c', text)
> +                arraytext = bytearray(text)
>
>          self._mancache[n] = (m, arraytext)
>
> diff -r d0b176801238 -r 0032690e7a78 mercurial/revlog.py
> --- a/mercurial/revlog.py       Wed May 11 23:27:40 2016 +0000
> +++ b/mercurial/revlog.py       Tue Apr 19 16:31:10 2016 +0000
> @@ -352,7 +352,7 @@
>
>      def rev(self, node):
>          try:
> -            return self._nodecache[node]
> +            return self._nodecache[str(node)]
>          except TypeError:
>              raise
>          except RevlogError:
> diff -r d0b176801238 -r 0032690e7a78 mercurial/tags.py
> --- a/mercurial/tags.py Wed May 11 23:27:40 2016 +0000
> +++ b/mercurial/tags.py Tue Apr 19 16:31:10 2016 +0000
> @@ -12,7 +12,6 @@
>
>  from __future__ import absolute_import
>
> -import array
>  import errno
>  import time
>
> @@ -28,8 +27,6 @@
>      util,
>  )
>
> -array = array.array
> -
>  # Tags computation can be expensive and caches exist to make it fast in
>  # the common case.
>  #
> @@ -113,8 +110,8 @@
>                 "tag cache returned bogus head %s" % short(head)
>
>          fnode = tagfnode.get(head)
> -        if fnode and fnode not in seen:
> -            seen.add(fnode)
> +        if fnode and str(fnode) not in seen:
> +            seen.add(str(fnode))
>              if not fctx:
>                  fctx = repo.filectx('.hgtags', fileid=fnode)
>              else:
> @@ -430,13 +427,12 @@
>          self.lookupcount = 0
>          self.hitcount = 0
>
> -        self._raw = array('c')
> -
>          try:
>              data = repo.vfs.read(_fnodescachefile)
>          except (OSError, IOError):
>              data = ""
> -        self._raw.fromstring(data)
> +        data = repo.vfs.tryread(_fnodescachefile)
> +        self._raw = bytearray(data)

data is set twice now; you didn't remove the try..except.

>          # The end state of self._raw is an array that is of the exact length
>          # required to hold a record for every revision in the repository.
> @@ -477,7 +473,7 @@
>          self.lookupcount += 1
>
>          offset = rev * _fnodesrecsize
> -        record = self._raw[offset:offset + _fnodesrecsize].tostring()
> +        record = self._raw[offset:offset + _fnodesrecsize]
>          properprefix = node[0:4]
>
>          # Validate and return existing entry.
> @@ -518,7 +514,7 @@
>
>      def _writeentry(self, offset, prefix, fnode):
>          # Slices on array instances only accept other array.
> -        entry = array('c', prefix + fnode)
> +        entry = bytearray(prefix + fnode)
>          self._raw[offset:offset + _fnodesrecsize] = entry
>          # self._dirtyoffset could be None.
>          self._dirtyoffset = min(self._dirtyoffset, offset) or 0
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
timeless - May 18, 2016, 7:03 p.m.
On Wed, May 18, 2016 at 12:08 PM, Martijn Pieters <mj@zopatista.com> wrote:
> Just pass in the integer result:
>   self._rbcrevs = bytearray(self._rbcrevslen * _rbcrecsize)
> This produces a bytearray of that size, filled with null bytes.

will do
> Would starting chunks as a bytearray, then extending *that* instead of
> a list, not work better? That would avoid creating a series of
> intermediate strings here.
>
>     chunks = bytearray()
>
>     for node in outgoing.missingheads:
>         # Don't compute missing, as this may slow down serving.
>         fnode = cache.getfnode(node, computemissing=False)
>         if fnode is not None:
>             chunks += node
>             chunks += fnode
>
> Now you have just one bytestring object to convert to str.

It definitely works. I'll include it with a v2.

>> -            deltatext = mdiff.textdiff(base, arraytext)
>> +            arraytext = bytearray(self.text())
>> +            deltatext = mdiff.textdiff(str(base), str(arraytext))
>>
>>          return arraytext, deltatext
>
> If self.text() is a string already, why convert to a bytearray then
> back to str?

I'll add a comment. The short answer is that mdiff.textdiff (~ bdiff)
currently takes a string,
but it shouldn't. It should take a bytearray.

The goal of this commit is to get rid of array.array. In a separate
commit, I'll convert bdiff to
accept bytearrays.

>> @@ -430,13 +427,12 @@
>>          try:
>>              data = repo.vfs.read(_fnodescachefile)
>>          except (OSError, IOError):
>>              data = ""
>> -        self._raw.fromstring(data)
>> +        data = repo.vfs.tryread(_fnodescachefile)
>> +        self._raw = bytearray(data)
>
> data is set twice now; you didn't remove the try..except.

Oops. That was a mis-merge. I didn't check the history when I did the merge.
I'll remove the first insertion.

Patch

diff -r d0b176801238 -r 0032690e7a78 mercurial/branchmap.py
--- a/mercurial/branchmap.py	Wed May 11 23:27:40 2016 +0000
+++ b/mercurial/branchmap.py	Tue Apr 19 16:31:10 2016 +0000
@@ -7,7 +7,6 @@ 
 
 from __future__ import absolute_import
 
-import array
 import struct
 import time
 
@@ -23,7 +22,6 @@ 
     scmutil,
 )
 
-array = array.array
 calcsize = struct.calcsize
 pack = struct.pack
 unpack = struct.unpack
@@ -357,7 +355,7 @@ 
         assert repo.filtername is None
         self._repo = repo
         self._names = [] # branch names in local encoding with static index
-        self._rbcrevs = array('c') # structs of type _rbcrecfmt
+        self._rbcrevs = bytearray() # structs of type _rbcrecfmt
         self._rbcsnameslen = 0
         try:
             bndata = repo.vfs.read(_rbcnames)
@@ -371,7 +369,7 @@ 
         if self._names:
             try:
                 data = repo.vfs.read(_rbcrevs)
-                self._rbcrevs.fromstring(data)
+                self._rbcrevs = bytearray(data)
             except (IOError, OSError) as inst:
                 repo.ui.debug("couldn't read revision branch cache: %s\n" %
                               inst)
@@ -389,8 +387,7 @@ 
         self._rbcnamescount = 0
         self._namesreverse.clear()
         self._rbcrevslen = len(self._repo.changelog)
-        self._rbcrevs = array('c')
-        self._rbcrevs.fromstring('\0' * (self._rbcrevslen * _rbcrecsize))
+        self._rbcrevs = bytearray('\0' * (self._rbcrevslen * _rbcrecsize))
 
     def branchinfo(self, rev):
         """Return branch name and close flag for rev, using and updating
@@ -450,8 +447,7 @@ 
     def _setcachedata(self, rev, node, branchidx):
         """Writes the node's branch data to the in-memory cache data."""
         rbcrevidx = rev * _rbcrecsize
-        rec = array('c')
-        rec.fromstring(pack(_rbcrecfmt, node, branchidx))
+        rec = bytearray(pack(_rbcrecfmt, node, branchidx))
         self._rbcrevs[rbcrevidx:rbcrevidx + _rbcrecsize] = rec
         self._rbcrevslen = min(self._rbcrevslen, rev)
 
diff -r d0b176801238 -r 0032690e7a78 mercurial/exchange.py
--- a/mercurial/exchange.py	Wed May 11 23:27:40 2016 +0000
+++ b/mercurial/exchange.py	Tue Apr 19 16:31:10 2016 +0000
@@ -1638,7 +1638,7 @@ 
             chunks.extend([node, fnode])
 
     if chunks:
-        bundler.newpart('hgtagsfnodes', data=''.join(chunks))
+        bundler.newpart('hgtagsfnodes', data=''.join([str(a) for a in chunks]))
 
 def check_heads(repo, their_heads, context):
     """check if the heads of a repo have been modified
diff -r d0b176801238 -r 0032690e7a78 mercurial/manifest.py
--- a/mercurial/manifest.py	Wed May 11 23:27:40 2016 +0000
+++ b/mercurial/manifest.py	Tue Apr 19 16:31:10 2016 +0000
@@ -7,7 +7,6 @@ 
 
 from __future__ import absolute_import
 
-import array
 import heapq
 import os
 import struct
@@ -338,7 +337,7 @@ 
             return self._lm.text()
 
     def fastdelta(self, base, changes):
-        """Given a base manifest text as an array.array and a list of changes
+        """Given a base manifest text as a bytearray and a list of changes
         relative to that text, compute a delta that can be used by revlog.
         """
         delta = []
@@ -384,8 +383,8 @@ 
         else:
             # For large changes, it's much cheaper to just build the text and
             # diff it.
-            arraytext = array.array('c', self.text())
-            deltatext = mdiff.textdiff(base, arraytext)
+            arraytext = bytearray(self.text())
+            deltatext = mdiff.textdiff(str(base), str(arraytext))
 
         return arraytext, deltatext
 
@@ -443,12 +442,12 @@ 
     # for large addlist arrays, building a new array is cheaper
     # than repeatedly modifying the existing one
     currentposition = 0
-    newaddlist = array.array('c')
+    newaddlist = bytearray()
 
     for start, end, content in x:
         newaddlist += addlist[currentposition:start]
         if content:
-            newaddlist += array.array('c', content)
+            newaddlist += bytearray(content)
 
         currentposition = end
 
@@ -1015,7 +1014,7 @@ 
         else:
             text = self.revision(node)
             m = self._newmanifest(text)
-            arraytext = array.array('c', text)
+            arraytext = bytearray(text)
         self._mancache[node] = (m, arraytext)
         return m
 
@@ -1065,7 +1064,7 @@ 
             else:
                 text = m.text(self._usemanifestv2)
                 n = self.addrevision(text, transaction, link, p1, p2)
-                arraytext = array.array('c', text)
+                arraytext = bytearray(text)
 
         self._mancache[n] = (m, arraytext)
 
diff -r d0b176801238 -r 0032690e7a78 mercurial/revlog.py
--- a/mercurial/revlog.py	Wed May 11 23:27:40 2016 +0000
+++ b/mercurial/revlog.py	Tue Apr 19 16:31:10 2016 +0000
@@ -352,7 +352,7 @@ 
 
     def rev(self, node):
         try:
-            return self._nodecache[node]
+            return self._nodecache[str(node)]
         except TypeError:
             raise
         except RevlogError:
diff -r d0b176801238 -r 0032690e7a78 mercurial/tags.py
--- a/mercurial/tags.py	Wed May 11 23:27:40 2016 +0000
+++ b/mercurial/tags.py	Tue Apr 19 16:31:10 2016 +0000
@@ -12,7 +12,6 @@ 
 
 from __future__ import absolute_import
 
-import array
 import errno
 import time
 
@@ -28,8 +27,6 @@ 
     util,
 )
 
-array = array.array
-
 # Tags computation can be expensive and caches exist to make it fast in
 # the common case.
 #
@@ -113,8 +110,8 @@ 
                "tag cache returned bogus head %s" % short(head)
 
         fnode = tagfnode.get(head)
-        if fnode and fnode not in seen:
-            seen.add(fnode)
+        if fnode and str(fnode) not in seen:
+            seen.add(str(fnode))
             if not fctx:
                 fctx = repo.filectx('.hgtags', fileid=fnode)
             else:
@@ -430,13 +427,12 @@ 
         self.lookupcount = 0
         self.hitcount = 0
 
-        self._raw = array('c')
-
         try:
             data = repo.vfs.read(_fnodescachefile)
         except (OSError, IOError):
             data = ""
-        self._raw.fromstring(data)
+        data = repo.vfs.tryread(_fnodescachefile)
+        self._raw = bytearray(data)
 
         # The end state of self._raw is an array that is of the exact length
         # required to hold a record for every revision in the repository.
@@ -477,7 +473,7 @@ 
         self.lookupcount += 1
 
         offset = rev * _fnodesrecsize
-        record = self._raw[offset:offset + _fnodesrecsize].tostring()
+        record = self._raw[offset:offset + _fnodesrecsize]
         properprefix = node[0:4]
 
         # Validate and return existing entry.
@@ -518,7 +514,7 @@ 
 
     def _writeentry(self, offset, prefix, fnode):
         # Slices on array instances only accept other array.
-        entry = array('c', prefix + fnode)
+        entry = bytearray(prefix + fnode)
         self._raw[offset:offset + _fnodesrecsize] = entry
         # self._dirtyoffset could be None.
         self._dirtyoffset = min(self._dirtyoffset, offset) or 0