Patchwork util.h: getbe32() and putbe32() use intrinsic function to improve performance

login
register
mail settings
Submitter elson.wei@gmail.com
Date Sept. 14, 2013, 3:23 a.m.
Message ID <5e59af7da42418a2d79a.1379129016@ElsonWei-NB.PrimeVOLT>
Download mbox | patch
Permalink /patch/2500/
State Accepted
Commit 06badf7d10dc197cdcbeab6eb0c18d1957eebbe0
Headers show

Comments

elson.wei@gmail.com - Sept. 14, 2013, 3:23 a.m.
# HG changeset patch
# User Wei, Elson <elson.wei@gmail.com>
# Date 1379128954 -28800
#      Sat Sep 14 11:22:34 2013 +0800
# Node ID 5e59af7da42418a2d79ad8720d4eb9725851112b
# Parent  f3beb092c0285cd118c4af05688495f05aca8f3a
util.h: getbe32() and putbe32() use intrinsic function to improve performance

Refer to https://bugzilla.mozilla.org/show_bug.cgi?id=351557. It can improve
byte-swapping performance by intrinsic function.
Siddharth Agarwal - Sept. 14, 2013, 11:48 p.m.
On 09/13/2013 08:23 PM, elson.wei@gmail.com wrote:
> # HG changeset patch
> # User Wei, Elson <elson.wei@gmail.com>
> # Date 1379128954 -28800
> #      Sat Sep 14 11:22:34 2013 +0800
> # Node ID 5e59af7da42418a2d79ad8720d4eb9725851112b
> # Parent  f3beb092c0285cd118c4af05688495f05aca8f3a
> util.h: getbe32() and putbe32() use intrinsic function to improve performance

Have you found an actual performance difference, maybe on MSVC? I tried 
(with the fix below) but didn't find anything significant. With gcc 
4.4.6, comparing perfdirstate on a repository with 210,000 files gives me

without the patch: ! wall 0.147895 comb 0.150000 user 0.130000 sys 
0.020000 (best of 50)
with the patch: ! wall 0.144038 comb 0.150000 user 0.130000 sys 0.020000 
(best of 50)



>
> Refer to https://bugzilla.mozilla.org/show_bug.cgi?id=351557. It can improve
> byte-swapping performance by intrinsic function.
>
> diff --git a/mercurial/util.h b/mercurial/util.h
> --- a/mercurial/util.h
> +++ b/mercurial/util.h
> @@ -151,6 +151,17 @@
>   #define inline __inline
>   #endif
>   
> +#if defined(_MSC_VER) && (_MSC_VER >= 1300)
> +static inline uint32_t getbe32(const char *c)
> +{
> +	return _byteswap_ulong(*(uint32_t *)c);
> +}
> +#elif GCC_VERSION >= 403

gcc doesn't have a predefined GCC_VERSION macro. You need something like

#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)

> +static inline uint32_t getbe32(const char *c)
> +{
> +	return __builtin_bswap32(*(uint32_t *)c);
> +}
> +#else
>   static inline uint32_t getbe32(const char *c)
>   {
>   	const unsigned char *d = (const unsigned char *)c;
> @@ -160,7 +171,21 @@
>   		(d[2] << 8) |
>   		(d[3]));
>   }
> +#endif
>   
> +#if defined(_MSC_VER) && (_MSC_VER >= 1300)
> +static inline void putbe32(uint32_t x, char *c)
> +{
> +	x = _byteswap_ulong(x);
> +	*(uint32_t *)c = x;
> +}
> +#elif GCC_VERSION >= 403

Here too.

I don't know what clang supports.

> +static inline void putbe32(uint32_t x, char *c)
> +{
> +	x = __builtin_bswap32(x);
> +	*(uint32_t *)c = x;
> +}
> +#else
>   static inline void putbe32(uint32_t x, char *c)
>   {
>   	c[0] = (x >> 24) & 0xff;
> @@ -168,5 +193,6 @@
>   	c[2] = (x >> 8) & 0xff;
>   	c[3] = (x) & 0xff;
>   }
> +#endif
>   
>   #endif /* _HG_UTIL_H_ */
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel@selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
Matt Mackall - Sept. 14, 2013, 11:51 p.m.
On Sat, 2013-09-14 at 16:48 -0700, Siddharth Agarwal wrote:
> On 09/13/2013 08:23 PM, elson.wei@gmail.com wrote:
> > # HG changeset patch
> > # User Wei, Elson <elson.wei@gmail.com>
> > # Date 1379128954 -28800
> > #      Sat Sep 14 11:22:34 2013 +0800
> > # Node ID 5e59af7da42418a2d79ad8720d4eb9725851112b
> > # Parent  f3beb092c0285cd118c4af05688495f05aca8f3a
> > util.h: getbe32() and putbe32() use intrinsic function to improve performance
> 
> Have you found an actual performance difference, maybe on MSVC? I tried 
> (with the fix below) but didn't find anything significant. With gcc 
> 4.4.6, comparing perfdirstate on a repository with 210,000 files gives me
> 
> without the patch: ! wall 0.147895 comb 0.150000 user 0.130000 sys 
> 0.020000 (best of 50)
> with the patch: ! wall 0.144038 comb 0.150000 user 0.130000 sys 0.020000 
> (best of 50)

I can't measure it either.
Matt Mackall - Sept. 18, 2013, 6:52 p.m.
On Sat, 2013-09-14 at 11:23 +0800, elson.wei@gmail.com wrote:
> # HG changeset patch
> # User Wei, Elson <elson.wei@gmail.com>
> # Date 1379128954 -28800
> #      Sat Sep 14 11:22:34 2013 +0800
> # Node ID 5e59af7da42418a2d79ad8720d4eb9725851112b
> # Parent  f3beb092c0285cd118c4af05688495f05aca8f3a
> util.h: getbe32() and putbe32() use intrinsic function to improve performance
> 
> Refer to https://bugzilla.mozilla.org/show_bug.cgi?id=351557. It can improve
> byte-swapping performance by intrinsic function.

Queueing this to default despite failure to measure as it appears
obviously correct. Thanks.

Patch

diff --git a/mercurial/util.h b/mercurial/util.h
--- a/mercurial/util.h
+++ b/mercurial/util.h
@@ -151,6 +151,17 @@ 
 #define inline __inline
 #endif
 
+#if defined(_MSC_VER) && (_MSC_VER >= 1300)
+static inline uint32_t getbe32(const char *c)
+{
+	return _byteswap_ulong(*(uint32_t *)c);
+}
+#elif GCC_VERSION >= 403
+static inline uint32_t getbe32(const char *c)
+{
+	return __builtin_bswap32(*(uint32_t *)c);
+}
+#else
 static inline uint32_t getbe32(const char *c)
 {
 	const unsigned char *d = (const unsigned char *)c;
@@ -160,7 +171,21 @@ 
 		(d[2] << 8) |
 		(d[3]));
 }
+#endif
 
+#if defined(_MSC_VER) && (_MSC_VER >= 1300)
+static inline void putbe32(uint32_t x, char *c)
+{
+	x = _byteswap_ulong(x);
+	*(uint32_t *)c = x;
+}
+#elif GCC_VERSION >= 403
+static inline void putbe32(uint32_t x, char *c)
+{
+	x = __builtin_bswap32(x);
+	*(uint32_t *)c = x;
+}
+#else
 static inline void putbe32(uint32_t x, char *c)
 {
 	c[0] = (x >> 24) & 0xff;
@@ -168,5 +193,6 @@ 
 	c[2] = (x >> 8) & 0xff;
 	c[3] = (x) & 0xff;
 }
+#endif
 
 #endif /* _HG_UTIL_H_ */