[PATCHv2] memcpy for ThunderX2

Message ID CY4PR07MB2885AFCD45C697C0522D37BDE4C50@CY4PR07MB2885.namprd07.prod.outlook.com
State New, archived
Headers

Commit Message

Saharoy, Saikat June 20, 2017, 10:06 p.m. UTC
  Hi,

I am resubmitting the memcpy patch for ThunderX2 platform, as given below.
This version of memcpy patch addresses the earlier issues regarding failing glibc tests and too many branches.

The memcpy patch for ThunderX2 uses SIMD instructions to improve performance for large copy sizes.
Standalone execution of Glibc's 'bench-memcpy-random' test on ThunderX2 platform shows expected gain for copy sizes: 4k, 8k, 16k, 32k and 64k using ThunderX2 version of memcpy compared to the default/generic version of memcpy

Results from running "make bench" in Glibc environment however, shows the generic version of memcpy to be faster than simd vesrion of memcpy; I am not sure why the results differ between standalone execution of bench-memcpy-random and running 'make bench' from within Glibc environment, but will appreciate if anyone has any suggestion/comments.


Thanks,

-Roy

saikat.saharoy@caviumnetworks.com
  

Comments

Richard Henderson June 21, 2017, 3:02 a.m. UTC | #1
On 06/20/2017 03:06 PM, Saharoy, Saikat wrote:
>     /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
>     IFUNC_IMPL (i, name, memcpy,
> -	      IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX (midr),
> -			      __memcpy_thunderx)
> -	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
> +              IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr),
> +                              __memcpy_thunderx2)
> +              IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX (midr),
> +                              __memcpy_thunderx)
> +              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))

This is for testing.  Therefore you should run all implementations that can be 
run by the host, even those that are not tuned for the host.  I suspect that 
both your new implementation and the thunderx1 implementation, can be run by 
all armv8 hosts.


r~
  

Patch

==============================================================================================================

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 78d52c7..3f00997 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,3 +1,4 @@ 
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2
+CFLAGS-memcpy_thunderx2.c += -O3 -funroll-loops
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index c4f23df..24366fb 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,11 +25,11 @@ 
 #include <stdio.h>
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	2
+#define MAX_IFUNC       3
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-			size_t max)
+                        size_t max)
 {
   assert (max >= MAX_IFUNC);
 
@@ -39,13 +39,17 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
   IFUNC_IMPL (i, name, memcpy,
-	      IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX (midr),
-			      __memcpy_thunderx)
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+              IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr),
+                              __memcpy_thunderx2)
+              IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX (midr),
+                              __memcpy_thunderx)
+              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
-	      IFUNC_IMPL_ADD (array, i, memmove, IS_THUNDERX (midr),
-			      __memmove_thunderx)
-	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+              IFUNC_IMPL_ADD (array, i, memmove, IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr),
+                              __memmove_thunderx2)
+              IFUNC_IMPL_ADD (array, i, memmove, IS_THUNDERX (midr),
+                              __memmove_thunderx)
+              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
 
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 9f73efb..2cdcde0 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,10 +29,15 @@ 
 extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 
 libc_ifunc (__libc_memcpy,
-            IS_THUNDERX (midr) ? __memcpy_thunderx : __memcpy_generic);
+            IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr) 
+            ? __memcpy_thunderx2 :
+               IS_THUNDERX (midr)                 
+               ? __memcpy_thunderx
+            : __memcpy_generic);
 
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.c b/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
index e69de29..aac453b 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
@@ -0,0 +1,382 @@ 
+/* Multiple versions of memcpy and memmove. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <arm_neon.h>
+
+extern void *
+simd_memcpy (void * dst, const void * src, size_t bytes);
+
+extern void *
+simd_memmove (void * dst, const void * src, size_t bytes);
+
+#define _ldpq(x, y, p) \
+  asm ("ldp %q0, %q1, [%2]" : "=w" ((x)), "=w" ((y)) : "r" ((p)))
+
+#define _stpq(x, y, p) \
+ asm ("stp %q0, %q1, [%2]" :: "w" ((x)), "w" ((y)), "r" ((p)) : "memory")
+
+#define _ldpr(x, y, p) \
+ asm ("ldp %x0, %x1, [%2]" : "=r" ((x)), "=r" ((y)) : "r" ((p)))
+
+#define _stpr(x, y, p) \
+ asm ("stp %x0, %x1, [%2]" :: "r" ((x)), "r" ((y)), "r" ((p)) : "memory")
+
+#define _ldp(x, y, p) \
+ asm ("ldp %w0, %w1, [%2]" : "=r" ((x)), "=r" ((y)) : "r" ((p)))
+
+#define _stp(x, y, p) \
+ asm ("stp %w0, %w1, [%2]" :: "r" ((x)), "r" ((y)), "r" ((p)) : "memory")
+
+#define _ld(x, p) \
+ asm ("ldr %w0, [%1]" : "=r" ((x)) : "r" ((p)))
+
+#define _st(x, p) \
+ asm ("str %w0, [%1]" :: "r" ((x)), "r" ((p)) : "memory")
+
+
+/* Copy 64 bytes using ldp/stp */
+static inline void
+memcpy_64bytes (void * dst, const void * src)
+{
+ int32x4_t u0, v0, u1, v1;
+ _ldpq (u0, v0, src);
+ _stpq (u0, v0, dst);
+ _ldpq (u1, v1, (src + 32));
+ _stpq (u1, v1, (dst + 32));
+}
+
+/* Copy 32 bytes using ldp/stp. */
+static inline void
+memcpy_32bytes (void * dst, const void * src)
+{
+ int32x4_t u, v;
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+/* Copy 16bytes using ldr/str.  */
+static inline void
+memcpy_16bytes (void * dst, const void * src)
+{
+ vst1q_s32 (dst, vld1q_s32(src));
+}
+
+/* Copy 8bytes using ldp/stp. */
+static inline void
+memcpy_8bytes (void * dst, const void * src)
+{
+ int32_t u, v;
+ _ldp (u, v, src);
+ _stp (u, v, dst);
+}
+
+/* Copy 4bytes using ldr/src */
+static inline void
+memcpy_4bytes (void * dst, const void * src)
+{
+ int32_t u;
+ _ld (u, src);
+ _st (u, dst);
+}
+
+static inline
+void copy_fwd_bytes (char * dst, const char * src, size_t sz)
+{
+ while (sz--)
+   *dst++ = *src++;
+}
+
+static inline
+void copy_reverse_bytes (char * dst, const char *src, size_t sz)
+{
+ dst += sz;
+ src += sz;
+ while (sz--)
+   *--dst = *--src;
+}
+
+/* Copy up to 16 bytes */
+static inline void
+memcpy_upto_16bytes (void * dst, const void * src, size_t bytes)
+{
+ if (bytes == 16)
+   {
+     memcpy_16bytes (dst, src);
+     return;
+   }
+ size_t cnt = bytes;
+ if (cnt >= 8)
+   {
+     memcpy_8bytes (dst, src);
+     cnt -= 8;
+     if (cnt == 0)
+       return;
+     memcpy_8bytes (dst + (bytes - 8), src + (bytes - 8));
+     return;
+   }
+ if (cnt >= 4)
+   {
+     memcpy_4bytes (dst, src);
+     cnt -= 4;
+     if (cnt == 0)
+       return;
+     memcpy_4bytes (dst + (bytes - 4), src + (bytes - 4));
+     return;
+   }
+
+ size_t tmp = cnt >> 1;
+
+ *(char *)dst = *(char *)src;
+ *((char *)dst + tmp) = *((char *)src + tmp);
+ *((char *)dst + (bytes - 1)) = *((char *)src + (bytes -1));
+}
+
+static inline void
+memcpy_upto_96bytes (void * dst, const void * src, size_t bytes)
+{
+ /* Assumption: copy size is greater than 16 bytes */
+ size_t rem;
+ size_t i;
+
+ if (!(bytes & 31))
+   {
+     for (i=0; i < (bytes & ~31); i+=32)
+        memcpy_32bytes (dst + i, src + i);
+
+     return;
+   }
+
+ if (bytes > 32)
+   {
+     rem = bytes & 31;
+     for (i=0; i < (bytes & ~31); i+=32)
+        memcpy_32bytes (dst + i, src + i);
+     
+     memcpy_32bytes (dst + (bytes - 32), src + (bytes - 32));
+
+     return;
+   }
+
+ rem = bytes & 15;
+ for (i=0; i < (bytes & ~15); i+=16)
+    memcpy_16bytes (dst + i, src + i);
+
+ if (rem == 0)
+   return;
+
+ memcpy_16bytes (dst + (bytes - 16), src + (bytes - 16));
+}
+
+static inline void
+memcpy_long (void * dst, const void * src, size_t bytes)
+{
+ if (bytes >= 512 && ((size_t)src & 15))
+   {
+     size_t align_bytes = 16 - ((size_t)src & 15);
+     memcpy_16bytes (dst, src);
+     bytes -= align_bytes;
+     src += align_bytes;
+     dst += align_bytes;
+   }
+
+ size_t i;
+ size_t cnt = bytes & ~63;
+ 
+ for (i=0; i < cnt; i+=64)
+    memcpy_64bytes (dst + i, src + i);
+
+ if (bytes == cnt)
+   return;
+
+ memcpy_64bytes (dst + (bytes - 64), src + (bytes - 64));
+}
+
+static inline
+void * simd_copy_fwd (void * dst, const void * src, size_t sz)
+{
+
+ void * dst_orig = dst;
+ size_t rem = sz;
+ size_t cnt;
+
+ if (sz >= 512 && ((size_t)src & 15))
+   {
+     size_t align_bytes = 16 - ((size_t)src & 15);
+     copy_fwd_bytes (dst, src, align_bytes);
+     rem -= align_bytes;
+     src += align_bytes;
+     dst += align_bytes;
+   }
+
+ size_t N_16bytes = rem >> 4;
+
+ if (N_16bytes) 
+   {
+     for (cnt = 0; cnt < (N_16bytes << 4); cnt+=16)
+        memcpy_16bytes (dst + cnt, src + cnt);
+
+     rem -= (N_16bytes << 4);
+     if (rem == 0)
+       return dst_orig;
+     dst += (N_16bytes << 4);
+     src += (N_16bytes << 4);
+   }
+
+ size_t N_8bytes = rem >> 3;
+
+ if (N_8bytes) 
+   {
+     for (cnt = 0; cnt < (N_8bytes << 3); cnt+=8)
+        memcpy_8bytes (dst + cnt, src + cnt);
+
+     rem -= (N_8bytes << 3);
+     if (rem == 0)
+       return dst_orig;
+     dst += (N_8bytes << 3);
+     src += (N_8bytes << 3);
+   }
+
+ size_t N_4bytes = rem >> 2;
+
+ if (N_4bytes) 
+   {
+     for (cnt = 0; cnt < (N_4bytes << 2); cnt+=4)
+        memcpy_4bytes (dst + cnt, src + cnt);
+
+     rem -= (N_4bytes << 2);
+     if (rem == 0)
+       return dst_orig;
+     dst += (N_4bytes << 2);
+     src += (N_4bytes << 2);
+   }
+
+ copy_fwd_bytes (dst, src, rem);
+ return dst_orig;
+}
+
+static inline
+void *
+simd_copy_bak (void * dst, const void * src, size_t sz)
+{
+ void * dst_orig = dst;
+
+ size_t rem = sz;
+ size_t j, cnt;
+
+ if (rem <= 4)
+   goto L_le_4b;
+
+
+ if (rem >> 4) /* there are at least 16 bytes to move */
+   {
+     if (sz >= 512)
+       {
+         size_t align_bytes = ((uint64_t)src + rem) & 15;
+         if (align_bytes)
+           {
+             size_t i;
+             char * dstend = dst + (rem - 1);
+             const char * srcend = src + (rem - 1); 
+
+             for (i=0; i < align_bytes; i++)
+                 *(dstend - i) = *(srcend - i);
+
+             rem -= align_bytes;
+           }
+       }
+
+     char * dstend = dst + rem;
+     const char * srcend = src + rem;
+
+     for (j=16, cnt=0; cnt < (rem & ~15); j+=16, cnt+=16)
+         memcpy_16bytes (dstend - j, srcend - j);
+
+     rem -= (rem & ~15);
+     if (rem == 0)
+       return dst_orig;
+   }
+
+ if (rem >> 3) /* there are at most 15 bytes to move */
+   {
+     memcpy_8bytes (dst + (rem - 8), src + (rem - 8));
+     rem -= 8;
+     if (rem == 0)
+       return dst_orig;
+   }
+
+L_le_4b:
+
+ if (rem >> 2) /* there are at most 7 bytes to move */
+   {
+     memcpy_4bytes (dst + (rem - 4), src + (rem - 4));
+     rem -= 4;
+     if (rem == 0)
+       return dst_orig;
+   }
+
+ /* copy less than 4 bytes */
+ copy_reverse_bytes (dst, src, rem);
+
+ return dst;
+}
+
+
+/* glibc memcpy function follows */
+void *
+__memcpy_thunderx2 (void * dst, const void * src, size_t bytes)
+{
+ if (dst == src || bytes == 0)
+   return dst;
+
+ if (bytes > 96)
+   {
+     memcpy_long (dst, src, bytes);
+     return dst;
+   }
+
+ if (bytes <= 16)
+   {
+     memcpy_upto_16bytes (dst, src, bytes);
+     return dst;
+   }
+
+ if (bytes <= 96)
+   {
+     memcpy_upto_96bytes (dst, src, bytes);
+     return dst;
+   }
+
+ return dst;
+}
+
+
+void *
+__memmove_thunderx2 (void * dst, const void * src, size_t bytes)
+{
+ if (dst == src || bytes == 0)
+   return dst;
+
+ if (src < dst)
+   return simd_copy_bak (dst, src, bytes);
+ else
+   return simd_copy_fwd (dst, src, bytes);
+}
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 34c6b29..5692230 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,10 +29,15 @@ 
 extern __typeof (__redirect_memmove) __libc_memmove;
 
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 
 libc_ifunc (__libc_memmove,
-            IS_THUNDERX (midr) ? __memmove_thunderx : __memmove_generic);
+            IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+            ? __memmove_thunderx2 :
+               IS_THUNDERX (midr)
+               ? __memmove_thunderx
+            : __memmove_generic);
 
 # undef memmove
 strong_alias (__libc_memmove, memmove);
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index c92b650..45ddce6 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -40,6 +40,10 @@ 
 
 #define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C'	\
 			   && MIDR_PARTNUM(midr) == 0x0a1)
+#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B'     \
+                           && MIDR_PARTNUM(midr) == 0x516)
+#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C'       \
+                           && MIDR_PARTNUM(midr) == 0xaf)