diff mbox

[PATCHv3] memcpy for ThunderX2

Message ID CY4PR07MB288500C8E3A35942A183F894E4DA0@CY4PR07MB2885.namprd07.prod.outlook.com
State New, archived
Headers show

Commit Message

Saharoy, Saikat June 21, 2017, 11:38 p.m. UTC
Hi,

I am resubmitting the memcpy patch (v3) for ThunderX2 platform, as given below.

The changes in this version are as follows:
- Implemented comment from Richard Henderson: Fixed ifunc-impl-list.c to allow running/testing of all memcpy implementations supported on host
- Removed an unused assignment in inline function memcpy_upto_96bytes in memcpy_thunderx2.c

Thanks,

-Roy

saikat.saharoy@caviumnetworks.com
diff mbox

Patch

========================================================================================================================

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 78d52c7..3f00997 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,3 +1,4 @@ 
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2
+CFLAGS-memcpy_thunderx2.c += -O3 -funroll-loops
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index c4f23df..baf09da 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,11 +25,11 @@ 
 #include <stdio.h>
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	2
+#define MAX_IFUNC       3
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-			size_t max)
+                        size_t max)
 {
   assert (max >= MAX_IFUNC);
 
@@ -39,13 +39,13 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
   IFUNC_IMPL (i, name, memcpy,
-	      IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX (midr),
-			      __memcpy_thunderx)
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
+              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
-	      IFUNC_IMPL_ADD (array, i, memmove, IS_THUNDERX (midr),
-			      __memmove_thunderx)
-	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx2)
+              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
 
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 9f73efb..2cdcde0 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,10 +29,15 @@ 
 extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 
 libc_ifunc (__libc_memcpy,
-            IS_THUNDERX (midr) ? __memcpy_thunderx : __memcpy_generic);
+            IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr) 
+            ? __memcpy_thunderx2 :
+               IS_THUNDERX (midr)                 
+               ? __memcpy_thunderx
+            : __memcpy_generic);
 
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.c b/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
index e69de29..ffb4d60 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
@@ -0,0 +1,380 @@ 
+/* Multiple versions of memcpy and memmove. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <arm_neon.h>
+
+extern void *
+simd_memcpy (void * dst, const void * src, size_t bytes);
+
+extern void *
+simd_memmove (void * dst, const void * src, size_t bytes);
+
+#define _ldpq(x, y, p) \
+  asm ("ldp %q0, %q1, [%2]" : "=w" ((x)), "=w" ((y)) : "r" ((p)))
+
+#define _stpq(x, y, p) \
+ asm ("stp %q0, %q1, [%2]" :: "w" ((x)), "w" ((y)), "r" ((p)) : "memory")
+
+#define _ldpr(x, y, p) \
+ asm ("ldp %x0, %x1, [%2]" : "=r" ((x)), "=r" ((y)) : "r" ((p)))
+
+#define _stpr(x, y, p) \
+ asm ("stp %x0, %x1, [%2]" :: "r" ((x)), "r" ((y)), "r" ((p)) : "memory")
+
+#define _ldp(x, y, p) \
+ asm ("ldp %w0, %w1, [%2]" : "=r" ((x)), "=r" ((y)) : "r" ((p)))
+
+#define _stp(x, y, p) \
+ asm ("stp %w0, %w1, [%2]" :: "r" ((x)), "r" ((y)), "r" ((p)) : "memory")
+
+#define _ld(x, p) \
+ asm ("ldr %w0, [%1]" : "=r" ((x)) : "r" ((p)))
+
+#define _st(x, p) \
+ asm ("str %w0, [%1]" :: "r" ((x)), "r" ((p)) : "memory")
+
+
+/* Copy 64 bytes using ldp/stp */
+static inline void
+memcpy_64bytes (void * dst, const void * src)
+{
+ int32x4_t u0, v0, u1, v1;
+ _ldpq (u0, v0, src);
+ _stpq (u0, v0, dst);
+ _ldpq (u1, v1, (src + 32));
+ _stpq (u1, v1, (dst + 32));
+}
+
+/* Copy 32 bytes using ldp/stp. */
+static inline void
+memcpy_32bytes (void * dst, const void * src)
+{
+ int32x4_t u, v;
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+/* Copy 16bytes using ldr/str.  */
+static inline void
+memcpy_16bytes (void * dst, const void * src)
+{
+ vst1q_s32 (dst, vld1q_s32(src));
+}
+
+/* Copy 8bytes using ldp/stp. */
+static inline void
+memcpy_8bytes (void * dst, const void * src)
+{
+ int32_t u, v;
+ _ldp (u, v, src);
+ _stp (u, v, dst);
+}
+
+/* Copy 4bytes using ldr/src */
+static inline void
+memcpy_4bytes (void * dst, const void * src)
+{
+ int32_t u;
+ _ld (u, src);
+ _st (u, dst);
+}
+
+static inline
+void copy_fwd_bytes (char * dst, const char * src, size_t sz)
+{
+ while (sz--)
+   *dst++ = *src++;
+}
+
+static inline
+void copy_reverse_bytes (char * dst, const char *src, size_t sz)
+{
+ dst += sz;
+ src += sz;
+ while (sz--)
+   *--dst = *--src;
+}
+
+/* Copy up to 16 bytes */
+static inline void
+memcpy_upto_16bytes (void * dst, const void * src, size_t bytes)
+{
+ if (bytes == 16)
+   {
+     memcpy_16bytes (dst, src);
+     return;
+   }
+ size_t cnt = bytes;
+ if (cnt >= 8)
+   {
+     memcpy_8bytes (dst, src);
+     cnt -= 8;
+     if (cnt == 0)
+       return;
+     memcpy_8bytes (dst + (bytes - 8), src + (bytes - 8));
+     return;
+   }
+ if (cnt >= 4)
+   {
+     memcpy_4bytes (dst, src);
+     cnt -= 4;
+     if (cnt == 0)
+       return;
+     memcpy_4bytes (dst + (bytes - 4), src + (bytes - 4));
+     return;
+   }
+
+ size_t tmp = cnt >> 1;
+
+ *(char *)dst = *(char *)src;
+ *((char *)dst + tmp) = *((char *)src + tmp);
+ *((char *)dst + (bytes - 1)) = *((char *)src + (bytes -1));
+}
+
+static inline void
+memcpy_upto_96bytes (void * dst, const void * src, size_t bytes)
+{
+ /* Assumption: copy size is greater than 16 bytes */
+ size_t rem;
+ size_t i;
+
+ if (!(bytes & 31))
+   {
+     for (i=0; i < (bytes & ~31); i+=32)
+        memcpy_32bytes (dst + i, src + i);
+
+     return;
+   }
+
+ if (bytes > 32)
+   {
+     for (i=0; i < (bytes & ~31); i+=32)
+        memcpy_32bytes (dst + i, src + i);
+     
+     memcpy_32bytes (dst + (bytes - 32), src + (bytes - 32));
+
+     return;
+   }
+
+ rem = bytes & 15;
+ for (i=0; i < (bytes & ~15); i+=16)
+    memcpy_16bytes (dst + i, src + i);
+
+ if (rem == 0)
+   return;
+
+ memcpy_16bytes (dst + (bytes - 16), src + (bytes - 16));
+}
+
+static inline void
+memcpy_long (void * dst, const void * src, size_t bytes)
+{
+ if (bytes >= 512 && ((size_t)src & 15))
+   {
+     size_t align_bytes = 16 - ((size_t)src & 15);
+     memcpy_16bytes (dst, src);
+     bytes -= align_bytes;
+     src += align_bytes;
+     dst += align_bytes;
+   }
+
+ size_t i;
+ size_t cnt = bytes & ~63;
+ 
+ for (i=0; i < cnt; i+=64)
+    memcpy_64bytes (dst + i, src + i);
+
+ if (bytes == cnt)
+   return;
+
+ memcpy_64bytes (dst + (bytes - 64), src + (bytes - 64));
+}
+
+static inline
+void * simd_copy_fwd (void * dst, const void * src, size_t sz)
+{
+
+ void * dst_orig = dst;
+ size_t rem = sz;
+ size_t cnt;
+
+ if (sz >= 512 && ((size_t)src & 15))
+   {
+     size_t align_bytes = 16 - ((size_t)src & 15);
+     copy_fwd_bytes (dst, src, align_bytes);
+     rem -= align_bytes;
+     src += align_bytes;
+     dst += align_bytes;
+   }
+
+ size_t N_16bytes = rem >> 4;
+
+ if (N_16bytes) 
+   {
+     for (cnt = 0; cnt < (N_16bytes << 4); cnt+=16)
+        memcpy_16bytes (dst + cnt, src + cnt);
+
+     rem -= (N_16bytes << 4);
+     if (rem == 0)
+       return dst_orig;
+     dst += (N_16bytes << 4);
+     src += (N_16bytes << 4);
+   }
+
+ size_t N_8bytes = rem >> 3;
+
+ if (N_8bytes) 
+   {
+     for (cnt = 0; cnt < (N_8bytes << 3); cnt+=8)
+        memcpy_8bytes (dst + cnt, src + cnt);
+
+     rem -= (N_8bytes << 3);
+     if (rem == 0)
+       return dst_orig;
+     dst += (N_8bytes << 3);
+     src += (N_8bytes << 3);
+   }
+
+ size_t N_4bytes = rem >> 2;
+
+ if (N_4bytes) 
+   {
+     for (cnt = 0; cnt < (N_4bytes << 2); cnt+=4)
+        memcpy_4bytes (dst + cnt, src + cnt);
+
+     rem -= (N_4bytes << 2);
+     if (rem == 0)
+       return dst_orig;
+     dst += (N_4bytes << 2);
+     src += (N_4bytes << 2);
+   }
+
+ copy_fwd_bytes (dst, src, rem);
+ return dst_orig;
+}
+
+static inline
+void *
+simd_copy_bak (void * dst, const void * src, size_t sz)
+{
+ void * dst_orig = dst;
+
+ size_t rem = sz;
+ size_t j, cnt;
+
+ if (rem <= 4)
+   goto L_le_4b;
+
+
+ if (rem >> 4) /* there are at least 16 bytes to move */
+   {
+     if (sz >= 512)
+       {
+         size_t align_bytes = ((uint64_t)src + rem) & 15;
+         if (align_bytes)
+           {
+             size_t i;
+             char * dstend = dst + (rem - 1);
+             const char * srcend = src + (rem - 1); 
+
+             for (i=0; i < align_bytes; i++)
+                 *(dstend - i) = *(srcend - i);
+
+             rem -= align_bytes;
+           }
+       }
+
+     char * dstend = dst + rem;
+     const char * srcend = src + rem;
+
+     for (j=16, cnt=0; cnt < (rem & ~15); j+=16, cnt+=16)
+         memcpy_16bytes (dstend - j, srcend - j);
+
+     rem -= (rem & ~15);
+     if (rem == 0)
+       return dst_orig;
+   }
+
+ if (rem >> 3) /* there are at most 15 bytes to move */
+   {
+     memcpy_8bytes (dst + (rem - 8), src + (rem - 8));
+     rem -= 8;
+     if (rem == 0)
+       return dst_orig;
+   }
+
+L_le_4b:
+
+ if (rem >> 2) /* there are at most 7 bytes to move */
+   {
+     memcpy_4bytes (dst + (rem - 4), src + (rem - 4));
+     rem -= 4;
+     if (rem == 0)
+       return dst_orig;
+   }
+
+ /* copy less than 4 bytes */
+ copy_reverse_bytes (dst, src, rem);
+
+ return dst;
+}
+
+
+/* glibc memcpy function follows */
+void *
+__memcpy_thunderx2 (void * dst, const void * src, size_t bytes)
+{
+ if (dst == src || bytes == 0)
+   return dst;
+
+ if (bytes > 96)
+   {
+     memcpy_long (dst, src, bytes);
+     return dst;
+   }
+
+ if (bytes <= 16)
+   {
+     memcpy_upto_16bytes (dst, src, bytes);
+     return dst;
+   }
+
+ if (bytes <= 96)
+   {
+     memcpy_upto_96bytes (dst, src, bytes);
+     return dst;
+   }
+ return dst;
+}
+
+
+void *
+__memmove_thunderx2 (void * dst, const void * src, size_t bytes)
+{
+ if (dst == src || bytes == 0)
+   return dst;
+
+ if (src < dst)
+   return simd_copy_bak (dst, src, bytes);
+ else
+   return simd_copy_fwd (dst, src, bytes);
+}
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 34c6b29..5692230 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,10 +29,15 @@ 
 extern __typeof (__redirect_memmove) __libc_memmove;
 
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 
 libc_ifunc (__libc_memmove,
-            IS_THUNDERX (midr) ? __memmove_thunderx : __memmove_generic);
+            IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+            ? __memmove_thunderx2 :
+               IS_THUNDERX (midr)
+               ? __memmove_thunderx
+            : __memmove_generic);
 
 # undef memmove
 strong_alias (__libc_memmove, memmove);
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index c92b650..45ddce6 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -40,6 +40,10 @@ 
 
 #define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C'	\
 			   && MIDR_PARTNUM(midr) == 0x0a1)
+#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B'     \
+                           && MIDR_PARTNUM(midr) == 0x516)
+#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C'       \
+                           && MIDR_PARTNUM(midr) == 0xaf)