Patchwork aarch64: ThunderX2 specific memcpy and memmove

login
register
mail settings
Submitter Saharoy, Saikat
Date May 26, 2017, 9:49 p.m.
Message ID <CY4PR07MB2885361CC2DF7CB09F65BFA3E4FC0@CY4PR07MB2885.namprd07.prod.outlook.com>
Download mbox | patch
Permalink /patch/20611/
State New
Headers show

Comments

Saharoy, Saikat - May 26, 2017, 9:49 p.m.
With the IFUNC infrastructure for aarch64 in place, here is a
 patch to add ThunderX2-specific versions of memcpy and
 memmove.

 The ThunderX2 version of memcpy and memmove use SIMD
 instructions and paired loads/stores to obtain improved performance
 for suitable copy sizes, especially for large (> 128 KB) sizes.

 If people think we should use a separate ThunderX2 version of memcpy
 (and memmove) for all aarch64 systems, I will be happy to drop this patch.
 The primary change is addition of a new file memcpy_thunderx2.c.
 Other minor changes are to support ThunderX2 specific functions for
 memcpy and memmove.

 Thanks,

 Saikat Saharoy
 saikat.saharoy@cavium.com


 ChangeLog entry:

         * sysdeps/aarch64/multiarch/memcpy_thunderx2.c: New file.
         * sysdeps/aarch64/multiarch/Makefile:
               Add memcpy_thunderx2 to sysdep_routines and its build flags
         * sysdeps/aarch64/multiarch/memcpy.c:
               Use ThunderX2 memcpy function
         * sysdeps/aarch64/multiarch/memmove.c: Likewise.
         * sysdeps/aarch64/multiarch/ifunc-impl-list.c:
               Add ThunderX2 memcpy and memmove in IFUNC implementations
         * sysdeps/unix/sysv/linux/aarch64/cpu-features.h:
               Add checks for ThunderX2 and ThunderX2 Pass A platforms

Patch

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 78d52c7..25b37de 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,3 +1,4 @@ 
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2
+CFLAGS-memcpy_thunderx2.c += -O3 -funroll-loops -fPIC
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index c4f23df..ee6f3ea 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@ 
 #include <stdio.h>
 
 /* Maximum number of IFUNC implementations.  */
-#define MAX_IFUNC	2
+#define MAX_IFUNC	3
 
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -39,10 +39,16 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c.  */
   IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr),
+			      __memcpy_thunderx2)
 	      IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX (midr),
 			      __memcpy_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove, IS_THUNDERX2 (midr),
+			      __memmove_thunderx2)
+	      IFUNC_IMPL_ADD (array, i, memmove, IS_THUNDERX2PA (midr),
+			      __memmove_thunderx2)
 	      IFUNC_IMPL_ADD (array, i, memmove, IS_THUNDERX (midr),
 			      __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 9f73efb..2cdcde0 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,10 +29,15 @@ 
 extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 
 libc_ifunc (__libc_memcpy,
-            IS_THUNDERX (midr) ? __memcpy_thunderx : __memcpy_generic);
+            IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr) 
+            ? __memcpy_thunderx2 :
+               IS_THUNDERX (midr)                 
+               ? __memcpy_thunderx
+            : __memcpy_generic);
 
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.c b/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
index e69de29..cc0fb9e 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
@@ -0,0 +1,764 @@ 
+/* Multiple versions of memcpy and memmove. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <arm_neon.h>
+#include <string.h>
+
+/* Inline-asm wrappers around AArch64 paired and single load/store
+   instructions.  X and Y are the operand lvalues, P the base address.
+   The store variants declare a "memory" clobber so the compiler does
+   not cache memory values across them.  */
+
+/* Load a pair of 128-bit (q-register) SIMD values from *P.  */
+#define _ldpq(x, y, p) \
+ asm ("ldp %q0, %q1, [%2]" : "=w" ((x)), "=w" ((y)) : "r" ((p)))
+
+/* Store a pair of 128-bit SIMD values to *P.  */
+#define _stpq(x, y, p) \
+ asm ("stp %q0, %q1, [%2]" :: "w" ((x)), "w" ((y)), "r" ((p)) : "memory")
+
+/* Load/store a pair of 64-bit (x-register) values — not used in the
+   code visible in this file.  */
+#define _ldpr(x, y, p) \
+ asm ("ldp %x0, %x1, [%2]" : "=r" ((x)), "=r" ((y)) : "r" ((p)))
+
+#define _stpr(x, y, p) \
+ asm ("stp %x0, %x1, [%2]" :: "r" ((x)), "r" ((y)), "r" ((p)) : "memory")
+
+/* Load/store a pair of 32-bit (w-register) values (8 bytes total).  */
+#define _ldp(x, y, p) \
+ asm ("ldp %w0, %w1, [%2]" : "=r" ((x)), "=r" ((y)) : "r" ((p)))
+
+#define _stp(x, y, p) \
+ asm ("stp %w0, %w1, [%2]" :: "r" ((x)), "r" ((y)), "r" ((p)) : "memory")
+
+/* Load/store a single 32-bit value.  */
+#define _ld(x, p) \
+ asm ("ldr %w0, [%1]" : "=r" ((x)) : "r" ((p)))
+
+#define _st(x, p) \
+ asm ("str %w0, [%1]" :: "r" ((x)), "r" ((p)) : "memory")
+
+/* Copy N_16bytes consecutive 16-byte chunks, low to high, with NEON
+   128-bit loads/stores.  DST/SRC may be void *; GNU void * pointer
+   arithmetic is used throughout this file.  */
+#define _memcpy_fixed_simd(dst, src, N_16bytes) \
+  do { \
+    int _i; \
+    size_t _offset = 0; \
+    for (_i=0; _i < (N_16bytes); _i++, _offset += 16) \
+      { \
+	vst1q_s32 ((dst) + _offset, vld1q_s32 ((src) + _offset)); \
+      } \
+  } while (0)
+
+/* Copy exactly 32 bytes using one LDP/STP pair.  */
+static inline void
+memcpy_32bytes (void * dst, void * src)
+{
+ int32x4_t u, v;
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+/* Copy exactly 16 bytes using one NEON load/store.  */
+static inline void
+memcpy_16bytes (void * dst, void * src)
+{
+ vst1q_s32 (dst, vld1q_s32(src));
+}
+
+
+/* Copy exactly 8 bytes using a 32-bit LDP/STP pair.  */
+static inline void
+memcpy_8bytes (void * dst, void * src)
+{
+ int32_t u, v;
+ _ldp (u, v, src);
+ _stp (u, v, dst);
+}
+
+/* Copy exactly 4 bytes using ldr/str.  */ 
+static inline void
+memcpy_4bytes (void * dst, void * src)
+{
+ int32_t u;
+ _ld (u, src);
+ _st (u, dst);
+}
+
+/* Copy exactly 128 bytes as four LDP/STP sequences at offsets
+   0/32/64/96.  Each 32-byte chunk is loaded before it is stored;
+   chunks are processed low to high.  */
+static inline void
+memcpy_128bytes (void * dst, void * src)
+{
+ int32x4_t u8, v8;
+ int32x4_t u16, v16;
+ int32x4_t u24, v24;
+ int32x4_t u32, v32;
+
+ _ldpq (u8, v8, src);
+ _stpq (u8, v8, dst);
+
+ _ldpq (u16, v16, src + 32);
+ _stpq (u16, v16, dst + 32);
+
+ _ldpq (u24, v24, src + 64);
+ _stpq (u24, v24, dst + 64);
+
+ _ldpq (u32, v32, src + 96);
+ _stpq (u32, v32, dst + 96);
+}
+
+/* Copy BYTES in 128-byte strides, low to high.  BYTES must be a
+   non-zero multiple of 128: the do/while body executes at least once.
+   Prefetches the source 256 and 512 bytes ahead before starting.  */
+static inline void
+memcpy_128byte_multiple (void * dst, void * src, size_t bytes)
+{
+ size_t i = 0;
+ int32x4_t u8, v8;
+ int32x4_t u16, v16;
+ int32x4_t u24, v24;
+ int32x4_t u32, v32;
+
+ __builtin_prefetch (src + 256, 0, 3);
+ __builtin_prefetch (src + 512, 0, 3);
+
+ do { 
+   _ldpq (u8, v8, src + i);
+   _stpq (u8, v8, dst + i);
+
+   _ldpq (u16, v16, src + i + 32);
+   _stpq (u16, v16, dst + i + 32);
+
+   _ldpq (u24, v24, src + i + 64);
+   _stpq (u24, v24, dst + i + 64);
+
+   _ldpq (u32, v32, src + i + 96);
+   _stpq (u32, v32, dst + i + 96);
+ } while ((i += 128) < bytes);
+}
+
+/* Copy BYTES in 32-byte strides, low to high.  BYTES must be a
+   non-zero multiple of 32 (the do/while runs at least once).  */
+static inline void 
+memcpy_32byte_multiple (void * dst, void * src, size_t bytes)
+{
+  size_t i = 0;
+  int32x4_t u, v;
+
+  __builtin_prefetch(src + 256, 0, 3);
+  __builtin_prefetch (src + 512, 0, 3);
+
+ do {
+   _ldpq (u, v, src + i);
+   _stpq (u, v, dst + i);
+ } while ((i += 32) < bytes);
+}
+
+
+/* Copy any size over 128 bytes: bulk-copy the largest 128-byte
+   multiple, then finish with one overlapping block copy that ends
+   exactly at BYTES (128, 32 or 16 bytes wide depending on the
+   remainder).  Overlapped bytes are written twice with identical
+   data, which is harmless under memcpy semantics.  Callers ensure
+   BYTES > 128 and (BYTES & 127) != 0.  */
+static inline void 
+memcpy_any_over_128bytes (void * dst, void * src, size_t bytes)
+{
+ memcpy_128byte_multiple (dst, src, bytes & ~127);
+
+ size_t rem = bytes & 127;
+
+ if (rem > 32)
+   {
+     memcpy_128bytes (dst + (bytes - 128), src + (bytes - 128));
+     return;
+   }
+
+ if (rem > 16)
+   {
+     memcpy_32bytes (dst + (bytes - 32), src + (bytes - 32));
+     return;
+   }
+
+ /* max size of remainder is 16 bytes */
+ memcpy_16bytes (dst + (bytes - 16), src + (bytes - 16));
+}
+
+/* Copy any size with 32 < BYTES <= 128: bulk-copy the largest 32-byte
+   multiple, then one overlapping 32-byte copy ending at BYTES.  */
+static inline void 
+memcpy_any_over_32bytes (void * dst, void * src, size_t bytes)
+{
+ memcpy_32byte_multiple (dst, src, bytes & ~31);
+
+ memcpy_32bytes (dst + (bytes - 32), src + (bytes - 32));
+}
+
+
+/* Copy BYTES (0..16) from SRC to DST with memcpy semantics (regions
+   assumed disjoint).  Sizes 5..7 and 9..15 use two overlapping
+   copies — tail first, then head — so some middle bytes are written
+   twice with the same data.  */
+static inline void 
+memcpy_upto_16bytes (void * dst, void * src, size_t bytes)
+{
+ if (bytes == 16)
+   {
+     memcpy_16bytes (dst, src);
+     return;
+   }
+
+ size_t rem = bytes;
+ size_t index;
+
+ if (bytes >= 8)
+   {
+     goto L_mc_8;	/* 8..15 bytes */
+   }
+ if (bytes == 4)
+   {
+     memcpy_4bytes (dst, src);
+     return;
+   }
+
+ if (bytes == 1)
+   {
+     ((char *)dst)[0] = ((char *)src)[0];
+     return;
+   }
+ if (bytes == 2)
+ {
+        ((char *)dst)[1] = ((char *)src)[1];
+        ((char *)dst)[0] = ((char *)src)[0];
+        return;
+ }
+ if (bytes == 3)
+   {
+     ((char *)dst)[2] = ((char *)src)[2];
+     ((char *)dst)[1] = ((char *)src)[1];
+     ((char *)dst)[0] = ((char *)src)[0];
+     return;
+   }
+
+ if (__glibc_unlikely(bytes == 0))
+   return;
+
+ goto L_mc_4;	/* only sizes 5..7 reach here */
+
+ L_mc_8:
+  /* Copy the 8 bytes ending at BYTES, then enough of the head to
+     cover the rest.  */
+  index = bytes - 8;
+  memcpy_8bytes (dst + index, src + index);
+
+  rem -= 8;
+  if (rem == 0)
+    return;
+
+  /* Sizes 13..15 need an 8-byte head copy; 9..12 are fully covered
+     by a 4-byte head copy plus the tail copy above.  */
+  if ((bytes - 8) > 4)
+    memcpy_8bytes(dst, src);
+  else
+    memcpy_4bytes(dst, src);
+
+ return;
+
+ L_mc_4:
+  /* Copy the 4 bytes ending at BYTES, then the 4-byte head; the two
+     overlap for sizes 5..7.  */
+  index = bytes - 4;
+  memcpy_4bytes (dst + index, src + index);
+
+  rem -= 4;
+  if (rem == 0)
+    return;
+
+  memcpy_4bytes(dst, src);
+}
+
+
+/* Fast paths for copies whose size is a multiple of 16 bytes.
+   Returns true when the copy was performed here, false when the
+   caller must fall back to the generic small-size path.  Only called
+   with 16 < BYTES < 512.  */
+static inline bool
+match_fixed_sizes (void * dst, void * src, size_t bytes)
+{
+ /* Sizes that are not 16-byte multiples take the generic path.  */
+ if (bytes & 15)
+   return false;
+
+ if (!(bytes & 127))
+   {
+    memcpy_128byte_multiple (dst, src, bytes);
+    return true;
+   }
+
+ if (!(bytes & 31))
+   {
+    memcpy_32byte_multiple (dst, src, bytes);
+    return true;
+   }
+
+ /* Only sizes congruent to 16 (mod 32) reach this switch: 48, 80,
+    112, 144, ...  The former "case 192" was unreachable dead code:
+    192 is a multiple of 32 and is already handled above.  */
+ switch (bytes)
+ {
+  case 48:
+           memcpy_32bytes (dst, src);
+           memcpy_16bytes (dst + 32, src + 32);
+           return true;
+  case 80:
+           _memcpy_fixed_simd (dst, src, 5);
+           return true;
+  default:
+           return false;
+ }
+}
+
+
+/* Copy 17..511 bytes that did not match a fixed-size fast path.
+   Requires BYTES > 16: the tail copy reads/writes [BYTES-16, BYTES).  */
+static inline void
+memcpy_small_sizes (void * dst, void * src, size_t bytes)
+{
+ if (bytes > 128)
+   {
+     memcpy_any_over_128bytes (dst, src, bytes);
+     return;
+   }
+ if (bytes > 32)
+   {
+     memcpy_any_over_32bytes (dst, src, bytes);
+     return;
+   }
+ /* 17..32 bytes: overlapping 16-byte head and tail copies.  */
+ memcpy_16bytes (dst, src);
+ memcpy_16bytes (dst + (bytes - 16), src + (bytes - 16));
+}
+
+
+/* Align *SRC up to a 16-byte boundary: copy one unaligned 16-byte
+   chunk, then advance *DST and *SRC by the 1..15 bytes needed to
+   align *SRC; the next chunk re-copies the overlap.  Only called when
+   *SRC is misaligned and *BYTES >= 512, so the 16-byte copy is in
+   bounds.  NOTE(review): *DST alignment is not considered, so the
+   stores in the main loop may remain unaligned.  */
+static inline void 
+memalign_simd (char ** dst, char ** src, size_t * bytes)
+{
+ size_t align_bytes = 16 - ((uint64_t)(*src) & 0xf);
+
+ /* size is greater than 16bytes, so perform SIMD copy of 16bytes */
+ vst1q_s32 ((int32_t *)(*dst), vld1q_s32((int32_t *)(*src)));
+
+ *src += align_bytes;
+ *dst += align_bytes;
+ *bytes -= align_bytes;
+}
+
+/* ThunderX2-tuned memcpy.  Copies BYTES bytes from SRC to DST
+   (regions must not overlap) and returns DST, as the ISO C memcpy
+   contract requires.  The IFUNC resolver in memcpy.c declares this
+   with __typeof (memcpy); the original "void" return with a
+   non-const SRC mismatched that prototype and left garbage in the
+   return register.  */
+void *
+__memcpy_thunderx2 (void * dst, const void * src, size_t bytes)
+{
+ void * d = dst;
+ void * s = (void *) src;	/* internal helpers take non-const pointers */
+
+ __builtin_prefetch (s, 0, 3);
+ __builtin_prefetch (d, 1, 3);
+
+ if (bytes <= 16)
+   {
+     memcpy_upto_16bytes (d, s, bytes);
+     return dst;
+   }
+
+ if (bytes < 512)
+   {
+     if (!match_fixed_sizes (d, s, bytes))
+       memcpy_small_sizes (d, s, bytes);
+
+     return dst;
+   }
+
+ /* Large copy: align the source to 16 bytes first so the LDP loads
+    in the bulk loops are aligned.  */
+ if (((uint64_t)(s) & 0xf))
+   {
+     memalign_simd ((char **)&d, (char **)&s, &bytes);
+   }
+
+ if (!(bytes & 0x7f))
+   { /* copy multiple of 128 bytes */
+     memcpy_128byte_multiple (d, s, bytes);
+     return dst;
+   }
+
+ /* handle odd sizes over 128 bytes */
+ memcpy_any_over_128bytes (d, s, bytes);
+ return dst;
+}
+
+/* Inline memcpy used by __memmove_thunderx2: same dispatch as
+   __memcpy_thunderx2 but without the initial prefetches.
+   NOTE(review): this duplicates the entry-point dispatch logic; the
+   two must be kept in sync by hand.  */
+static inline void 
+int_simd_memcpy (void * dst, void * src, size_t bytes)
+{
+ if (bytes <= 16)
+   {
+     memcpy_upto_16bytes (dst, src, bytes);
+     return;
+   }
+
+
+ if (bytes < 512)
+   {
+     if (!match_fixed_sizes (dst, src, bytes))
+       memcpy_small_sizes (dst, src, bytes);
+
+     return;
+   }
+
+ /* Align the source to 16 bytes before the bulk loops.  */
+ if (((uint64_t)(src) & 0xf))
+   {
+     memalign_simd ((char **)&dst, (char **)&src, &bytes);
+   }
+
+ if (!(bytes & 0x7f))
+ { /* multiple of 128 bytes */
+   memcpy_128byte_multiple (dst, src, bytes);
+   return;
+ }
+
+ /* handle odd sizes over 128 bytes */
+ memcpy_any_over_128bytes (dst, src, bytes);
+}
+
+/* Fixed-size move helpers.  Each copies its exact size in 32-byte
+   LDP/STP chunks, stored from the highest offset down to zero; each
+   chunk is fully loaded before it is stored.  That ordering makes
+   them safe for overlapping buffers when DST > SRC.
+   NOTE(review): when DST < SRC with an overlap smaller than the copy
+   size, the high-address stores clobber source bytes that have not
+   yet been loaded — confirm callers never hit that case.  The BYTES
+   parameter is unused; the size is implied by each function.  */
+
+static inline void 
+memmove_32byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t u, v;
+
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+
+/* Move exactly 64 bytes, highest 32-byte chunk first.  */
+static inline void
+memmove_64byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t u, v;
+ int32x4_t u1, v1;
+
+ _ldpq (u1, v1, src + 32);
+ _stpq (u1, v1, dst + 32);
+
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+
+/* Move exactly 96 bytes, highest 32-byte chunk first.  */
+static inline void 
+memmove_96byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t u, v;
+ int32x4_t u1, v1;
+ int32x4_t u2, v2;
+
+ _ldpq (u2, v2, src + 64);
+ _stpq (u2, v2, dst + 64);
+
+ _ldpq (u1, v1, src + 32);
+ _stpq (u1, v1, dst + 32);
+
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+
+}
+
+
+/* Move exactly 128 bytes, highest 32-byte chunk first.  */
+static inline void 
+memmove_128byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t u, v;
+ int32x4_t u1, v1;
+ int32x4_t u2, v2;
+ int32x4_t u3, v3;
+
+ _ldpq (u3, v3, src + 96);
+ _stpq (u3, v3, dst + 96);
+
+ _ldpq (u2, v2, src + 64);
+ _stpq (u2, v2, dst + 64);
+
+ _ldpq (u1, v1, src + 32);
+ _stpq (u1, v1, dst + 32);
+
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+
+}
+
+/* Move exactly 192 bytes.  The 32-byte chunks are stored from the
+   highest offset down to zero — the same load/store order as the
+   fully unrolled original, so overlap behavior is unchanged.  BYTES
+   is unused; the size is implied by the function.  */
+static inline void
+memmove_192byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t lo, hi;
+ size_t off = 160;
+
+ for (;;)
+   {
+     _ldpq (lo, hi, src + off);
+     _stpq (lo, hi, dst + off);
+     if (off == 0)
+       break;
+     off -= 32;
+   }
+}
+
+
+/* Move exactly 256 bytes, highest 32-byte chunk first, prefetching
+   one block past the source before starting.  */
+static inline void
+memmove_256byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t lo, hi;
+ size_t off = 224;
+
+ __builtin_prefetch (src + 256, 0, 3);
+
+ for (;;)
+   {
+     _ldpq (lo, hi, src + off);
+     _stpq (lo, hi, dst + off);
+     if (off == 0)
+       break;
+     off -= 32;
+   }
+}
+
+
+/* Move exactly 512 bytes, highest 32-byte chunk first, prefetching
+   one block past the start of the source.  */
+static inline void
+memmove_512byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t lo, hi;
+ size_t off = 480;
+
+ __builtin_prefetch (src + 256, 0, 3);
+
+ for (;;)
+   {
+     _ldpq (lo, hi, src + off);
+     _stpq (lo, hi, dst + off);
+     if (off == 0)
+       break;
+     off -= 32;
+   }
+}
+
+/* Move up to 16 bytes.  Unlike memcpy_upto_16bytes, every byte is
+   copied exactly once and the pieces are stored from the highest
+   address down, so no source byte is re-read after a store.
+
+   Fixes an out-of-bounds access in the original: for BYTES in 9..11
+   the fall-through into the 4-byte step computed
+   "index = bytes - (bc + 4)" = bytes - 12, which underflowed size_t
+   and read/wrote before both buffers.  The 4-byte step is now taken
+   only when at least 4 bytes remain.  */
+static inline void 
+memmove_upto_16bytes (void * dst, void * src, size_t bytes)
+{
+ if (bytes == 16)
+   {
+     memcpy_16bytes (dst, src);
+     return;
+   }
+
+ size_t rem = bytes;
+ size_t index;
+
+ if (bytes >= 8)
+   {
+     /* Copy the top 8 bytes, leaving [0, bytes - 8) to do.  */
+     index = bytes - 8;
+     memcpy_8bytes (dst + index, src + index);
+
+     rem -= 8;
+     if (rem == 0)
+       return;
+     if (rem >= 4)
+       goto L_mm_4;
+     goto L_mm_bytes;	/* 1..3 head bytes left (BYTES was 9..11) */
+   }
+
+ if (bytes == 4)
+   {
+     memcpy_4bytes (dst, src);
+     return;
+   }
+
+ if (bytes <= 3)
+   goto L_mm_bytes;	/* 0..3 bytes: byte-wise switch below */
+
+ /* BYTES is 5..7: fall through to the 4-byte step.  */
+ L_mm_4:
+  /* Copy the top 4 bytes of the remaining [0, rem) head.  */
+  index = rem - 4;
+  memcpy_4bytes (dst + index, src + index);
+
+  rem -= 4;
+  if (rem == 0)
+    return;
+
+ L_mm_bytes:
+ /* At most 3 bytes left; store them highest first.  */
+ switch (rem)
+ {
+   case 3:
+           ((char *)dst)[2] = ((char *)src)[2]; 
+           /* fallthrough */
+   case 2:
+           ((char *)dst)[1] = ((char *)src)[1]; 
+           /* fallthrough */
+   case 1:
+           ((char *)dst)[0] = ((char *)src)[0]; 
+ }
+} // memmove_upto_16bytes
+
+
+/* Fixed-size move fast paths.  Returns true when the move was
+   handled, false when the caller must take the generic path.  The
+   helpers load each 32-byte chunk before storing it and process
+   chunks from the top down.
+   NOTE(review): the claim that no overlap checking is needed because
+   "loads and stores are done separately" only holds within a single
+   32-byte chunk.  For DST < SRC with a partial overlap, the top-down
+   stores can clobber source bytes before they are loaded — confirm
+   callers restrict this to non-overlapping or DST > SRC moves.  */
+static inline bool 
+memmove_small (void * dst, void * src, size_t bytes)
+{
+ if (bytes <= 16)
+   {
+     memmove_upto_16bytes (dst, src, bytes);
+     return true;
+   }
+
+ if (bytes & 0x1f)
+   return false; /* there will be no match in switch table below, if not divisible by 32 */
+
+ switch (bytes)
+ {
+   case 32:
+            memmove_32byte (dst, src, bytes);
+            return true;
+   case 64:
+            memmove_64byte (dst, src, bytes);
+            return true;
+   case 96:
+            memmove_96byte (dst, src, bytes);
+            return true;
+   case 128:
+            memmove_128byte (dst, src, bytes);
+            return true;
+   case 256:
+            memmove_256byte (dst, src, bytes);
+            return true;
+   case 512:
+            memmove_512byte (dst, src, bytes);
+            return true;
+   default:
+            return false;
+ }
+}
+
+
+/* ThunderX2-tuned memmove.  Returns DST, as the ISO C memmove
+   contract requires (the IFUNC resolver declares this with
+   __typeof (memmove); the original void return left garbage in the
+   return register).
+
+   Overlap handling: the buffers are copied in chunks no larger than
+   the distance between DST and SRC — high-to-low when DST > SRC,
+   low-to-high when DST < SRC — so every individual chunk copy is
+   non-overlapping and may safely use the memcpy helpers.  (The
+   original overlap path for DST > SRC copied from
+   "src + (offset << 1)", which is the same address as
+   "dst + offset": a region copied onto itself, leaving the tail of
+   DST never written.)  */
+void *
+__memmove_thunderx2 (void * dst, const void * src, size_t bytes)
+{
+ char * d = dst;
+ char * s = (char *) src;	/* internal helpers take non-const pointers */
+
+ if (d == s || bytes == 0)
+   return dst;
+
+ __builtin_prefetch (s, 0, 3);
+
+ uint64_t offset = (d > s) ? (uint64_t) (d - s) : (uint64_t) (s - d);
+
+ if (bytes <= offset)
+   {
+     /* The regions do not overlap: plain memcpy semantics apply.  */
+     if (!memmove_small (d, s, bytes))
+       int_simd_memcpy (d, s, bytes);
+     return dst;
+   }
+
+ if (d > s)
+   {
+     /* Copy backwards in non-overlapping chunks of at most OFFSET.  */
+     size_t rem = bytes;
+     while (rem > 0)
+       {
+	 size_t chunk = rem > offset ? (size_t) offset : rem;
+	 rem -= chunk;
+	 int_simd_memcpy (d + rem, s + rem, chunk);
+       }
+   }
+ else
+   {
+     /* Copy forwards in non-overlapping chunks of at most OFFSET.  */
+     size_t done = 0;
+     while (done < bytes)
+       {
+	 size_t chunk = (bytes - done) > offset ? (size_t) offset
+						: bytes - done;
+	 int_simd_memcpy (d + done, s + done, chunk);
+	 done += chunk;
+       }
+   }
+
+ return dst;
+} // end function: __memmove_thunderx2
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 34c6b29..5692230 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,10 +29,15 @@ 
 extern __typeof (__redirect_memmove) __libc_memmove;
 
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 
 libc_ifunc (__libc_memmove,
-            IS_THUNDERX (midr) ? __memmove_thunderx : __memmove_generic);
+            IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+            ? __memmove_thunderx2 :
+               IS_THUNDERX (midr)
+               ? __memmove_thunderx
+            : __memmove_generic);
 
 # undef memmove
 strong_alias (__libc_memmove, memmove);
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index c92b650..45ddce6 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -40,6 +40,10 @@ 
 
#define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C'	\
			   && MIDR_PARTNUM(midr) == 0x0a1)
+/* ThunderX2 "Pass A": implementer code 'B', part 0x516 — presumably
+   the Broadcom (Vulcan) branded silicon; verify against the vendor's
+   MIDR_EL1 documentation.  */
+#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B'     \
+                           && MIDR_PARTNUM(midr) == 0x516)
+/* Cavium-branded ThunderX2: implementer 'C', part 0xaf.  */
+#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C'       \
+                           && MIDR_PARTNUM(midr) == 0xaf)