========================================================================================================================
@@ -1,3 +1,4 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2
+CFLAGS-memcpy_thunderx2.c += -O3 -funroll-loops
endif
@@ -25,11 +25,11 @@
#include <stdio.h>
/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 2
+#define MAX_IFUNC 3
size_t
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- size_t max)
+ size_t max)
{
assert (max >= MAX_IFUNC);
@@ -39,13 +39,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
IFUNC_IMPL (i, name, memcpy,
- IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX (midr),
- __memcpy_thunderx)
- IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
- IFUNC_IMPL_ADD (array, i, memmove, IS_THUNDERX (midr),
- __memmove_thunderx)
- IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx2)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
return i;
}
@@ -29,10 +29,15 @@
extern __typeof (__redirect_memcpy) __libc_memcpy;
extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
libc_ifunc (__libc_memcpy,
- IS_THUNDERX (midr) ? __memcpy_thunderx : __memcpy_generic);
+ IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+ ? __memcpy_thunderx2 :
+ IS_THUNDERX (midr)
+ ? __memcpy_thunderx
+ : __memcpy_generic);
# undef memcpy
strong_alias (__libc_memcpy, memcpy);
@@ -0,0 +1,380 @@
+/* Multiple versions of memcpy and memmove. AARCH64 version.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <arm_neon.h>
+
+extern void *
+simd_memcpy (void * dst, const void * src, size_t bytes);
+
+extern void *
+simd_memmove (void * dst, const void * src, size_t bytes);
+
+#define _ldpq(x, y, p) \
+ asm ("ldp %q0, %q1, [%2]" : "=w" ((x)), "=w" ((y)) : "r" ((p)))
+
+#define _stpq(x, y, p) \
+ asm ("stp %q0, %q1, [%2]" :: "w" ((x)), "w" ((y)), "r" ((p)) : "memory")
+
+#define _ldpr(x, y, p) \
+ asm ("ldp %x0, %x1, [%2]" : "=r" ((x)), "=r" ((y)) : "r" ((p)))
+
+#define _stpr(x, y, p) \
+ asm ("stp %x0, %x1, [%2]" :: "r" ((x)), "r" ((y)), "r" ((p)) : "memory")
+
+#define _ldp(x, y, p) \
+ asm ("ldp %w0, %w1, [%2]" : "=r" ((x)), "=r" ((y)) : "r" ((p)))
+
+#define _stp(x, y, p) \
+ asm ("stp %w0, %w1, [%2]" :: "r" ((x)), "r" ((y)), "r" ((p)) : "memory")
+
+#define _ld(x, p) \
+ asm ("ldr %w0, [%1]" : "=r" ((x)) : "r" ((p)))
+
+#define _st(x, p) \
+ asm ("str %w0, [%1]" :: "r" ((x)), "r" ((p)) : "memory")
+
+
+/* Copy 64 bytes using ldp/stp */
+static inline void
+memcpy_64bytes (void * dst, const void * src)
+{
+ int32x4_t u0, v0, u1, v1;
+ _ldpq (u0, v0, src);
+ _stpq (u0, v0, dst);
+ _ldpq (u1, v1, (src + 32));
+ _stpq (u1, v1, (dst + 32));
+}
+
+/* Copy 32 bytes using ldp/stp. */
+static inline void
+memcpy_32bytes (void * dst, const void * src)
+{
+ int32x4_t u, v;
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+/* Copy 16bytes using ldr/str. */
+static inline void
+memcpy_16bytes (void * dst, const void * src)
+{
+ vst1q_s32 (dst, vld1q_s32(src));
+}
+
+/* Copy 8bytes using ldp/stp. */
+static inline void
+memcpy_8bytes (void * dst, const void * src)
+{
+ int32_t u, v;
+ _ldp (u, v, src);
+ _stp (u, v, dst);
+}
+
+/* Copy 4bytes using ldr/src */
+static inline void
+memcpy_4bytes (void * dst, const void * src)
+{
+ int32_t u;
+ _ld (u, src);
+ _st (u, dst);
+}
+
+static inline
+void copy_fwd_bytes (char * dst, const char * src, size_t sz)
+{
+ while (sz--)
+ *dst++ = *src++;
+}
+
+static inline
+void copy_reverse_bytes (char * dst, const char *src, size_t sz)
+{
+ dst += sz;
+ src += sz;
+ while (sz--)
+ *--dst = *--src;
+}
+
+/* Copy up to 16 bytes */
+static inline void
+memcpy_upto_16bytes (void * dst, const void * src, size_t bytes)
+{
+ if (bytes == 16)
+ {
+ memcpy_16bytes (dst, src);
+ return;
+ }
+ size_t cnt = bytes;
+ if (cnt >= 8)
+ {
+ memcpy_8bytes (dst, src);
+ cnt -= 8;
+ if (cnt == 0)
+ return;
+ memcpy_8bytes (dst + (bytes - 8), src + (bytes - 8));
+ return;
+ }
+ if (cnt >= 4)
+ {
+ memcpy_4bytes (dst, src);
+ cnt -= 4;
+ if (cnt == 0)
+ return;
+ memcpy_4bytes (dst + (bytes - 4), src + (bytes - 4));
+ return;
+ }
+
+ size_t tmp = cnt >> 1;
+
+ *(char *)dst = *(char *)src;
+ *((char *)dst + tmp) = *((char *)src + tmp);
+ *((char *)dst + (bytes - 1)) = *((char *)src + (bytes -1));
+}
+
+static inline void
+memcpy_upto_96bytes (void * dst, const void * src, size_t bytes)
+{
+ /* Assumption: copy size is greater than 16 bytes */
+ size_t rem;
+ size_t i;
+
+ if (!(bytes & 31))
+ {
+ for (i=0; i < (bytes & ~31); i+=32)
+ memcpy_32bytes (dst + i, src + i);
+
+ return;
+ }
+
+ if (bytes > 32)
+ {
+ for (i=0; i < (bytes & ~31); i+=32)
+ memcpy_32bytes (dst + i, src + i);
+
+ memcpy_32bytes (dst + (bytes - 32), src + (bytes - 32));
+
+ return;
+ }
+
+ rem = bytes & 15;
+ for (i=0; i < (bytes & ~15); i+=16)
+ memcpy_16bytes (dst + i, src + i);
+
+ if (rem == 0)
+ return;
+
+ memcpy_16bytes (dst + (bytes - 16), src + (bytes - 16));
+}
+
+static inline void
+memcpy_long (void * dst, const void * src, size_t bytes)
+{
+ if (bytes >= 512 && ((size_t)src & 15))
+ {
+ size_t align_bytes = 16 - ((size_t)src & 15);
+ memcpy_16bytes (dst, src);
+ bytes -= align_bytes;
+ src += align_bytes;
+ dst += align_bytes;
+ }
+
+ size_t i;
+ size_t cnt = bytes & ~63;
+
+ for (i=0; i < cnt; i+=64)
+ memcpy_64bytes (dst + i, src + i);
+
+ if (bytes == cnt)
+ return;
+
+ memcpy_64bytes (dst + (bytes - 64), src + (bytes - 64));
+}
+
+static inline
+void * simd_copy_fwd (void * dst, const void * src, size_t sz)
+{
+
+ void * dst_orig = dst;
+ size_t rem = sz;
+ size_t cnt;
+
+ if (sz >= 512 && ((size_t)src & 15))
+ {
+ size_t align_bytes = 16 - ((size_t)src & 15);
+ copy_fwd_bytes (dst, src, align_bytes);
+ rem -= align_bytes;
+ src += align_bytes;
+ dst += align_bytes;
+ }
+
+ size_t N_16bytes = rem >> 4;
+
+ if (N_16bytes)
+ {
+ for (cnt = 0; cnt < (N_16bytes << 4); cnt+=16)
+ memcpy_16bytes (dst + cnt, src + cnt);
+
+ rem -= (N_16bytes << 4);
+ if (rem == 0)
+ return dst_orig;
+ dst += (N_16bytes << 4);
+ src += (N_16bytes << 4);
+ }
+
+ size_t N_8bytes = rem >> 3;
+
+ if (N_8bytes)
+ {
+ for (cnt = 0; cnt < (N_8bytes << 3); cnt+=8)
+ memcpy_8bytes (dst + cnt, src + cnt);
+
+ rem -= (N_8bytes << 3);
+ if (rem == 0)
+ return dst_orig;
+ dst += (N_8bytes << 3);
+ src += (N_8bytes << 3);
+ }
+
+ size_t N_4bytes = rem >> 2;
+
+ if (N_4bytes)
+ {
+ for (cnt = 0; cnt < (N_4bytes << 2); cnt+=4)
+ memcpy_4bytes (dst + cnt, src + cnt);
+
+ rem -= (N_4bytes << 2);
+ if (rem == 0)
+ return dst_orig;
+ dst += (N_4bytes << 2);
+ src += (N_4bytes << 2);
+ }
+
+ copy_fwd_bytes (dst, src, rem);
+ return dst_orig;
+}
+
+static inline
+void *
+simd_copy_bak (void * dst, const void * src, size_t sz)
+{
+ void * dst_orig = dst;
+
+ size_t rem = sz;
+ size_t j, cnt;
+
+ if (rem <= 4)
+ goto L_le_4b;
+
+
+ if (rem >> 4) /* there are at least 16 bytes to move */
+ {
+ if (sz >= 512)
+ {
+ size_t align_bytes = ((uint64_t)src + rem) & 15;
+ if (align_bytes)
+ {
+ size_t i;
+ char * dstend = dst + (rem - 1);
+ const char * srcend = src + (rem - 1);
+
+ for (i=0; i < align_bytes; i++)
+ *(dstend - i) = *(srcend - i);
+
+ rem -= align_bytes;
+ }
+ }
+
+ char * dstend = dst + rem;
+ const char * srcend = src + rem;
+
+ for (j=16, cnt=0; cnt < (rem & ~15); j+=16, cnt+=16)
+ memcpy_16bytes (dstend - j, srcend - j);
+
+ rem -= (rem & ~15);
+ if (rem == 0)
+ return dst_orig;
+ }
+
+ if (rem >> 3) /* there are at most 15 bytes to move */
+ {
+ memcpy_8bytes (dst + (rem - 8), src + (rem - 8));
+ rem -= 8;
+ if (rem == 0)
+ return dst_orig;
+ }
+
+L_le_4b:
+
+ if (rem >> 2) /* there are at most 7 bytes to move */
+ {
+ memcpy_4bytes (dst + (rem - 4), src + (rem - 4));
+ rem -= 4;
+ if (rem == 0)
+ return dst_orig;
+ }
+
+ /* copy less than 4 bytes */
+ copy_reverse_bytes (dst, src, rem);
+
+ return dst;
+}
+
+
+/* glibc memcpy function follows */
+void *
+__memcpy_thunderx2 (void * dst, const void * src, size_t bytes)
+{
+ if (dst == src || bytes == 0)
+ return dst;
+
+ if (bytes > 96)
+ {
+ memcpy_long (dst, src, bytes);
+ return dst;
+ }
+
+ if (bytes <= 16)
+ {
+ memcpy_upto_16bytes (dst, src, bytes);
+ return dst;
+ }
+
+ if (bytes <= 96)
+ {
+ memcpy_upto_96bytes (dst, src, bytes);
+ return dst;
+ }
+ return dst;
+}
+
+
+void *
+__memmove_thunderx2 (void * dst, const void * src, size_t bytes)
+{
+ if (dst == src || bytes == 0)
+ return dst;
+
+ if (src < dst)
+ return simd_copy_bak (dst, src, bytes);
+ else
+ return simd_copy_fwd (dst, src, bytes);
+}
@@ -29,10 +29,15 @@
extern __typeof (__redirect_memmove) __libc_memmove;
extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
libc_ifunc (__libc_memmove,
- IS_THUNDERX (midr) ? __memmove_thunderx : __memmove_generic);
+ IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+ ? __memmove_thunderx2 :
+ IS_THUNDERX (midr)
+ ? __memmove_thunderx
+ : __memmove_generic);
# undef memmove
strong_alias (__libc_memmove, memmove);
@@ -40,6 +40,10 @@
#define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \
&& MIDR_PARTNUM(midr) == 0x0a1)
+#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B' \
+ && MIDR_PARTNUM(midr) == 0x516)
+#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \
+ && MIDR_PARTNUM(midr) == 0xaf)