@@ -1,6 +1,6 @@
ifeq ($(subdir),string)
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
- memcpy_falkor memmove_falkor \
+ memcpy_falkor memcpy_kunpeng memmove_falkor \
memset_generic memset_falkor memset_emag \
memchr_generic memchr_nosimd \
strlen_generic strlen_asimd
@@ -25,7 +25,7 @@
#include <stdio.h>
/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 4
+#define MAX_IFUNC 5
size_t
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -42,11 +42,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_kunpeng)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx2)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_kunpeng)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
IFUNC_IMPL (i, name, memset,
/* Enable this on non-falkor processors too so that other cores
@@ -32,15 +32,18 @@ extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_kunpeng attribute_hidden;
libc_ifunc (__libc_memcpy,
- (IS_THUNDERX (midr)
- ? __memcpy_thunderx
- : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_ARES (midr)
- ? __memcpy_falkor
- : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
- ? __memcpy_thunderx2
- : __memcpy_generic))));
+ (IS_KUNPENG (midr) /* Tested first; the remaining chain is the previous selection order, unchanged.  */
+ ? __memcpy_kunpeng
+ : (IS_THUNDERX (midr)
+ ? __memcpy_thunderx
+ : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_ARES (midr)
+ ? __memcpy_falkor
+ : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+ ? __memcpy_thunderx2
+ : __memcpy_generic)))));
# undef memcpy
strong_alias (__libc_memcpy, memcpy);
new file mode 100644
@@ -0,0 +1,445 @@
+/* Optimized memcpy and memmove for Huawei Kunpeng processor.
+ Copyright (C) 2018-2019 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin x0 /* Argument 0 (AAPCS64): destination pointer.  */
+#define src x1 /* Argument 1: source pointer.  */
+#define count x2 /* Argument 2: number of bytes.  */
+#define dst x3 /* Working destination pointer.  */
+#define srcend x4 /* Initialised as src + count.  */
+#define dstend x5 /* Initialised as dstin + count.  */
+#define A_l x6 /* A_l..D_h: 64-bit data temporaries.  */
+#define A_lw w6 /* 32-bit view of A_l.  */
+#define A_h x7
+#define A_hw w7 /* 32-bit view of A_h.  */
+#define B_l x8
+#define B_lw w8 /* 32-bit view of B_l.  */
+#define B_h x9
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src /* E_l/E_h reuse src and count; only L(last64) uses them.  */
+#define E_h count
+#define F_l srcend /* F_l, F_h, G_l and G_h are not referenced in this file.  */
+#define F_h dst
+#define G_l count
+#define G_h dst
+#define tmp1 x14 /* Scratch: pointer difference or alignment offset.  */
+
+#define A_q q0 /* A_q..I_q: 128-bit SIMD data temporaries.  */
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+#define I_q q16
+#define J_q q17 /* Not referenced in this file.  */
+
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#if IS_IN (libc)
+#undef MEMCPY
+#define MEMCPY __memcpy_kunpeng
+#undef MEMMOVE
+#define MEMMOVE __memmove_kunpeng
+
+
+/* Forward-overlapping moves, i.e. (unsigned) dstin - src < count, of more
+ than 96 bytes are copied backwards: L(move_long) handles more than 512
+ bytes and L(move_middle) handles 97..512 bytes. Every other move falls
+ through into the memcpy entry point below. */
+
+ENTRY_ALIGN (MEMMOVE, 6)
+
+ DELOUSE (0) /* DELOUSE is defined outside this file; applied to arguments 0..2.  */
+ DELOUSE (1)
+ DELOUSE (2)
+
+ sub tmp1, dstin, src /* tmp1 = dstin - src.  */
+ cmp count, 512
+ ccmp tmp1, count, 2, hi /* If count > 512 compare tmp1 with count; otherwise set only C so b.lo fails.  */
+ b.lo L(move_long) /* count > 512 and tmp1 < count (unsigned).  */
+ cmp count, 96
+ ccmp tmp1, count, 2, hi /* Same test with the 96-byte threshold.  */
+ b.lo L(move_middle) /* 96 < count <= 512 and tmp1 < count (unsigned).  */
+
+END (MEMMOVE) /* No ret above: execution continues into MEMCPY below.  */
+libc_hidden_builtin_def (MEMMOVE)
+
+
+/* Copies are split into 4 main cases: small copies of up to 16 bytes,
+ medium copies of 17..96 bytes which are fully unrolled. Long copies
+ of 97..1024 align dst address without prefetching. Large copies
+ of more than 1024 bytes align the destination with prefetching.
+*/
+
+#define MEMCPY_PREFETCH_LDR 640
+
+ .p2align 4
+ENTRY (MEMCPY)
+
+ DELOUSE (0) /* DELOUSE is defined outside this file; applied to arguments 0..2.  */
+ DELOUSE (1)
+ DELOUSE (2)
+
+ add srcend, src, count /* srcend = one past the last source byte.  */
+ cmp count, 16
+ b.ls L(memcopy16) /* 0..16 bytes.  */
+ add dstend, dstin, count /* dstend = one past the last destination byte.  */
+ cmp count, 96
+ b.hi L(copy_long) /* More than 96 bytes.  */
+
+ /* Medium copies: 17..96 bytes. */
+ ldr A_q, [src], #16 /* A_q = first 16 bytes; src advances by 16.  */
+ and tmp1, src, 15 /* Misalignment of src; consumed by L(memcpy_copy96).  */
+ ldr E_q, [srcend, -16] /* E_q = last 16 bytes.  */
+ cmp count, 64
+ b.gt L(memcpy_copy96) /* 65..96 bytes.  */
+ cmp count, 48
+ b.le L(bytes_17_to_48)
+ /* 49..64 bytes */
+ ldp B_q, C_q, [src] /* Bytes 16..47; every load is done before the first store.  */
+ str E_q, [dstend, -16]
+ stp A_q, B_q, [dstin]
+ str C_q, [dstin, 32]
+ ret
+
+L(bytes_17_to_48):
+ /* 17..48 bytes*/
+ cmp count, 32
+ b.gt L(bytes_32_to_48)
+ /* 17..32 bytes*/
+ str A_q, [dstin] /* First and last 16 bytes may overlap each other.  */
+ str E_q, [dstend, -16]
+ ret
+
+L(bytes_32_to_48):
+ /* 33..48 bytes */
+ ldr B_q, [src] /* Bytes 16..31.  */
+ str A_q, [dstin]
+ str E_q, [dstend, -16]
+ str B_q, [dstin, 16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(memcopy16):
+ cmp count, 8
+ b.lo L(bytes_0_to_8) /* Fewer than 8 bytes.  */
+ ldr A_l, [src] /* 8..16 bytes: first 8 and last 8 bytes, possibly overlapping.  */
+ ldr A_h, [srcend, -8]
+ add dstend, dstin, count /* dstend has not been set on this path yet.  */
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+
+L(bytes_0_to_8):
+ tbz count, 2, L(bytes_0_to_3) /* Bit 2 clear with count < 8 means 0..3 bytes.  */
+ ldr A_lw, [src] /* 4..7 bytes: first 4 and last 4 bytes, possibly overlapping.  */
+ ldr A_hw, [srcend, -4]
+ add dstend, dstin, count
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
+L(bytes_0_to_3):
+ cbz count, 1f /* Nothing to copy.  */
+ lsr tmp1, count, 1 /* tmp1 = count / 2: index of the middle byte.  */
+ ldrb A_lw, [src] /* First byte.  */
+ ldrb A_hw, [srcend, -1] /* Last byte.  */
+ add dstend, dstin, count
+ ldrb B_lw, [src, tmp1] /* Middle byte.  */
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+ strb A_lw, [dstin]
+1:
+ ret
+
+ .p2align 4
+L(memcpy_copy96):
+ /* Copying 65..96 bytes. A_q (first 16 bytes) and
+ E_q (last 16 bytes) are already loaded. The size
+ is large enough to benefit from aligned loads. */
+ bic src, src, 15 /* Round the already advanced src down to 16 bytes.  */
+ ldp B_q, C_q, [src]
+ /* Loaded 64 bytes so far; the second 16-byte chunk (B_q) can
+ overlap the first chunk (A_q) by tmp1 bytes.
+ Nothing has been stored yet. */
+ sub dst, dstin, tmp1 /* [dst, 16] is the destination that matches B_q.  */
+ add count, count, tmp1
+ /* The range of count being [65..96] becomes [65..111]
+ after tmp1 [0..15] gets added to it;
+ count now is <bytes-left-to-load>+48. */
+ cmp count, 80
+ b.gt L(copy96_medium)
+ ldr D_q, [src, 32] /* One more aligned chunk is enough.  */
+ stp B_q, C_q, [dst, 16]
+ str D_q, [dst, 48]
+ str A_q, [dstin] /* Unaligned head and tail go last.  */
+ str E_q, [dstend, -16]
+ ret
+
+ .p2align 4
+L(copy96_medium):
+ ldp D_q, G_q, [src, 32] /* Two more aligned chunks.  */
+ cmp count, 96
+ b.gt L(copy96_large)
+ stp B_q, C_q, [dst, 16]
+ stp D_q, G_q, [dst, 48]
+ str A_q, [dstin]
+ str E_q, [dstend, -16]
+ ret
+
+L(copy96_large):
+ ldr F_q, [src, 64] /* Fifth aligned chunk.  */
+ str B_q, [dst, 16]
+ stp C_q, D_q, [dst, 32]
+ stp G_q, F_q, [dst, 64] /* G_q was loaded from [src, 48], F_q from [src, 64].  */
+ str A_q, [dstin]
+ str E_q, [dstend, -16]
+ ret
+
+ /* Long copies: 97..1024 bytes. */
+ .p2align 4
+L(copy_long):
+ cmp count, 1024
+ b.gt L(copy_large) /* Signed compare; more than 1024 bytes.  */
+ and tmp1, dstin, 15 /* tmp1 = misalignment of the destination.  */
+ bic dst, dstin, 15 /* dst = destination rounded down to 16 bytes.  */
+ ldp D_l, D_h, [src] /* First 16 bytes, stored unaligned to dstin below.  */
+ sub src, src, tmp1 /* Move src back by the same amount as dst.  */
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]! /* Pre-index: src advances by 64.  */
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(last64)
+
+L(loop64):
+ stp A_l, A_h, [dst, 16] /* Each store is paired with the load that refills the register.  */
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]! /* Pre-index: dst advances by 64.  */
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the end even if
+ there is just 1 byte left. */
+L(last64):
+ ldp E_l, E_h, [srcend, -64] /* E_l/E_h alias src/count, which are dead from here on.  */
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ /* Large copies: more than 1024 bytes. */
+ .p2align 4
+L(copy_large):
+ ldr A_q, [src] /* First 16 bytes, stored unaligned to dstin below.  */
+ and tmp1, dstin, 15 /* tmp1 = misalignment of the destination.  */
+ sub src, src, tmp1
+ add count, count, tmp1
+ add src, src, 16 /* src now matches the 16-byte aligned dst computed below.  */
+ ldp B_q, C_q, [src], #32
+ ldp D_q, E_q, [src], #32
+ str A_q, [dstin]
+ add dst, dstin, 16
+ bic dst, dst, 15 /* dst = first 16-byte boundary above dstin.  */
+
+ /* Already loaded 64+16 bytes. Bias count so that the prefetching
+ loop below stops once the subs at its end makes count negative. */
+ sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32 +128 + 16
+
+ .p2align 4
+L(loop128_prefetch): /* Entered unconditionally; loads and stores 128 bytes per pass.  */
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] /* Prefetch hint MEMCPY_PREFETCH_LDR bytes ahead of src.  */
+ ldp F_q, G_q, [src], #32
+ stp B_q, C_q, [dst], #32
+ ldp H_q, I_q, [src], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp B_q, C_q, [src], #32
+ stp D_q, E_q, [dst], #32
+ ldp D_q, E_q, [src], #32
+ stp F_q, G_q, [dst], #32
+ stp H_q, I_q, [dst], #32
+ subs count, count, 128
+ b.ge L(loop128_prefetch)
+
+ add count, count, MEMCPY_PREFETCH_LDR + 64 + 32 /* Undo the prefetch part of the bias: count = bytes not yet loaded - 64.  */
+ .p2align 4
+L(loop128): /* On entry B_q..E_q hold 64 loaded but unstored bytes.  */
+ stp B_q, C_q, [dst], #32
+ ldp F_q, G_q, [src], #32
+ stp D_q, E_q, [dst], #32
+ ldp H_q, I_q, [src], #32
+ subs count, count, 64
+ b.lt L(loop128_exit1) /* Leave with F_q..I_q still to be stored.  */
+ stp F_q, G_q, [dst], #32
+ ldp B_q, C_q, [src], #32
+ stp H_q, I_q, [dst], #32
+ ldp D_q, E_q, [src], #32
+ subs count, count, 64
+ b.ge L(loop128) /* Otherwise fall through with B_q..E_q still to be stored.  */
+L(loop128_exit0): /* B_q..E_q hold 64 unstored bytes for [dst, dst+64); F_q..I_q are free.  */
+ ldp F_q, G_q, [srcend, -64] /* Load the final 64 source bytes before any store, so an overlapping destination cannot clobber them first.  */
+ ldp H_q, I_q, [srcend, -32]
+ stp B_q, C_q, [dst], #32
+ stp D_q, E_q, [dst]
+ stp F_q, G_q, [dstend, -64] /* The tail may overlap the 64 bytes just stored.  */
+ stp H_q, I_q, [dstend, -32]
+ ret
+L(loop128_exit1): /* F_q..I_q hold 64 unstored bytes for [dst, dst+64); B_q..E_q are free.  */
+ ldp B_q, C_q, [srcend, -64] /* Same scheme as exit0: tail loads first.  */
+ ldp D_q, E_q, [srcend, -32]
+ stp F_q, G_q, [dst], #32 /* F_q/G_q go to dst, then dst advances by 32 ...  */
+ stp H_q, I_q, [dst] /* ... so H_q/I_q land 32 bytes above, not on top of F_q/G_q.  */
+ stp B_q, C_q, [dstend, -64]
+ stp D_q, E_q, [dstend, -32]
+ ret
+
+ /* Long move: more than 512 bytes, copied backwards with dstend aligned down to 16 bytes. */
+ .p2align 4
+L(move_long):
+1:
+ cbz tmp1, 3f /* dstin == src: nothing to do.  */
+ add srcend, src, count
+ add dstend, dstin, count
+
+ and tmp1, dstend, 15 /* tmp1 = misalignment of the destination end.  */
+ ldr D_q, [srcend, -16] /* Last 16 bytes, stored unaligned below.  */
+ sub srcend, srcend, tmp1 /* Step srcend back by the same amount as dstend.  */
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]! /* Pre-index: srcend moves down by 64.  */
+ sub dstend, dstend, tmp1 /* dstend is now 16-byte aligned.  */
+ subs count, count, 128
+ b.ls 2f
+
+1:
+ subs count, count, 64 /* The loads and stores below leave the flags untouched for b.hi.  */
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dstend, -64]! /* Pre-index: dstend moves down by 64.  */
+ ldp C_q, D_q, [srcend, -64]!
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
+2:
+ ldp E_q, F_q, [src, 32] /* Load the first 64 source bytes before the final stores.  */
+ ldp G_q, H_q, [src]
+ stp A_q, B_q, [dstend, -32]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp G_q, H_q, [dstin]
+3: ret
+
+ /* Middle move: 97..512 bytes, copied backwards. */
+ .p2align 4
+L(move_middle):
+ cbz tmp1, 3f /* dstin == src: nothing to do.  */
+ add srcend, src, count
+ prfm PLDL1STRM, [srcend, -64] /* Prefetch hint for the last 64 source bytes.  */
+ add dstend, dstin, count
+ and tmp1, dstend, 15 /* tmp1 = misalignment of the destination end.  */
+ ldr D_q, [srcend, -16] /* Last 16 bytes, stored unaligned below.  */
+ sub srcend, srcend, tmp1 /* Step srcend back by the same amount as dstend.  */
+ sub count, count, tmp1
+ ldr A_q, [srcend, -16]
+ str D_q, [dstend, -16]
+ ldr B_q, [srcend, -32]
+ ldr C_q, [srcend, -48]
+ ldr D_q, [srcend, -64]! /* Pre-index: srcend moves down by 64.  */
+ sub dstend, dstend, tmp1 /* dstend is now 16-byte aligned.  */
+ subs count, count, 128
+ b.ls 2f
+
+1:
+ str A_q, [dstend, -16] /* Each store is paired with the load that refills the register.  */
+ ldr A_q, [srcend, -16]
+ str B_q, [dstend, -32]
+ ldr B_q, [srcend, -32]
+ str C_q, [dstend, -48]
+ ldr C_q, [srcend, -48]
+ str D_q, [dstend, -64]! /* Pre-index: dstend moves down by 64.  */
+ ldr D_q, [srcend, -64]!
+ subs count, count, 64
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
+2:
+ ldr G_q, [src, 48] /* Head loads are interleaved with the stores of the pending data.  */
+ str A_q, [dstend, -16]
+ ldr A_q, [src, 32]
+ str B_q, [dstend, -32]
+ ldr B_q, [src, 16]
+ str C_q, [dstend, -48]
+ ldr C_q, [src]
+ str D_q, [dstend, -64]
+ str G_q, [dstin, 48] /* First 64 bytes of the destination.  */
+ str A_q, [dstin, 32]
+ str B_q, [dstin, 16]
+ str C_q, [dstin]
+3: ret
+
+
+END (MEMCPY)
+ .section .rodata /* NOTE(review): no data visibly follows in .rodata; this directive and the alignment below look removable - confirm.  */
+ .p2align 4
+
+libc_hidden_builtin_def (MEMCPY)
+#endif /* IS_IN (libc) */
@@ -32,15 +32,18 @@ extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_kunpeng attribute_hidden;
libc_ifunc (__libc_memmove,
- (IS_THUNDERX (midr)
- ? __memmove_thunderx
- : (IS_FALKOR (midr) || IS_PHECDA (midr)
- ? __memmove_falkor
- : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
- ? __memmove_thunderx2
- : __memmove_generic))));
+ (IS_KUNPENG (midr) /* Tested first; the remaining chain is the previous selection order, unchanged.  */
+ ? __memmove_kunpeng
+ : (IS_THUNDERX (midr)
+ ? __memmove_thunderx
+ : (IS_FALKOR (midr) || IS_PHECDA (midr)
+ ? __memmove_falkor
+ : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+ ? __memmove_thunderx2
+ : __memmove_generic)))));
# undef memmove
strong_alias (__libc_memmove, memmove);