@@ -1,5 +1,21 @@
2014-04-04 Ling Ma <ling.ml@alibaba-inc.com>
+ * sysdeps/x86_64/multiarch/Makefile: Add AVX memcpy/mempcpy/memmove.
+ * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add entries for the
+ AVX memcpy/mempcpy/memmove implementations.
+ * sysdeps/x86_64/multiarch/memcpy.S: Add support for AVX memcpy.
+ * sysdeps/x86_64/multiarch/memcpy_chk.S: Add support for AVX memcpy_chk.
+ * sysdeps/x86_64/multiarch/memmove.c: Add support for AVX memmove.
+ * sysdeps/x86_64/multiarch/memmove_chk.c: Add support for AVX memmove_chk.
+ * sysdeps/x86_64/multiarch/mempcpy.S: Add support for AVX mempcpy.
+ * sysdeps/x86_64/multiarch/mempcpy_chk.S: Add support for AVX mempcpy_chk.
+ * sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: New file for AVX memcpy.
+ * sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S: New file for AVX
+ mempcpy.
+ * sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: New file for AVX
+ memmove.
+
+2014-04-04 Ling Ma <ling.ml@alibaba-inc.com>
+
* sysdeps/x86_64/multiarch/Makefile: Add memset-avx2
* sysdeps/x86_64/multiarch/memset-avx2.S: New file for AVX2 memset
* sysdeps/x86_64/multiarch/memset.S: New file for multiple memset
@@ -11,6 +11,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcmp-sse4 memcpy-ssse3 \
memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
+ memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
memmove-ssse3-back strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -46,6 +46,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
IFUNC_IMPL (i, name, __memmove_chk,
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
+ __memmove_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
@@ -55,6 +57,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memmove.S. */
IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
+ __memmove_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
@@ -201,6 +205,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
IFUNC_IMPL (i, name, __memcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
+ __memcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
@@ -210,6 +216,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memcpy.S. */
IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
+ __memcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
@@ -218,6 +226,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
IFUNC_IMPL (i, name, __mempcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
+ __mempcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
@@ -227,6 +237,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/mempcpy.S. */
IFUNC_IMPL (i, name, mempcpy,
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
+ __mempcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
new file mode 100644
@@ -0,0 +1,390 @@
+/* memcpy with AVX
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef MEMCPY
+# define MEMCPY __memcpy_avx_unaligned
+# define MEMCPY_CHK __memcpy_chk_avx_unaligned
+#endif
+
+ .section .text.avx,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+ mov %rdi, %rax
+#ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+#endif
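+ /* %rax already holds the return value: dest, or dest + n for
+    mempcpy.  Copies of 256 bytes or more take the loop paths below;
+    shorter copies use overlapping loads from the head and tail of
+    the buffer.  */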
+ cmp $256, %rdx
+ jae L(256bytesormore)
+ cmp $128, %dl
+ jb L(less_128bytes)
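+ /* 128..255 bytes: load the first 128 bytes into %xmm0-%xmm7 and
+    the last 128 bytes into %xmm8-%xmm15; the two halves overlap in
+    the middle, so the stores cover the whole range.  */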
+ vmovups (%rsi), %xmm0
+ lea (%rsi, %rdx), %rcx
+ vmovups 0x10(%rsi), %xmm1
+ vmovups 0x20(%rsi), %xmm2
+ vmovups 0x30(%rsi), %xmm3
+ vmovups 0x40(%rsi), %xmm4
+ vmovups 0x50(%rsi), %xmm5
+ vmovups 0x60(%rsi), %xmm6
+ vmovups 0x70(%rsi), %xmm7
+ vmovups -0x80(%rcx), %xmm8
+ vmovups -0x70(%rcx), %xmm9
+ vmovups -0x60(%rcx), %xmm10
+ vmovups -0x50(%rcx), %xmm11
+ vmovups -0x40(%rcx), %xmm12
+ vmovups -0x30(%rcx), %xmm13
+ vmovups -0x20(%rcx), %xmm14
+ vmovups -0x10(%rcx), %xmm15
+ lea (%rdi, %rdx), %rdx
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm2, 0x20(%rdi)
+ vmovups %xmm3, 0x30(%rdi)
+ vmovups %xmm4, 0x40(%rdi)
+ vmovups %xmm5, 0x50(%rdi)
+ vmovups %xmm6, 0x60(%rdi)
+ vmovups %xmm7, 0x70(%rdi)
+ vmovups %xmm8, -0x80(%rdx)
+ vmovups %xmm9, -0x70(%rdx)
+ vmovups %xmm10, -0x60(%rdx)
+ vmovups %xmm11, -0x50(%rdx)
+ vmovups %xmm12, -0x40(%rdx)
+ vmovups %xmm13, -0x30(%rdx)
+ vmovups %xmm14, -0x20(%rdx)
+ vmovups %xmm15, -0x10(%rdx)
+ ret
+ .p2align 4
+L(less_128bytes):
+ cmp $64, %dl
+ jb L(less_64bytes)
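+ /* 64..127 bytes: copy the first and last 64 bytes with loads that
+    overlap in the middle.  */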
+ vmovups (%rsi), %xmm0
+ lea (%rsi, %rdx), %rcx
+ vmovups 0x10(%rsi), %xmm1
+ vmovups 0x20(%rsi), %xmm2
+ lea (%rdi, %rdx), %rdx
+ vmovups 0x30(%rsi), %xmm3
+ vmovups -0x40(%rcx), %xmm4
+ vmovups -0x30(%rcx), %xmm5
+ vmovups -0x20(%rcx), %xmm6
+ vmovups -0x10(%rcx), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm2, 0x20(%rdi)
+ vmovups %xmm3, 0x30(%rdi)
+ vmovups %xmm4, -0x40(%rdx)
+ vmovups %xmm5, -0x30(%rdx)
+ vmovups %xmm6, -0x20(%rdx)
+ vmovups %xmm7, -0x10(%rdx)
+ ret
+ .p2align 4
+L(less_64bytes):
+ cmp $32, %dl
+ jb L(less_32bytes)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups -0x20(%rsi, %rdx), %xmm6
+ vmovups -0x10(%rsi, %rdx), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm6, -0x20(%rdi, %rdx)
+ vmovups %xmm7, -0x10(%rdi, %rdx)
+ ret
+ .p2align 4
+L(less_32bytes):
+ cmp $16, %dl
+ jb L(less_16bytes)
+ vmovups (%rsi), %xmm0
+ vmovups -0x10(%rsi, %rdx), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm7, -0x10(%rdi, %rdx)
+ ret
+ .p2align 4
+L(less_16bytes):
+ cmp $8, %dl
+ jb L(less_8bytes)
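+ /* 8..15 bytes: two possibly overlapping 8-byte moves through
+    general-purpose registers.  */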
+ movq -0x08(%rsi, %rdx), %rcx
+ movq (%rsi), %rsi
+ movq %rsi, (%rdi)
+ movq %rcx, -0x08(%rdi, %rdx)
+ ret
+ .p2align 4
+L(less_8bytes):
+ cmp $4, %dl
+ jb L(less_4bytes)
+ mov -0x04(%rsi, %rdx), %ecx
+ mov (%rsi), %esi
+ mov %esi, (%rdi)
+ mov %ecx, -0x04(%rdi, %rdx)
+ ret
+L(less_4bytes):
+ cmp $1, %dl
+ jbe L(less_2bytes)
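+ /* 2..3 bytes: two possibly overlapping 2-byte moves.  */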
+ mov -0x02(%rsi, %rdx), %cx
+ mov (%rsi), %si
+ mov %si, (%rdi)
+ mov %cx, -0x02(%rdi, %rdx)
+ ret
+L(less_2bytes):
+ jb L(less_0bytes)
+ mov (%rsi), %cl
+ mov %cl, (%rdi)
+L(less_0bytes):
+ ret
+
+ .p2align 4
+L(256bytesormore):
+#ifdef USE_AS_MEMMOVE
+ cmp %rsi, %rdi
+ jae L(copy_backward)
+#endif
+ cmp $2048, %rdx
+ jae L(gobble_data_movsb)
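+ /* 256..2047 bytes: save the first 32 bytes and the last 128 bytes,
+    advance the destination to the next 32-byte boundary, then copy
+    128 bytes per iteration with unaligned loads and aligned
+    stores.  */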
+ mov %rax, %r8
+ lea (%rsi, %rdx), %rcx
+ mov %rdi, %r10
+ vmovups -0x80(%rcx), %xmm5
+ vmovups -0x70(%rcx), %xmm6
+ mov $0x80, %rax
+ and $-32, %rdi
+ add $32, %rdi
+ vmovups -0x60(%rcx), %xmm7
+ vmovups -0x50(%rcx), %xmm8
+ mov %rdi, %r11
+ sub %r10, %r11
+ vmovups -0x40(%rcx), %xmm9
+ vmovups -0x30(%rcx), %xmm10
+ sub %r11, %rdx
+ vmovups -0x20(%rcx), %xmm11
+ vmovups -0x10(%rcx), %xmm12
+ vmovups (%rsi), %ymm4
+ add %r11, %rsi
+ sub %eax, %edx
+L(gobble_128_loop):
+ vmovups (%rsi), %ymm0
+ vmovups 0x20(%rsi), %ymm1
+ vmovups 0x40(%rsi), %ymm2
+ vmovups 0x60(%rsi), %ymm3
+ add %rax, %rsi
+ vmovaps %ymm0, (%rdi)
+ vmovaps %ymm1, 0x20(%rdi)
+ vmovaps %ymm2, 0x40(%rdi)
+ vmovaps %ymm3, 0x60(%rdi)
+ add %rax, %rdi
+ sub %eax, %edx
+ jae L(gobble_128_loop)
+ add %eax, %edx
+ add %rdi, %rdx
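+ /* Store the saved head and tail; these also cover the bytes
+    skipped for alignment and left over by the loop.  */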
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm5, -0x80(%rdx)
+ vmovups %xmm6, -0x70(%rdx)
+ vmovups %xmm7, -0x60(%rdx)
+ vmovups %xmm8, -0x50(%rdx)
+ vmovups %xmm9, -0x40(%rdx)
+ vmovups %xmm10, -0x30(%rdx)
+ vmovups %xmm11, -0x20(%rdx)
+ vmovups %xmm12, -0x10(%rdx)
+ mov %r8, %rax
+ ret
+
+ .p2align 4
+L(gobble_data_movsb):
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_shared_cache_size_half(%rip), %rcx
+#endif
+ shl $3, %rcx
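+ /* %rcx = 4 * the shared cache size (half the size, shifted left
+    by 3); used below as the non-temporal store threshold.  */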
+
+#ifdef USE_AS_MEMMOVE
+ mov %rsi, %r10
+ sub %rdi, %r10
+ cmp %rdx, %r10
+ jae L(memmove_use_memcpy_fwd)
+ cmp %rcx, %r10
+ jae L(memmove_use_memcpy_fwd)
+ jmp L(gobble_mem_fwd_llc_start)
+L(memmove_use_memcpy_fwd):
+#endif
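+ /* Lengths below the cache threshold use rep movsb; larger ones
+    take the non-temporal path.  */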
+ cmp %rcx, %rdx
+ jae L(gobble_big_data_fwd)
+#ifdef USE_AS_MEMMOVE
+L(gobble_mem_fwd_llc_start):
+#endif
+ mov %rdx, %rcx
+ rep movsb
+ ret
+
+ .p2align 4
+L(gobble_big_data_fwd):
+ lea (%rsi, %rdx), %rcx
+ vmovups (%rsi), %ymm4
+ vmovups -0x80(%rsi,%rdx), %xmm5
+ vmovups -0x70(%rcx), %xmm6
+ vmovups -0x60(%rcx), %xmm7
+ vmovups -0x50(%rcx), %xmm8
+ vmovups -0x40(%rcx), %xmm9
+ vmovups -0x30(%rcx), %xmm10
+ vmovups -0x20(%rcx), %xmm11
+ vmovups -0x10(%rcx), %xmm12
+ mov %rdi, %r8
+ and $-32, %rdi
+ add $32, %rdi
+ mov %rdi, %r10
+ sub %r8, %r10
+ sub %r10, %rdx
+ add %r10, %rsi
+ lea (%rdi, %rdx), %rcx
+ sub $0x80, %rdx
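+ /* Copy 128 bytes per iteration with non-temporal stores,
+    prefetching the source well ahead.  */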
+L(gobble_mem_fwd_loop):
+ prefetchnta 0x1c0(%rsi)
+ prefetchnta 0x280(%rsi)
+ vmovups (%rsi), %ymm0
+ vmovups 0x20(%rsi), %ymm1
+ vmovups 0x40(%rsi), %ymm2
+ vmovups 0x60(%rsi), %ymm3
+ lea 0x80(%rsi), %rsi
+ vmovntdq %ymm0, (%rdi)
+ vmovntdq %ymm1, 0x20(%rdi)
+ vmovntdq %ymm2, 0x40(%rdi)
+ vmovntdq %ymm3, 0x60(%rdi)
+ lea 0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_fwd_loop)
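+ /* Non-temporal stores are weakly ordered; fence them before the
+    ordinary stores of the saved head and tail.  */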
+ sfence
+ vmovups %ymm4, (%r8)
+ vzeroupper
+ vmovups %xmm5, -0x80(%rcx)
+ vmovups %xmm6, -0x70(%rcx)
+ vmovups %xmm7, -0x60(%rcx)
+ vmovups %xmm8, -0x50(%rcx)
+ vmovups %xmm9, -0x40(%rcx)
+ vmovups %xmm10, -0x30(%rcx)
+ vmovups %xmm11, -0x20(%rcx)
+ vmovups %xmm12, -0x10(%rcx)
+ ret
+
+#ifdef USE_AS_MEMMOVE
+ .p2align 4
+L(copy_backward):
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_shared_cache_size_half(%rip), %rcx
+#endif
+ shl $3, %rcx
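+ /* Backward copy: %rcx = 4 * the shared cache size, as in the
+    forward path.  Save the first 128 bytes and the last 32 bytes,
+    then align the end of the destination down to 32 bytes.  */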
+ mov %rdi, %r9
+ vmovups (%rsi), %xmm8
+ vmovups 0x10(%rsi), %xmm9
+ add %rdx, %rdi
+ vmovups 0x20(%rsi), %xmm10
+ vmovups 0x30(%rsi), %xmm11
+ lea -0x20(%rdi), %r10
+ mov %rdi, %r11
+ vmovups 0x40(%rsi), %xmm12
+ vmovups 0x50(%rsi), %xmm13
+ and $0x1f, %r11
+ vmovups 0x60(%rsi), %xmm14
+ vmovups 0x70(%rsi), %xmm15
+ xor %r11, %rdi
+ add %rdx, %rsi
+ vmovups -0x20(%rsi), %ymm4
+ sub %r11, %rsi
+ sub %r11, %rdx
+ mov %rdi, %r11
+ sub %rsi, %r11
+ cmp %rdx, %r11
+ jae L(memmove_use_memcpy_bwd)
+ cmp %rcx, %r11
+ jae L(memmove_use_memcpy_bwd)
+ jmp L(gobble_mem_bwd_llc_start)
+L(memmove_use_memcpy_bwd):
+ cmp %rcx, %rdx
+ ja L(gobble_big_data_bwd)
+L(gobble_mem_bwd_llc_start):
+ sub $0x80, %rdx
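+ /* Copy 128 bytes per iteration backward with 32-byte-aligned
+    destination stores.  */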
+L(gobble_mem_bwd_llc):
+ vmovups -0x20(%rsi), %ymm0
+ vmovups -0x40(%rsi), %ymm1
+ vmovups -0x60(%rsi), %ymm2
+ vmovups -0x80(%rsi), %ymm3
+ lea -0x80(%rsi), %rsi
+ vmovaps %ymm0, -0x20(%rdi)
+ vmovaps %ymm1, -0x40(%rdi)
+ vmovaps %ymm2, -0x60(%rdi)
+ vmovaps %ymm3, -0x80(%rdi)
+ lea -0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_bwd_llc)
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm8, (%r9)
+ vmovups %xmm9, 0x10(%r9)
+ vmovups %xmm10, 0x20(%r9)
+ vmovups %xmm11, 0x30(%r9)
+ vmovups %xmm12, 0x40(%r9)
+ vmovups %xmm13, 0x50(%r9)
+ vmovups %xmm14, 0x60(%r9)
+ vmovups %xmm15, 0x70(%r9)
+ ret
+L(gobble_big_data_bwd):
+ sub $0x80, %rdx
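+ /* Large backward copy: non-temporal stores with source prefetch,
+    as in the forward path.  */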
+L(gobble_mem_bwd_loop):
+ prefetchnta -0x1c0(%rsi)
+ prefetchnta -0x280(%rsi)
+ vmovups -0x20(%rsi), %ymm0
+ vmovups -0x40(%rsi), %ymm1
+ vmovups -0x60(%rsi), %ymm2
+ vmovups -0x80(%rsi), %ymm3
+ lea -0x80(%rsi), %rsi
+ vmovntdq %ymm0, -0x20(%rdi)
+ vmovntdq %ymm1, -0x40(%rdi)
+ vmovntdq %ymm2, -0x60(%rdi)
+ vmovntdq %ymm3, -0x80(%rdi)
+ lea -0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_bwd_loop)
+ sfence
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm8, (%r9)
+ vmovups %xmm9, 0x10(%r9)
+ vmovups %xmm10, 0x20(%r9)
+ vmovups %xmm11, 0x30(%r9)
+ vmovups %xmm12, 0x40(%r9)
+ vmovups %xmm13, 0x50(%r9)
+ vmovups %xmm14, 0x60(%r9)
+ vmovups %xmm15, 0x70(%r9)
+ ret
+#endif
+END (MEMCPY)
+#endif
@@ -32,6 +32,10 @@ ENTRY(__new_memcpy)
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
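+ /* Prefer the AVX implementation when AVX is usable.  */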
+1: leaq __memcpy_avx_unaligned(%rip), %rax
+ testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ jz 1f
+ ret
1: leaq __memcpy_sse2(%rip), %rax
testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
jnz 2f
@@ -39,6 +39,9 @@ ENTRY(__memcpy_chk)
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __memcpy_chk_ssse3_back(%rip), %rax
+ testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ jz 2f
+ leaq __memcpy_chk_avx_unaligned(%rip), %rax
2: ret
END(__memcpy_chk)
# else
new file mode 100644
@@ -0,0 +1,22 @@
+/* memmove with AVX
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_avx_unaligned
+#define MEMCPY_CHK __memmove_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
@@ -35,6 +35,8 @@
extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
+
#endif
#include "string/memmove.c"
@@ -47,10 +49,11 @@ extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
ifunc symbol properly. */
extern __typeof (__redirect_memmove) __libc_memmove;
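+/* Prefer the AVX implementation when AVX is usable; otherwise fall
+   back to the SSSE3 or SSE2 versions.  */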
libc_ifunc (__libc_memmove,
- HAS_SSSE3
+ HAS_AVX ? __memmove_avx_unaligned :
+ (HAS_SSSE3
? (HAS_FAST_COPY_BACKWARD
? __memmove_ssse3_back : __memmove_ssse3)
- : __memmove_sse2)
+ : __memmove_sse2));
strong_alias (__libc_memmove, memmove)
@@ -25,11 +25,13 @@
extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
+extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
#include "debug/memmove_chk.c"
libc_ifunc (__memmove_chk,
- HAS_SSSE3
+ HAS_AVX ? __memmove_chk_avx_unaligned :
+ (HAS_SSSE3
? (HAS_FAST_COPY_BACKWARD
? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
- : __memmove_chk_sse2);
+ : __memmove_chk_sse2));
new file mode 100644
@@ -0,0 +1,22 @@
+/* mempcpy with AVX
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_avx_unaligned
+#define MEMCPY_CHK __mempcpy_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
@@ -37,6 +37,9 @@ ENTRY(__mempcpy)
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __mempcpy_ssse3_back(%rip), %rax
+ testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ jz 2f
+ leaq __mempcpy_avx_unaligned(%rip), %rax
2: ret
END(__mempcpy)
@@ -39,6 +39,9 @@ ENTRY(__mempcpy_chk)
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __mempcpy_chk_ssse3_back(%rip), %rax
+ testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ jz 2f
+ leaq __mempcpy_chk_avx_unaligned(%rip), %rax
2: ret
END(__mempcpy_chk)
# else