b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -1029,7 +1029,7 @@ Function: "cos_vlen4_avx2":
double: 2
Function: "cos_vlen8":
-double: 1
+double: 2
float: 1
Function: "cos_vlen8_avx2":
@@ -2125,7 +2125,7 @@ Function: "sincos_vlen4_avx2":
double: 2
Function: "sincos_vlen8":
-double: 1
+double: 2
float: 1
Function: "sincos_vlen8_avx2":
b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core_sse4.S
@@ -20,7 +20,7 @@
#include "svml_d_trig_data.h"
.text
-ENTRY (_ZGVbN2vvv_sincos_sse4)
+ENTRY (_ZGVbN2vl8l8_sincos_sse4)
/*
ALGORITHM DESCRIPTION:
@@ -311,4 +311,31 @@ ENTRY (_ZGVbN2vvv_sincos_sse4)
movsd %xmm0, 256(%rsp,%r15)
jmp .LBL_1_7
+END (_ZGVbN2vl8l8_sincos_sse4)
+libmvec_hidden_def(_ZGVbN2vl8l8_sincos_sse4)
+
+/* vvv version implemented as a wrapper around the vl8l8 variant.  */
+ENTRY (_ZGVbN2vvv_sincos_sse4)
+ subq $72, %rsp
+ .cfi_def_cfa_offset 80
+ movdqu %xmm1, 32(%rsp)
+ lea (%rsp), %rdi
+ movdqu %xmm2, 48(%rdi)
+ lea 16(%rsp), %rsi
+ call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4)
+ movq 32(%rsp), %rdx
+ movq 48(%rsp), %rsi
+ movq 40(%rsp), %r8
+ movq 56(%rsp), %r10
+ movq (%rsp), %rax
+ movq 16(%rsp), %rcx
+ movq 8(%rsp), %rdi
+ movq 24(%rsp), %r9
+ movq %rax, (%rdx)
+ movq %rcx, (%rsi)
+ movq %rdi, (%r8)
+ movq %r9, (%r10)
+ addq $72, %rsp
+ .cfi_def_cfa_offset 8
+ ret
END (_ZGVbN2vvv_sincos_sse4)
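
The wrapper just added is, in effect, the following C (a model only, under the assumption that the internal vl8l8 symbol can be called directly; the real code keeps everything in one hand-rolled stack frame, and vector subscripting is a GCC extension):

  #include <immintrin.h>

  extern void _ZGVbN2vl8l8_sincos_sse4 (__m128d, double *, double *);

  /* Model of _ZGVbN2vvv_sincos_sse4: run the vl8l8 kernel into stack
     buffers, then scatter each lane's results through its own pointer.  */
  static void
  vvv_via_vl8l8 (__m128d x, __m128i sin_ptrs, __m128i cos_ptrs)
  {
    double s[2], c[2];
    _ZGVbN2vl8l8_sincos_sse4 (x, s, c);
    for (int i = 0; i < 2; i++)
      {
        *(double *) sin_ptrs[i] = s[i];   /* per-lane scatter */
        *(double *) cos_ptrs[i] = c[i];
      }
  }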
b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core_avx2.S
@@ -20,7 +20,7 @@
#include "svml_d_trig_data.h"
.text
-ENTRY (_ZGVdN4vvv_sincos_avx2)
+ENTRY (_ZGVdN4vl8l8_sincos_avx2)
/*
ALGORITHM DESCRIPTION:
@@ -274,4 +274,51 @@ ENTRY (_ZGVdN4vvv_sincos_avx2)
vmovsd %xmm0, 384(%rsp,%r15)
jmp .LBL_1_7
+END (_ZGVdN4vl8l8_sincos_avx2)
+libmvec_hidden_def(_ZGVdN4vl8l8_sincos_avx2)
+
+/* vvv version implemented as a wrapper around the vl8l8 variant.  */
+ENTRY (_ZGVdN4vvv_sincos_avx2)
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-32, %rsp
+ subq $128, %rsp
+ vmovdqu %ymm1, 64(%rsp)
+ lea (%rsp), %rdi
+ vmovdqu %ymm2, 96(%rdi)
+ lea 32(%rsp), %rsi
+ call HIDDEN_JUMPTARGET(_ZGVdN4vl8l8_sincos_avx2)
+ movq 64(%rsp), %rdx
+ movq 96(%rsp), %rsi
+ movq 72(%rsp), %r8
+ movq 104(%rsp), %r10
+ movq (%rsp), %rax
+ movq 32(%rsp), %rcx
+ movq 8(%rsp), %rdi
+ movq 40(%rsp), %r9
+ movq %rax, (%rdx)
+ movq %rcx, (%rsi)
+ movq 80(%rsp), %rax
+ movq 112(%rsp), %rcx
+ movq %rdi, (%r8)
+ movq %r9, (%r10)
+ movq 88(%rsp), %rdi
+ movq 120(%rsp), %r9
+ movq 16(%rsp), %r11
+ movq 48(%rsp), %rdx
+ movq 24(%rsp), %rsi
+ movq 56(%rsp), %r8
+ movq %r11, (%rax)
+ movq %rdx, (%rcx)
+ movq %rsi, (%rdi)
+ movq %r8, (%r9)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
END (_ZGVdN4vvv_sincos_avx2)
b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -36,9 +36,9 @@
sin(R), sin(R') are approximated by corresponding polynomial. */
.text
-ENTRY (_ZGVeN8vvv_sincos_knl)
+ENTRY (_ZGVeN8vl8l8_sincos_knl)
#ifndef HAVE_AVX512_ASM_SUPPORT
-WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos
+WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
#else
pushq %rbp
cfi_adjust_cfa_offset (8)
@@ -304,11 +304,12 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos
jmp .LBL_1_7
#endif
-END (_ZGVeN8vvv_sincos_knl)
+END (_ZGVeN8vl8l8_sincos_knl)
+libmvec_hidden_def(_ZGVeN8vl8l8_sincos_knl)
-ENTRY (_ZGVeN8vvv_sincos_skx)
+ENTRY (_ZGVeN8vl8l8_sincos_skx)
#ifndef HAVE_AVX512_ASM_SUPPORT
-WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos
+WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
#else
pushq %rbp
cfi_adjust_cfa_offset (8)
@@ -585,6 +586,100 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos
jmp .LBL_2_7
#endif
+END (_ZGVeN8vl8l8_sincos_skx)
+libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx)
+
+/* Wrapper implementing the vvv variant on top of the vl8l8 variant.  */
+.macro WRAPPER_AVX512_vvv_vl8l8 callee
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $256, %rsp
+ /* Encoding for vmovups %zmm1, 128(%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x4c
+ .byte 0x24
+ .byte 0x02
+ lea (%rsp), %rdi
+ /* Encoding for vmovups %zmm2, 192(%rdi). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x57
+ .byte 0x03
+ lea 64(%rsp), %rsi
+ call HIDDEN_JUMPTARGET(\callee)
+ movq 128(%rsp), %rdx
+ movq 136(%rsp), %rsi
+ movq 144(%rsp), %r8
+ movq 152(%rsp), %r10
+ movq (%rsp), %rax
+ movq 8(%rsp), %rcx
+ movq 16(%rsp), %rdi
+ movq 24(%rsp), %r9
+ movq %rax, (%rdx)
+ movq %rcx, (%rsi)
+ movq 160(%rsp), %rax
+ movq 168(%rsp), %rcx
+ movq %rdi, (%r8)
+ movq %r9, (%r10)
+ movq 176(%rsp), %rdi
+ movq 184(%rsp), %r9
+ movq 32(%rsp), %r11
+ movq 40(%rsp), %rdx
+ movq 48(%rsp), %rsi
+ movq 56(%rsp), %r8
+ movq %r11, (%rax)
+ movq %rdx, (%rcx)
+ movq 192(%rsp), %r11
+ movq 200(%rsp), %rdx
+ movq %rsi, (%rdi)
+ movq %r8, (%r9)
+ movq 208(%rsp), %rsi
+ movq 216(%rsp), %r8
+ movq 64(%rsp), %r10
+ movq 72(%rsp), %rax
+ movq 80(%rsp), %rcx
+ movq 88(%rsp), %rdi
+ movq %r10, (%r11)
+ movq %rax, (%rdx)
+ movq 224(%rsp), %r10
+ movq 232(%rsp), %rax
+ movq %rcx, (%rsi)
+ movq %rdi, (%r8)
+ movq 240(%rsp), %rcx
+ movq 248(%rsp), %rdi
+ movq 96(%rsp), %r9
+ movq 104(%rsp), %r11
+ movq 112(%rsp), %rdx
+ movq 120(%rsp), %rsi
+ movq %r9, (%r10)
+ movq %r11, (%rax)
+ movq %rdx, (%rcx)
+ movq %rsi, (%rdi)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+.endm
+
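The zmm spills above are hand-encoded so that this file still assembles with binutils that lack AVX-512 support (the same concern behind the HAVE_AVX512_ASM_SUPPORT guards). Decoded field by field, the first sequence is:

  /* "vmovups %zmm1, 128(%rsp)" as emitted by the .byte directives above.  */
  static const unsigned char evex_vmovups_zmm1[] = {
    0x62, /* EVEX escape byte */
    0xf1, /* P0: R/X/B/R' extensions clear (bits inverted), mm=01 -> 0F map */
    0x7c, /* P1: W=0, vvvv=1111 (unused), pp=00 -> MOVUPS form */
    0x48, /* P2: L'L=10 -> 512-bit vector length, aaa=000 -> no masking */
    0x11, /* opcode: MOVUPS, register-to-memory direction */
    0x4c, /* ModRM: mod=01 (disp8), reg=001 (zmm1), rm=100 (SIB follows) */
    0x24, /* SIB: base=%rsp, no index */
    0x02, /* disp8=2, scaled by the 64-byte vector width -> 128(%rsp) */
  };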
+ENTRY (_ZGVeN8vvv_sincos_knl)
+WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_knl
+END (_ZGVeN8vvv_sincos_knl)
+
+ENTRY (_ZGVeN8vvv_sincos_skx)
+WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
END (_ZGVeN8vvv_sincos_skx)
.section .rodata, "a"
b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -49,9 +49,9 @@
R2 = XOR( RC, SC ). */
.text
-ENTRY (_ZGVeN16vvv_sincosf_knl)
+ENTRY (_ZGVeN16vl4l4_sincosf_knl)
#ifndef HAVE_AVX512_ASM_SUPPORT
-WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
#else
pushq %rbp
cfi_adjust_cfa_offset (8)
@@ -267,9 +267,10 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
vmovss %xmm0, 1280(%rsp,%r15,8)
jmp .LBL_1_7
#endif
-END (_ZGVeN16vvv_sincosf_knl)
+END (_ZGVeN16vl4l4_sincosf_knl)
+libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl)
-ENTRY (_ZGVeN16vvv_sincosf_skx)
+ENTRY (_ZGVeN16vl4l4_sincosf_skx)
#ifndef HAVE_AVX512_ASM_SUPPORT
-WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
#else
@@ -496,6 +497,164 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
vmovss %xmm0, 1280(%rsp,%r15,8)
jmp .LBL_2_7
#endif
+END (_ZGVeN16vl4l4_sincosf_skx)
+libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
+
+/* Wrapper implementing the vvv variant on top of the vl4l4 variant.  */
+.macro WRAPPER_AVX512_vvv_vl4l4 callee
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $384, %rsp
+ /* Encoding for vmovups %zmm1, 128(%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x4c
+ .byte 0x24
+ .byte 0x02
+ lea (%rsp), %rdi
+ /* Encoding for vmovups %zmm2, 192(%rdi). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x57
+ .byte 0x03
+ /* Encoding for vmovups %zmm3, 256(%rdi). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x5f
+ .byte 0x04
+ /* Encoding for vmovups %zmm4, 320(%rdi). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x67
+ .byte 0x05
+ lea 64(%rsp), %rsi
+ call HIDDEN_JUMPTARGET(\callee)
+ movq 128(%rsp), %rdx
+ movq 136(%rsp), %rsi
+ movq 144(%rsp), %r8
+ movq 152(%rsp), %r10
+ movl (%rsp), %eax
+ movl 4(%rsp), %ecx
+ movl 8(%rsp), %edi
+ movl 12(%rsp), %r9d
+ movl %eax, (%rdx)
+ movl %ecx, (%rsi)
+ movq 160(%rsp), %rax
+ movq 168(%rsp), %rcx
+ movl %edi, (%r8)
+ movl %r9d, (%r10)
+ movq 176(%rsp), %rdi
+ movq 184(%rsp), %r9
+ movl 16(%rsp), %r11d
+ movl 20(%rsp), %edx
+ movl 24(%rsp), %esi
+ movl 28(%rsp), %r8d
+ movl %r11d, (%rax)
+ movl %edx, (%rcx)
+ movq 192(%rsp), %r11
+ movq 200(%rsp), %rdx
+ movl %esi, (%rdi)
+ movl %r8d, (%r9)
+ movq 208(%rsp), %rsi
+ movq 216(%rsp), %r8
+ movl 32(%rsp), %r10d
+ movl 36(%rsp), %eax
+ movl 40(%rsp), %ecx
+ movl 44(%rsp), %edi
+ movl %r10d, (%r11)
+ movl %eax, (%rdx)
+ movq 224(%rsp), %r10
+ movq 232(%rsp), %rax
+ movl %ecx, (%rsi)
+ movl %edi, (%r8)
+ movq 240(%rsp), %rcx
+ movq 248(%rsp), %rdi
+ movl 48(%rsp), %r9d
+ movl 52(%rsp), %r11d
+ movl 56(%rsp), %edx
+ movl 60(%rsp), %esi
+ movl %r9d, (%r10)
+ movl %r11d, (%rax)
+ movq 256(%rsp), %r9
+ movq 264(%rsp), %r11
+ movl %edx, (%rcx)
+ movl %esi, (%rdi)
+ movq 272(%rsp), %rdx
+ movq 280(%rsp), %rsi
+ movl 64(%rsp), %r8d
+ movl 68(%rsp), %r10d
+ movl 72(%rsp), %eax
+ movl 76(%rsp), %ecx
+ movl %r8d, (%r9)
+ movl %r10d, (%r11)
+ movq 288(%rsp), %r8
+ movq 296(%rsp), %r10
+ movl %eax, (%rdx)
+ movl %ecx, (%rsi)
+ movq 304(%rsp), %rax
+ movq 312(%rsp), %rcx
+ movl 80(%rsp), %edi
+ movl 84(%rsp), %r9d
+ movl 88(%rsp), %r11d
+ movl 92(%rsp), %edx
+ movl %edi, (%r8)
+ movl %r9d, (%r10)
+ movq 320(%rsp), %rdi
+ movq 328(%rsp), %r9
+ movl %r11d, (%rax)
+ movl %edx, (%rcx)
+ movq 336(%rsp), %r11
+ movq 344(%rsp), %rdx
+ movl 96(%rsp), %esi
+ movl 100(%rsp), %r8d
+ movl 104(%rsp), %r10d
+ movl 108(%rsp), %eax
+ movl %esi, (%rdi)
+ movl %r8d, (%r9)
+ movq 352(%rsp), %rsi
+ movq 360(%rsp), %r8
+ movl %r10d, (%r11)
+ movl %eax, (%rdx)
+ movq 368(%rsp), %r10
+ movq 376(%rsp), %rax
+ movl 112(%rsp), %ecx
+ movl 116(%rsp), %edi
+ movl 120(%rsp), %r9d
+ movl 124(%rsp), %r11d
+ movl %ecx, (%rsi)
+ movl %edi, (%r8)
+ movl %r9d, (%r10)
+ movl %r11d, (%rax)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+.endm
+
+ENTRY (_ZGVeN16vvv_sincosf_knl)
+WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl
+END (_ZGVeN16vvv_sincosf_knl)
+
+ENTRY (_ZGVeN16vvv_sincosf_skx)
+WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
END (_ZGVeN16vvv_sincosf_skx)
.section .rodata, "a"
b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core_sse4.S
@@ -20,7 +20,7 @@
#include "svml_s_trig_data.h"
.text
-ENTRY (_ZGVbN4vvv_sincosf_sse4)
+ENTRY (_ZGVbN4vl4l4_sincosf_sse4)
/*
ALGORITHM DESCRIPTION:
@@ -265,4 +265,45 @@ ENTRY (_ZGVbN4vvv_sincosf_sse4)
movss %xmm0, 256(%rsp,%r15,8)
jmp .LBL_1_7
+END (_ZGVbN4vl4l4_sincosf_sse4)
+libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4)
+
+/* vvv version implemented as a wrapper around the vl4l4 variant.  */
+ENTRY (_ZGVbN4vvv_sincosf_sse4)
+ subq $104, %rsp
+ .cfi_def_cfa_offset 112
+ movdqu %xmm1, 32(%rsp)
+ lea (%rsp), %rdi
+ movdqu %xmm2, 48(%rdi)
+ lea 16(%rsp), %rsi
+ movdqu %xmm3, 48(%rsi)
+ movdqu %xmm4, 64(%rsi)
+ call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
+ movq 32(%rsp), %rdx
+ movq 40(%rsp), %rsi
+ movq 48(%rsp), %r8
+ movq 56(%rsp), %r10
+ movl (%rsp), %eax
+ movl 4(%rsp), %ecx
+ movl 8(%rsp), %edi
+ movl 12(%rsp), %r9d
+ movl %eax, (%rdx)
+ movl %ecx, (%rsi)
+ movq 64(%rsp), %rax
+ movq 72(%rsp), %rcx
+ movl %edi, (%r8)
+ movl %r9d, (%r10)
+ movq 80(%rsp), %rdi
+ movq 88(%rsp), %r9
+ movl 16(%rsp), %r11d
+ movl 20(%rsp), %edx
+ movl 24(%rsp), %esi
+ movl 28(%rsp), %r8d
+ movl %r11d, (%rax)
+ movl %edx, (%rcx)
+ movl %esi, (%rdi)
+ movl %r8d, (%r9)
+ addq $104, %rsp
+ .cfi_def_cfa_offset 8
+ ret
END (_ZGVbN4vvv_sincosf_sse4)
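
Note why the float wrapper needs four pointer-vector registers where the double one needed two: with N=4 floats the data vector is 128 bits, but the 4 result pointers occupy 256 bits, so each pointer array arrives as two __m128i halves. Implied prototype, matching the xmm1-xmm4 stores above (illustrative only):

  #include <immintrin.h>

  void _ZGVbN4vvv_sincosf (__m128 x,
                           __m128i sin_ptrs_lo, __m128i sin_ptrs_hi,
                           __m128i cos_ptrs_lo, __m128i cos_ptrs_hi);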
b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core_avx2.S
@@ -20,7 +20,7 @@
#include "svml_s_trig_data.h"
.text
-ENTRY(_ZGVdN8vvv_sincosf_avx2)
+ENTRY (_ZGVdN8vl4l4_sincosf_avx2)
/*
ALGORITHM DESCRIPTION:
@@ -238,4 +238,77 @@ ENTRY(_ZGVdN8vvv_sincosf_avx2)
vmovss %xmm0, 384(%rsp,%r15,8)
jmp .LBL_1_7
-END(_ZGVdN8vvv_sincosf_avx2)
+END (_ZGVdN8vl4l4_sincosf_avx2)
+libmvec_hidden_def(_ZGVdN8vl4l4_sincosf_avx2)
+
+/* vvv version implemented as a wrapper around the vl4l4 variant.  */
+ENTRY (_ZGVdN8vvv_sincosf_avx2)
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-32, %rsp
+ subq $192, %rsp
+ vmovdqu %ymm1, 64(%rsp)
+ lea (%rsp), %rdi
+ vmovdqu %ymm2, 96(%rdi)
+ vmovdqu %ymm3, 128(%rdi)
+ vmovdqu %ymm4, 160(%rdi)
+ lea 32(%rsp), %rsi
+ call HIDDEN_JUMPTARGET(_ZGVdN8vl4l4_sincosf_avx2)
+ movq 64(%rsp), %rdx
+ movq 72(%rsp), %rsi
+ movq 80(%rsp), %r8
+ movq 88(%rsp), %r10
+ movl (%rsp), %eax
+ movl 4(%rsp), %ecx
+ movl 8(%rsp), %edi
+ movl 12(%rsp), %r9d
+ movl %eax, (%rdx)
+ movl %ecx, (%rsi)
+ movq 96(%rsp), %rax
+ movq 104(%rsp), %rcx
+ movl %edi, (%r8)
+ movl %r9d, (%r10)
+ movq 112(%rsp), %rdi
+ movq 120(%rsp), %r9
+ movl 16(%rsp), %r11d
+ movl 20(%rsp), %edx
+ movl 24(%rsp), %esi
+ movl 28(%rsp), %r8d
+ movl %r11d, (%rax)
+ movl %edx, (%rcx)
+ movq 128(%rsp), %r11
+ movq 136(%rsp), %rdx
+ movl %esi, (%rdi)
+ movl %r8d, (%r9)
+ movq 144(%rsp), %rsi
+ movq 152(%rsp), %r8
+ movl 32(%rsp), %r10d
+ movl 36(%rsp), %eax
+ movl 40(%rsp), %ecx
+ movl 44(%rsp), %edi
+ movl %r10d, (%r11)
+ movl %eax, (%rdx)
+ movq 160(%rsp), %r10
+ movq 168(%rsp), %rax
+ movl %ecx, (%rsi)
+ movl %edi, (%r8)
+ movq 176(%rsp), %rcx
+ movq 184(%rsp), %rdi
+ movl 48(%rsp), %r9d
+ movl 52(%rsp), %r11d
+ movl 56(%rsp), %edx
+ movl 60(%rsp), %esi
+ movl %r9d, (%r10)
+ movl %r11d, (%rax)
+ movl %edx, (%rcx)
+ movl %esi, (%rdi)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+END (_ZGVdN8vvv_sincosf_avx2)
b/sysdeps/x86_64/fpu/svml_d_sincos2_core.S
@@ -20,8 +20,13 @@
#include "svml_d_wrapper_impl.h"
.text
-ENTRY (_ZGVbN2vvv_sincos)
+ENTRY (_ZGVbN2vl8l8_sincos)
WRAPPER_IMPL_SSE2_fFF sincos
+END (_ZGVbN2vl8l8_sincos)
+libmvec_hidden_def (_ZGVbN2vl8l8_sincos)
+
+ENTRY (_ZGVbN2vvv_sincos)
+WRAPPER_IMPL_SSE2_fFF_vvv sincos
END (_ZGVbN2vvv_sincos)
#ifndef USE_MULTIARCH
b/sysdeps/x86_64/fpu/svml_d_sincos4_core.S
@@ -20,8 +20,13 @@
#include "svml_d_wrapper_impl.h"
.text
+ENTRY (_ZGVdN4vl8l8_sincos)
+WRAPPER_IMPL_AVX_fFF _ZGVbN2vl8l8_sincos
+END (_ZGVdN4vl8l8_sincos)
+libmvec_hidden_def (_ZGVdN4vl8l8_sincos)
+
ENTRY (_ZGVdN4vvv_sincos)
-WRAPPER_IMPL_AVX_fFF _ZGVbN2vvv_sincos
+WRAPPER_IMPL_AVX2_fFF_vvv _ZGVbN2vl8l8_sincos
END (_ZGVdN4vvv_sincos)
#ifndef USE_MULTIARCH
b/sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S
@@ -20,6 +20,10 @@
#include "svml_d_wrapper_impl.h"
.text
+ENTRY (_ZGVcN4vl8l8_sincos)
+WRAPPER_IMPL_AVX_fFF _ZGVbN2vl8l8_sincos
+END (_ZGVcN4vl8l8_sincos)
+
ENTRY (_ZGVcN4vvv_sincos)
-WRAPPER_IMPL_AVX_fFF _ZGVbN2vvv_sincos
+WRAPPER_IMPL_AVX_fFF_vvv _ZGVbN2vl8l8_sincos
END (_ZGVcN4vvv_sincos)
b/sysdeps/x86_64/fpu/svml_d_sincos8_core.S
@@ -20,6 +20,10 @@
#include "svml_d_wrapper_impl.h"
.text
+ENTRY (_ZGVeN8vl8l8_sincos)
+WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
+END (_ZGVeN8vl8l8_sincos)
+
ENTRY (_ZGVeN8vvv_sincos)
-WRAPPER_IMPL_AVX512_fFF _ZGVdN4vvv_sincos
+WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN4vl8l8_sincos
END (_ZGVeN8vvv_sincos)
b/sysdeps/x86_64/fpu/svml_s_sincosf16_core.S
@@ -20,6 +20,10 @@
#include "svml_s_wrapper_impl.h"
.text
+ENTRY (_ZGVeN16vl4l4_sincosf)
+WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
+END (_ZGVeN16vl4l4_sincosf)
+
ENTRY (_ZGVeN16vvv_sincosf)
-WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
+WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN8vl4l4_sincosf
END (_ZGVeN16vvv_sincosf)
b/sysdeps/x86_64/fpu/svml_s_sincosf4_core.S
@@ -16,13 +16,17 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
-ENTRY (_ZGVbN4vvv_sincosf)
+ENTRY (_ZGVbN4vl4l4_sincosf)
WRAPPER_IMPL_SSE2_fFF sincosf
+END (_ZGVbN4vl4l4_sincosf)
+libmvec_hidden_def (_ZGVbN4vl4l4_sincosf)
+
+ENTRY (_ZGVbN4vvv_sincosf)
+WRAPPER_IMPL_SSE2_fFF_vvv sincosf
END (_ZGVbN4vvv_sincosf)
#ifndef USE_MULTIARCH
b/sysdeps/x86_64/fpu/svml_s_sincosf8_core.S
@@ -20,8 +20,13 @@
#include "svml_s_wrapper_impl.h"
.text
+ENTRY (_ZGVdN8vl4l4_sincosf)
+WRAPPER_IMPL_AVX_fFF _ZGVbN4vl4l4_sincosf
+END (_ZGVdN8vl4l4_sincosf)
+libmvec_hidden_def (_ZGVdN8vl4l4_sincosf)
+
ENTRY (_ZGVdN8vvv_sincosf)
-WRAPPER_IMPL_AVX_fFF _ZGVbN4vvv_sincosf
+WRAPPER_IMPL_AVX2_fFF_vvv _ZGVbN4vl4l4_sincosf
END (_ZGVdN8vvv_sincosf)
#ifndef USE_MULTIARCH
b/sysdeps/x86_64/fpu/svml_s_sincosf8_core_avx.S
@@ -20,6 +20,10 @@
#include "svml_s_wrapper_impl.h"
.text
-ENTRY(_ZGVcN8vvv_sincosf)
-WRAPPER_IMPL_AVX_fFF _ZGVbN4vvv_sincosf
-END(_ZGVcN8vvv_sincosf)
+ENTRY (_ZGVcN8vl4l4_sincosf)
+WRAPPER_IMPL_AVX_fFF _ZGVbN4vl4l4_sincosf
+END (_ZGVcN8vl4l4_sincosf)
+
+ENTRY (_ZGVcN8vvv_sincosf)
+WRAPPER_IMPL_AVX_fFF_vvv _ZGVbN4vl4l4_sincosf
+END (_ZGVcN8vvv_sincosf)
b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
@@ -23,7 +23,28 @@
VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVbN2v_cos)
VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
-VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos)
VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
+
+#define VEC_INT_TYPE __m128i
+
+/* Redefinition of wrapper to be compatible with _ZGVbN2vvv_sincos. */
+#undef VECTOR_WRAPPER_fFF
+#define VECTOR_WRAPPER_fFF(scalar_func, vector_func) \
+extern void vector_func (VEC_TYPE, VEC_INT_TYPE, VEC_INT_TYPE); \
+void scalar_func (FLOAT x, FLOAT * r, FLOAT * r1) \
+{ \
+ int i; \
+ VEC_TYPE mx; \
+ VEC_INT_TYPE mr, mr1; \
+ INIT_VEC_LOOP (mx, x, VEC_LEN); \
+ INIT_VEC_LOOP (mr, (long int)r, VEC_LEN); \
+ INIT_VEC_LOOP (mr1, (long int)r1, VEC_LEN); \
+ vector_func (mx, mr, mr1); \
+ TEST_VEC_LOOP (mr, VEC_LEN); \
+ TEST_VEC_LOOP (mr1, VEC_LEN); \
+ return; \
+}
+
+VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVbN2vvv_sincos)
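
The redefinition relies on INIT_VEC_LOOP broadcasting one value into every lane. Assuming its definition in test-vec-loop.h is essentially the loop below (that header is not part of this hunk), every lane of mr holds the same pointer r, so each lane of the vector function stores its result to *r and the scalar harness checks it as usual; TEST_VEC_LOOP over mr itself only compares the (trivially equal) pointer lanes:

  /* Hypothetical shape of INIT_VEC_LOOP (see test-vec-loop.h); it uses the
     enclosing wrapper's loop variable i, and vector subscripting is a GCC
     extension.  */
  #define INIT_VEC_LOOP(vec, val, len)  \
    do                                  \
      {                                 \
        for (i = 0; i < (len); i++)     \
          (vec)[i] = (val);             \
      }                                 \
    while (0)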
b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
@@ -26,7 +26,28 @@
VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVdN4v_cos)
VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin)
-VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVdN4vvv_sincos)
VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
+
+#define VEC_INT_TYPE __m256i
+
+/* Redefinition of wrapper to be compatible with _ZGVdN4vvv_sincos. */
+#undef VECTOR_WRAPPER_fFF
+#define VECTOR_WRAPPER_fFF(scalar_func, vector_func) \
+extern void vector_func (VEC_TYPE, VEC_INT_TYPE, VEC_INT_TYPE); \
+void scalar_func (FLOAT x, FLOAT * r, FLOAT * r1) \
+{ \
+ int i; \
+ VEC_TYPE mx; \
+ VEC_INT_TYPE mr, mr1; \
+ INIT_VEC_LOOP (mx, x, VEC_LEN); \
+ INIT_VEC_LOOP (mr, (long int)r, VEC_LEN); \
+ INIT_VEC_LOOP (mr1, (long int)r1, VEC_LEN); \
+ vector_func (mx, mr, mr1); \
+ TEST_VEC_LOOP (mr, VEC_LEN); \
+ TEST_VEC_LOOP (mr1, VEC_LEN); \
+ return; \
+}
+
+VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVdN4vvv_sincos)
b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
@@ -23,7 +23,29 @@
VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVcN4v_cos)
VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin)
-VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVcN4vvv_sincos)
VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
+
+#define VEC_INT_TYPE __m128i
+
+/* Redefinition of wrapper to be compatible with _ZGVcN4vvv_sincos. */
+#undef VECTOR_WRAPPER_fFF
+#define VECTOR_WRAPPER_fFF(scalar_func, vector_func) \
+extern void vector_func (VEC_TYPE, VEC_INT_TYPE, VEC_INT_TYPE, \
+ VEC_INT_TYPE, VEC_INT_TYPE); \
+void scalar_func (FLOAT x, FLOAT * r, FLOAT * r1) \
+{ \
+ int i; \
+ VEC_TYPE mx; \
+ VEC_INT_TYPE mr, mr1; \
+ INIT_VEC_LOOP (mx, x, VEC_LEN); \
+ INIT_VEC_LOOP (mr, (long int)r, VEC_LEN/2); \
+ INIT_VEC_LOOP (mr1, (long int)r1, VEC_LEN/2); \
+ vector_func (mx, mr, mr, mr1, mr1); \
+ TEST_VEC_LOOP (mr, VEC_LEN/2); \
+ TEST_VEC_LOOP (mr1, VEC_LEN/2); \
+ return; \
+}
+
+VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVcN4vvv_sincos)
b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
@@ -23,7 +23,28 @@
VECTOR_WRAPPER (WRAPPER_NAME (cos), _ZGVeN8v_cos)
VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin)
-VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVeN8vvv_sincos)
VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
+
+#define VEC_INT_TYPE __m512i
+
+/* Redefinition of wrapper to be compatible with _ZGVeN8vvv_sincos. */
+#undef VECTOR_WRAPPER_fFF
+#define VECTOR_WRAPPER_fFF(scalar_func, vector_func) \
+extern void vector_func (VEC_TYPE, VEC_INT_TYPE, VEC_INT_TYPE); \
+void scalar_func (FLOAT x, FLOAT * r, FLOAT * r1) \
+{ \
+ int i; \
+ VEC_TYPE mx; \
+ VEC_INT_TYPE mr, mr1; \
+ INIT_VEC_LOOP (mx, x, VEC_LEN); \
+ INIT_VEC_LOOP (mr, (long int)r, VEC_LEN); \
+ INIT_VEC_LOOP (mr1, (long int)r1, VEC_LEN); \
+ vector_func (mx, mr, mr1); \
+ TEST_VEC_LOOP (mr, VEC_LEN); \
+ TEST_VEC_LOOP (mr1, VEC_LEN); \
+ return; \
+}
+
+VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincos), _ZGVeN8vvv_sincos)
b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
@@ -23,7 +23,29 @@
VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVeN16v_cosf)
VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf)
-VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVeN16vvv_sincosf)
VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
+
+#define VEC_INT_TYPE __m512i
+
+/* Redefinition of wrapper to be compatible with _ZGVeN16vvv_sincosf. */
+#undef VECTOR_WRAPPER_fFF
+#define VECTOR_WRAPPER_fFF(scalar_func, vector_func) \
+extern void vector_func (VEC_TYPE, VEC_INT_TYPE, VEC_INT_TYPE, \
+ VEC_INT_TYPE, VEC_INT_TYPE); \
+void scalar_func (FLOAT x, FLOAT * r, FLOAT * r1) \
+{ \
+ int i; \
+ VEC_TYPE mx; \
+ VEC_INT_TYPE mr, mr1; \
+ INIT_VEC_LOOP (mx, x, VEC_LEN); \
+ INIT_VEC_LOOP (mr, (long int)r, VEC_LEN/2); \
+ INIT_VEC_LOOP (mr1, (long int)r1, VEC_LEN/2); \
+ vector_func (mx, mr, mr, mr1, mr1); \
+ TEST_VEC_LOOP (mr, VEC_LEN/2); \
+ TEST_VEC_LOOP (mr1, VEC_LEN/2); \
+ return; \
+}
+
+VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVeN16vvv_sincosf)
b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
@@ -23,7 +23,29 @@
VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVbN4v_cosf)
VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
-VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf)
VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
+
+#define VEC_INT_TYPE __m128i
+
+/* Redefinition of wrapper to be compatible with _ZGVbN4vvv_sincosf. */
+#undef VECTOR_WRAPPER_fFF
+#define VECTOR_WRAPPER_fFF(scalar_func, vector_func) \
+extern void vector_func (VEC_TYPE, VEC_INT_TYPE, VEC_INT_TYPE, \
+ VEC_INT_TYPE, VEC_INT_TYPE); \
+void scalar_func (FLOAT x, FLOAT * r, FLOAT * r1) \
+{ \
+ int i; \
+ VEC_TYPE mx; \
+ VEC_INT_TYPE mr, mr1; \
+ INIT_VEC_LOOP (mx, x, VEC_LEN); \
+ INIT_VEC_LOOP (mr, (long int)r, VEC_LEN/2); \
+ INIT_VEC_LOOP (mr1, (long int)r1, VEC_LEN/2); \
+ vector_func (mx, mr, mr, mr1, mr1); \
+ TEST_VEC_LOOP (mr, VEC_LEN/2); \
+ TEST_VEC_LOOP (mr1, VEC_LEN/2); \
+ return; \
+}
+
+VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf)
b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
@@ -26,7 +26,29 @@
VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVdN8v_cosf)
VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf)
-VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVdN8vvv_sincosf)
VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
+
+#define VEC_INT_TYPE __m256i
+
+/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
+#undef VECTOR_WRAPPER_fFF
+#define VECTOR_WRAPPER_fFF(scalar_func, vector_func) \
+extern void vector_func (VEC_TYPE, VEC_INT_TYPE, VEC_INT_TYPE, \
+ VEC_INT_TYPE, VEC_INT_TYPE); \
+void scalar_func (FLOAT x, FLOAT * r, FLOAT * r1) \
+{ \
+ int i; \
+ VEC_TYPE mx; \
+ VEC_INT_TYPE mr, mr1; \
+ INIT_VEC_LOOP (mx, x, VEC_LEN); \
+ INIT_VEC_LOOP (mr, (long int)r, VEC_LEN/2); \
+ INIT_VEC_LOOP (mr1, (long int)r1, VEC_LEN/2); \
+ vector_func (mx, mr, mr, mr1, mr1); \
+ TEST_VEC_LOOP (mr, VEC_LEN/2); \
+ TEST_VEC_LOOP (mr1, VEC_LEN/2); \
+ return; \
+}
+
+VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVdN8vvv_sincosf)
b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
@@ -23,7 +23,31 @@
VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVcN8v_cosf)
VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf)
-VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVcN8vvv_sincosf)
VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
+
+#define VEC_INT_TYPE __m128i
+
+/* Redefinition of wrapper to be compatible with _ZGVcN8vvv_sincosf. */
+#undef VECTOR_WRAPPER_fFF
+#define VECTOR_WRAPPER_fFF(scalar_func, vector_func) \
+extern void vector_func (VEC_TYPE, VEC_INT_TYPE, VEC_INT_TYPE, \
+ VEC_INT_TYPE, VEC_INT_TYPE, \
+ VEC_INT_TYPE, VEC_INT_TYPE, \
+ VEC_INT_TYPE, VEC_INT_TYPE); \
+void scalar_func (FLOAT x, FLOAT * r, FLOAT * r1) \
+{ \
+ int i; \
+ VEC_TYPE mx; \
+ VEC_INT_TYPE mr, mr1; \
+ INIT_VEC_LOOP (mx, x, VEC_LEN); \
+ INIT_VEC_LOOP (mr, (long int)r, VEC_LEN/4); \
+ INIT_VEC_LOOP (mr1, (long int)r1, VEC_LEN/4); \
+ vector_func (mx, mr, mr, mr, mr, mr1, mr1, mr1, mr1); \
+ TEST_VEC_LOOP (mr, VEC_LEN/4); \
+ TEST_VEC_LOOP (mr1, VEC_LEN/4); \
+ return; \
+}
+
+VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVcN8vvv_sincosf)