[v3,1/1] x86-64: Add vector acos/acosf implementation to libmvec
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
dj/TryBot-32bit |
success
|
Build for i686
|
Commit Message
Implement vectorized acos/acosf containing SSE, AVX, AVX2 and AVX512
versions for libmvec as per vector ABI. It also contains accuracy and
ABI tests for vector acos/acosf with regenerated ulps.
---
bits/libm-simd-decl-stubs.h | 11 +
math/bits/mathcalls.h | 2 +-
.../unix/sysv/linux/x86_64/libmvec.abilist | 8 +
sysdeps/x86/fpu/bits/math-vector.h | 4 +
.../x86/fpu/finclude/math-vector-fortran.h | 4 +
sysdeps/x86_64/fpu/Makeconfig | 1 +
sysdeps/x86_64/fpu/Versions | 4 +
sysdeps/x86_64/fpu/libm-test-ulps | 20 +
.../fpu/multiarch/ifunc-mathvec-avx512-skx.h | 39 ++
.../fpu/multiarch/svml_d_acos2_core-sse2.S | 20 +
.../x86_64/fpu/multiarch/svml_d_acos2_core.c | 27 ++
.../fpu/multiarch/svml_d_acos2_core_sse4.S | 399 ++++++++++++++++++
.../fpu/multiarch/svml_d_acos4_core-sse.S | 20 +
.../x86_64/fpu/multiarch/svml_d_acos4_core.c | 27 ++
.../fpu/multiarch/svml_d_acos4_core_avx2.S | 368 ++++++++++++++++
.../fpu/multiarch/svml_d_acos8_core-avx2.S | 20 +
.../x86_64/fpu/multiarch/svml_d_acos8_core.c | 27 ++
.../fpu/multiarch/svml_d_acos8_core_avx512.S | 386 +++++++++++++++++
.../fpu/multiarch/svml_s_acosf16_core-avx2.S | 20 +
.../fpu/multiarch/svml_s_acosf16_core.c | 28 ++
.../multiarch/svml_s_acosf16_core_avx512.S | 332 +++++++++++++++
.../fpu/multiarch/svml_s_acosf4_core-sse2.S | 20 +
.../x86_64/fpu/multiarch/svml_s_acosf4_core.c | 28 ++
.../fpu/multiarch/svml_s_acosf4_core_sse4.S | 351 +++++++++++++++
.../fpu/multiarch/svml_s_acosf8_core-sse.S | 20 +
.../x86_64/fpu/multiarch/svml_s_acosf8_core.c | 28 ++
.../fpu/multiarch/svml_s_acosf8_core_avx2.S | 332 +++++++++++++++
sysdeps/x86_64/fpu/svml_d_acos2_core.S | 29 ++
sysdeps/x86_64/fpu/svml_d_acos4_core.S | 29 ++
sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S | 25 ++
sysdeps/x86_64/fpu/svml_d_acos8_core.S | 25 ++
sysdeps/x86_64/fpu/svml_s_acosf16_core.S | 25 ++
sysdeps/x86_64/fpu/svml_s_acosf4_core.S | 29 ++
sysdeps/x86_64/fpu/svml_s_acosf8_core.S | 29 ++
sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S | 25 ++
.../x86_64/fpu/test-double-libmvec-acos-avx.c | 1 +
.../fpu/test-double-libmvec-acos-avx2.c | 1 +
.../fpu/test-double-libmvec-acos-avx512f.c | 1 +
sysdeps/x86_64/fpu/test-double-libmvec-acos.c | 3 +
.../x86_64/fpu/test-double-vlen2-wrappers.c | 1 +
.../fpu/test-double-vlen4-avx2-wrappers.c | 1 +
.../x86_64/fpu/test-double-vlen4-wrappers.c | 1 +
.../x86_64/fpu/test-double-vlen8-wrappers.c | 1 +
.../x86_64/fpu/test-float-libmvec-acosf-avx.c | 1 +
.../fpu/test-float-libmvec-acosf-avx2.c | 1 +
.../fpu/test-float-libmvec-acosf-avx512f.c | 1 +
sysdeps/x86_64/fpu/test-float-libmvec-acosf.c | 3 +
.../x86_64/fpu/test-float-vlen16-wrappers.c | 1 +
.../x86_64/fpu/test-float-vlen4-wrappers.c | 1 +
.../fpu/test-float-vlen8-avx2-wrappers.c | 1 +
.../x86_64/fpu/test-float-vlen8-wrappers.c | 1 +
51 files changed, 2781 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
create mode 100644 sysdeps/x86_64/fpu/svml_d_acos2_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
create mode 100644 sysdeps/x86_64/fpu/svml_d_acos8_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf16_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf4_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos.c
create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
Comments
On Wed, Dec 15, 2021 at 12:55 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Implement vectorized acos/acosf containing SSE, AVX, AVX2 and AVX512
> versions for libmvec as per vector ABI. It also contains accuracy and
> ABI tests for vector acos/acosf with regenerated ulps.
> ---
> bits/libm-simd-decl-stubs.h | 11 +
> math/bits/mathcalls.h | 2 +-
> .../unix/sysv/linux/x86_64/libmvec.abilist | 8 +
> sysdeps/x86/fpu/bits/math-vector.h | 4 +
> .../x86/fpu/finclude/math-vector-fortran.h | 4 +
> sysdeps/x86_64/fpu/Makeconfig | 1 +
> sysdeps/x86_64/fpu/Versions | 4 +
> sysdeps/x86_64/fpu/libm-test-ulps | 20 +
> .../fpu/multiarch/ifunc-mathvec-avx512-skx.h | 39 ++
> .../fpu/multiarch/svml_d_acos2_core-sse2.S | 20 +
> .../x86_64/fpu/multiarch/svml_d_acos2_core.c | 27 ++
> .../fpu/multiarch/svml_d_acos2_core_sse4.S | 399 ++++++++++++++++++
> .../fpu/multiarch/svml_d_acos4_core-sse.S | 20 +
> .../x86_64/fpu/multiarch/svml_d_acos4_core.c | 27 ++
> .../fpu/multiarch/svml_d_acos4_core_avx2.S | 368 ++++++++++++++++
> .../fpu/multiarch/svml_d_acos8_core-avx2.S | 20 +
> .../x86_64/fpu/multiarch/svml_d_acos8_core.c | 27 ++
> .../fpu/multiarch/svml_d_acos8_core_avx512.S | 386 +++++++++++++++++
> .../fpu/multiarch/svml_s_acosf16_core-avx2.S | 20 +
> .../fpu/multiarch/svml_s_acosf16_core.c | 28 ++
> .../multiarch/svml_s_acosf16_core_avx512.S | 332 +++++++++++++++
> .../fpu/multiarch/svml_s_acosf4_core-sse2.S | 20 +
> .../x86_64/fpu/multiarch/svml_s_acosf4_core.c | 28 ++
> .../fpu/multiarch/svml_s_acosf4_core_sse4.S | 351 +++++++++++++++
> .../fpu/multiarch/svml_s_acosf8_core-sse.S | 20 +
> .../x86_64/fpu/multiarch/svml_s_acosf8_core.c | 28 ++
> .../fpu/multiarch/svml_s_acosf8_core_avx2.S | 332 +++++++++++++++
> sysdeps/x86_64/fpu/svml_d_acos2_core.S | 29 ++
> sysdeps/x86_64/fpu/svml_d_acos4_core.S | 29 ++
> sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S | 25 ++
> sysdeps/x86_64/fpu/svml_d_acos8_core.S | 25 ++
> sysdeps/x86_64/fpu/svml_s_acosf16_core.S | 25 ++
> sysdeps/x86_64/fpu/svml_s_acosf4_core.S | 29 ++
> sysdeps/x86_64/fpu/svml_s_acosf8_core.S | 29 ++
> sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S | 25 ++
> .../x86_64/fpu/test-double-libmvec-acos-avx.c | 1 +
> .../fpu/test-double-libmvec-acos-avx2.c | 1 +
> .../fpu/test-double-libmvec-acos-avx512f.c | 1 +
> sysdeps/x86_64/fpu/test-double-libmvec-acos.c | 3 +
> .../x86_64/fpu/test-double-vlen2-wrappers.c | 1 +
> .../fpu/test-double-vlen4-avx2-wrappers.c | 1 +
> .../x86_64/fpu/test-double-vlen4-wrappers.c | 1 +
> .../x86_64/fpu/test-double-vlen8-wrappers.c | 1 +
> .../x86_64/fpu/test-float-libmvec-acosf-avx.c | 1 +
> .../fpu/test-float-libmvec-acosf-avx2.c | 1 +
> .../fpu/test-float-libmvec-acosf-avx512f.c | 1 +
> sysdeps/x86_64/fpu/test-float-libmvec-acosf.c | 3 +
> .../x86_64/fpu/test-float-vlen16-wrappers.c | 1 +
> .../x86_64/fpu/test-float-vlen4-wrappers.c | 1 +
> .../fpu/test-float-vlen8-avx2-wrappers.c | 1 +
> .../x86_64/fpu/test-float-vlen8-wrappers.c | 1 +
> 51 files changed, 2781 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
> create mode 100644 sysdeps/x86_64/fpu/svml_d_acos2_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
> create mode 100644 sysdeps/x86_64/fpu/svml_d_acos8_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf16_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf4_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
> create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
> create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
> create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
> create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos.c
> create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
> create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
> create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
> create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
>
> diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h
> index b80ff332a0..2ccdd1fc53 100644
> --- a/bits/libm-simd-decl-stubs.h
> +++ b/bits/libm-simd-decl-stubs.h
> @@ -98,4 +98,15 @@
> #define __DECL_SIMD_powf32x
> #define __DECL_SIMD_powf64x
> #define __DECL_SIMD_powf128x
> +
> +#define __DECL_SIMD_acos
> +#define __DECL_SIMD_acosf
> +#define __DECL_SIMD_acosl
> +#define __DECL_SIMD_acosf16
> +#define __DECL_SIMD_acosf32
> +#define __DECL_SIMD_acosf64
> +#define __DECL_SIMD_acosf128
> +#define __DECL_SIMD_acosf32x
> +#define __DECL_SIMD_acosf64x
> +#define __DECL_SIMD_acosf128x
> #endif
> diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
> index da4cf4e10c..2cc6654208 100644
> --- a/math/bits/mathcalls.h
> +++ b/math/bits/mathcalls.h
> @@ -50,7 +50,7 @@
> /* Trigonometric functions. */
>
> /* Arc cosine of X. */
> -__MATHCALL (acos,, (_Mdouble_ __x));
> +__MATHCALL_VEC (acos,, (_Mdouble_ __x));
> /* Arc sine of X. */
> __MATHCALL (asin,, (_Mdouble_ __x));
> /* Arc tangent of X. */
> diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> index 363d4ace1e..b37b55777e 100644
> --- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> @@ -46,3 +46,11 @@ GLIBC_2.22 _ZGVeN8v_log F
> GLIBC_2.22 _ZGVeN8v_sin F
> GLIBC_2.22 _ZGVeN8vv_pow F
> GLIBC_2.22 _ZGVeN8vvv_sincos F
> +GLIBC_2.35 _ZGVbN2v_acos F
> +GLIBC_2.35 _ZGVbN4v_acosf F
> +GLIBC_2.35 _ZGVcN4v_acos F
> +GLIBC_2.35 _ZGVcN8v_acosf F
> +GLIBC_2.35 _ZGVdN4v_acos F
> +GLIBC_2.35 _ZGVdN8v_acosf F
> +GLIBC_2.35 _ZGVeN16v_acosf F
> +GLIBC_2.35 _ZGVeN8v_acos F
> diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h
> index dc0bfb3705..dabb74cbb9 100644
> --- a/sysdeps/x86/fpu/bits/math-vector.h
> +++ b/sysdeps/x86/fpu/bits/math-vector.h
> @@ -58,6 +58,10 @@
> # define __DECL_SIMD_pow __DECL_SIMD_x86_64
> # undef __DECL_SIMD_powf
> # define __DECL_SIMD_powf __DECL_SIMD_x86_64
> +# undef __DECL_SIMD_acos
> +# define __DECL_SIMD_acos __DECL_SIMD_x86_64
> +# undef __DECL_SIMD_acosf
> +# define __DECL_SIMD_acosf __DECL_SIMD_x86_64
>
> # endif
> #endif
> diff --git a/sysdeps/x86/fpu/finclude/math-vector-fortran.h b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> index 311bb4e391..4bcbd1fbce 100644
> --- a/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> +++ b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> @@ -28,6 +28,8 @@
> !GCC$ builtin (expf) attributes simd (notinbranch) if('x86_64')
> !GCC$ builtin (pow) attributes simd (notinbranch) if('x86_64')
> !GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
> +!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
> +!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
>
> !GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
> !GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
> @@ -41,3 +43,5 @@
> !GCC$ builtin (expf) attributes simd (notinbranch) if('x32')
> !GCC$ builtin (pow) attributes simd (notinbranch) if('x32')
> !GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
> +!GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
> +!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
> diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig
> index b0e3bf7887..7acf1f306c 100644
> --- a/sysdeps/x86_64/fpu/Makeconfig
> +++ b/sysdeps/x86_64/fpu/Makeconfig
> @@ -22,6 +22,7 @@ postclean-generated += libmvec.mk
>
> # Define for both math and mathvec directories.
> libmvec-funcs = \
> + acos \
> cos \
> exp \
> log \
> diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions
> index 08132045d6..3fd1379b17 100644
> --- a/sysdeps/x86_64/fpu/Versions
> +++ b/sysdeps/x86_64/fpu/Versions
> @@ -12,5 +12,9 @@ libmvec {
> _ZGVbN4v_expf; _ZGVcN8v_expf; _ZGVdN8v_expf; _ZGVeN16v_expf;
> _ZGVbN4vv_powf; _ZGVcN8vv_powf; _ZGVdN8vv_powf; _ZGVeN16vv_powf;
> _ZGVbN4vvv_sincosf; _ZGVcN8vvv_sincosf; _ZGVdN8vvv_sincosf; _ZGVeN16vvv_sincosf;
> +}
> + GLIBC_2.35 {
> + _ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
> + _ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
> }
> }
> diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
> index 312575f933..85a568ed29 100644
> --- a/sysdeps/x86_64/fpu/libm-test-ulps
> +++ b/sysdeps/x86_64/fpu/libm-test-ulps
> @@ -25,6 +25,26 @@ float: 1
> float128: 1
> ldouble: 2
>
> +Function: "acos_vlen16":
> +float: 1
> +
> +Function: "acos_vlen2":
> +double: 1
> +
> +Function: "acos_vlen4":
> +double: 1
> +float: 2
> +
> +Function: "acos_vlen4_avx2":
> +double: 1
> +
> +Function: "acos_vlen8":
> +double: 1
> +float: 2
> +
> +Function: "acos_vlen8_avx2":
> +float: 1
> +
> Function: "acosh":
> double: 2
> float: 2
> diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
> new file mode 100644
> index 0000000000..3aed563dde
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
> @@ -0,0 +1,39 @@
> +/* Common definition for libmathvec ifunc selections optimized with
> + AVX512.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <init-arch.h>
> +
> +#undef PASTER2
> +#define PASTER2(x,y) x##_##y
> +
> +extern void REDIRECT_NAME (void);
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_wrapper) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (skx) attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> + const struct cpu_features* cpu_features = __get_cpu_features ();
> +
> + if (!CPU_FEATURES_ARCH_P (cpu_features, MathVec_Prefer_No_AVX512)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ))
> + return OPTIMIZE (skx);
> +
> + return OPTIMIZE (avx2_wrapper);
> +}
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
> new file mode 100644
> index 0000000000..25fb8d0cac
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
> @@ -0,0 +1,20 @@
> +/* SSE2 version of vectorized acos, vector length is 2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define _ZGVbN2v_acos _ZGVbN2v_acos_sse2
> +#include "../svml_d_acos2_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
> new file mode 100644
> index 0000000000..5ba5d6fac2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized acos, vector length is 2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define SYMBOL_NAME _ZGVbN2v_acos
> +#include "ifunc-mathvec-sse4_1.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVbN2v_acos, __GI__ZGVbN2v_acos, __redirect__ZGVbN2v_acos)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
> new file mode 100644
> index 0000000000..0c898e70ab
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
> @@ -0,0 +1,399 @@
> +/* Function acos vectorized with SSE4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * SelMask = (|x| >= 0.5) ? 1 : 0;
> + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + */
> +
> +/* Offsets for data table __svml_dacos_data_internal
> + */
> +#define _SgnBit 0
> +#define _OneHalf 64
> +#define _SmallNorm 128
> +#define _dRsqrtMsk 192
> +#define _MOne 256
> +#define _HalfMask 320
> +#define _Two 384
> +#define _sqrt_coeff 448
> +#define _poly_coeff 512
> +#define _PiL 704
> +#define _PiH 768
> +#define _Pi2L 832
> +#define _Pi2H 896
> +#define _Zero 960
> +#define _SgnMask 1024
> +#define _NanMask 1088
> +#define _ep_coeff 1152
> +#define _dInfs 1280
> +#define _dOnes 1344
> +#define _dZeros 1408
> +
> +#include <sysdep.h>
> +
> + .text
> + .section .text.sse4,"ax",@progbits
> +ENTRY(_ZGVbN2v_acos_sse4)
> + pushq %rbp
> + cfi_def_cfa_offset(16)
> + movq %rsp, %rbp
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> + andq $-64, %rsp
> + subq $256, %rsp
> + movups __svml_dacos_data_internal(%rip), %xmm3
> + movups _OneHalf+__svml_dacos_data_internal(%rip), %xmm6
> +
> +/* x = -|arg| */
> + movaps %xmm3, %xmm2
> + orps %xmm0, %xmm2
> +
> +/* Y = 0.5 + 0.5*(-x) */
> + movaps %xmm6, %xmm4
> + mulpd %xmm2, %xmm4
> + addpd %xmm4, %xmm6
> +
> +/* S ~ 2*sqrt(Y) */
> + cvtpd2ps %xmm6, %xmm7
> +
> +/* NaN processed in special branch (so wind test passed) */
> + movups _MOne+__svml_dacos_data_internal(%rip), %xmm1
> +
> +/* x^2 */
> + movaps %xmm2, %xmm5
> + cmpnlepd %xmm2, %xmm1
> + mulpd %xmm2, %xmm5
> + movmskpd %xmm1, %edx
> + movlhps %xmm7, %xmm7
> + andps %xmm0, %xmm3
> + movups %xmm8, 144(%rsp)
> + rsqrtps %xmm7, %xmm1
> + minpd %xmm6, %xmm5
> + cvtps2pd %xmm1, %xmm8
> + movaps %xmm6, %xmm1
> + movaps %xmm5, %xmm4
> + cmpltpd _SmallNorm+__svml_dacos_data_internal(%rip), %xmm1
> + cmpnltpd %xmm6, %xmm4
> + addpd %xmm6, %xmm6
> + andnps %xmm8, %xmm1
> + movups %xmm14, 176(%rsp)
> + movaps %xmm1, %xmm14
> + mulpd %xmm1, %xmm14
> + mulpd %xmm6, %xmm1
> + mulpd %xmm14, %xmm6
> + subpd _Two+__svml_dacos_data_internal(%rip), %xmm6
> + movups %xmm15, 160(%rsp)
> + movaps %xmm6, %xmm8
> + movups _sqrt_coeff+__svml_dacos_data_internal(%rip), %xmm15
> + mulpd %xmm6, %xmm15
> + mulpd %xmm1, %xmm8
> + addpd _sqrt_coeff+16+__svml_dacos_data_internal(%rip), %xmm15
> + mulpd %xmm6, %xmm15
> + addpd _sqrt_coeff+32+__svml_dacos_data_internal(%rip), %xmm15
> + mulpd %xmm15, %xmm6
> + addpd _sqrt_coeff+48+__svml_dacos_data_internal(%rip), %xmm6
> + mulpd %xmm6, %xmm8
> +
> +/* polynomial */
> + movups _poly_coeff+__svml_dacos_data_internal(%rip), %xmm6
> + movaps %xmm5, %xmm15
> + mulpd %xmm5, %xmm6
> + mulpd %xmm5, %xmm15
> + addpd _poly_coeff+16+__svml_dacos_data_internal(%rip), %xmm6
> + subpd %xmm8, %xmm1
> + mulpd %xmm15, %xmm6
> + movups _poly_coeff+32+__svml_dacos_data_internal(%rip), %xmm14
> + andps %xmm4, %xmm1
> + mulpd %xmm5, %xmm14
> + movups _poly_coeff+64+__svml_dacos_data_internal(%rip), %xmm7
> + mulpd %xmm5, %xmm7
> + addpd _poly_coeff+48+__svml_dacos_data_internal(%rip), %xmm14
> + addpd _poly_coeff+80+__svml_dacos_data_internal(%rip), %xmm7
> + addpd %xmm6, %xmm14
> + mulpd %xmm15, %xmm7
> + movups _poly_coeff+96+__svml_dacos_data_internal(%rip), %xmm8
> + movaps %xmm15, %xmm6
> + mulpd %xmm5, %xmm8
> + mulpd %xmm15, %xmm6
> + addpd _poly_coeff+112+__svml_dacos_data_internal(%rip), %xmm8
> + mulpd %xmm6, %xmm14
> + addpd %xmm7, %xmm8
> + movups _poly_coeff+128+__svml_dacos_data_internal(%rip), %xmm7
> + mulpd %xmm5, %xmm7
> + addpd %xmm14, %xmm8
> + addpd _poly_coeff+144+__svml_dacos_data_internal(%rip), %xmm7
> + mulpd %xmm15, %xmm8
> + movups _poly_coeff+160+__svml_dacos_data_internal(%rip), %xmm6
> +
> +/* X<X^2 iff X<0 */
> + movaps %xmm0, %xmm14
> + addpd %xmm8, %xmm7
> + cmpltpd %xmm5, %xmm14
> + mulpd %xmm5, %xmm6
> + mulpd %xmm7, %xmm15
> + addpd _poly_coeff+176+__svml_dacos_data_internal(%rip), %xmm6
> + addpd %xmm15, %xmm6
> + mulpd %xmm5, %xmm6
> + movaps %xmm4, %xmm7
> + movaps %xmm4, %xmm5
> + andnps %xmm2, %xmm7
> + orps %xmm1, %xmm7
> + pxor %xmm3, %xmm7
> + mulpd %xmm7, %xmm6
> + movups _PiH+__svml_dacos_data_internal(%rip), %xmm8
> + andps %xmm4, %xmm8
> + andnps _Pi2H+__svml_dacos_data_internal(%rip), %xmm5
> + andps %xmm14, %xmm8
> + addpd %xmm5, %xmm8
> + addpd %xmm6, %xmm7
> + addpd %xmm7, %xmm8
> + testl %edx, %edx
> +
> +/* Go to special inputs processing branch */
> + jne L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> + movups 176(%rsp), %xmm14
> + cfi_restore(31)
> + movaps %xmm8, %xmm0
> + movups 144(%rsp), %xmm8
> + cfi_restore(25)
> + movups 160(%rsp), %xmm15
> + cfi_restore(32)
> + movq %rbp, %rsp
> + popq %rbp
> + cfi_def_cfa(7, 8)
> + cfi_restore(6)
> + ret
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> + movups %xmm0, 128(%rsp)
> + movups %xmm8, 192(%rsp)
> + xorl %eax, %eax
> + movups %xmm9, 64(%rsp)
> + movups %xmm10, 48(%rsp)
> + movups %xmm11, 32(%rsp)
> + movups %xmm12, 16(%rsp)
> + movups %xmm13, (%rsp)
> + movq %rsi, 88(%rsp)
> + movq %rdi, 80(%rsp)
> + movq %r12, 112(%rsp)
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -144; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x70, 0xff, 0xff, 0xff, 0x22
> + movl %eax, %r12d
> + movq %r13, 104(%rsp)
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -152; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x68, 0xff, 0xff, 0xff, 0x22
> + movl %edx, %r13d
> + movq %r14, 96(%rsp)
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -160; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x60, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> + btl %r12d, %r13d
> +
> +/* Call scalar math function */
> + jc L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> + incl %r12d
> + cmpl $2, %r12d
> +
> +/* Check bits in range mask */
> + jl L(RANGEMASK_CHECK)
> + movups 64(%rsp), %xmm9
> + cfi_restore(26)
> + movups 48(%rsp), %xmm10
> + cfi_restore(27)
> + movups 32(%rsp), %xmm11
> + cfi_restore(28)
> + movups 16(%rsp), %xmm12
> + cfi_restore(29)
> + movups (%rsp), %xmm13
> + cfi_restore(30)
> + movq 88(%rsp), %rsi
> + cfi_restore(4)
> + movq 80(%rsp), %rdi
> + cfi_restore(5)
> + movq 112(%rsp), %r12
> + cfi_restore(12)
> + movq 104(%rsp), %r13
> + cfi_restore(13)
> + movq 96(%rsp), %r14
> + cfi_restore(14)
> + movups 192(%rsp), %xmm8
> +
> +/* Go to exit */
> + jmp L(EXIT)
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -144; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x70, 0xff, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -152; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x68, 0xff, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -160; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x60, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> + movl %r12d, %r14d
> + movsd 128(%rsp,%r14,8), %xmm0
> + call acos@PLT
> + movsd %xmm0, 192(%rsp,%r14,8)
> +
> +/* Process special inputs in loop */
> + jmp L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVbN2v_acos_sse4)
> +
> + .section .rodata, "a"
> + .align 64
> +
> +#ifdef __svml_dacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(64)) VUINT32 SgnBit[2][2];
> + __declspec(align(64)) VUINT32 OneHalf[2][2];
> + __declspec(align(64)) VUINT32 SmallNorm[2][2];
> + __declspec(align(64)) VUINT32 dRsqrtMsk[2][2];
> + __declspec(align(64)) VUINT32 MOne[2][2];
> + __declspec(align(64)) VUINT32 HalfMask[2][2];
> + __declspec(align(64)) VUINT32 Two[2][2];
> + __declspec(align(64)) VUINT32 sqrt_coeff[4][2][2];
> + __declspec(align(64)) VUINT32 poly_coeff[12][2][2];
> + __declspec(align(64)) VUINT32 PiL[2][2];
> + __declspec(align(64)) VUINT32 PiH[2][2];
> + __declspec(align(64)) VUINT32 Pi2L[2][2];
> + __declspec(align(64)) VUINT32 Pi2H[2][2];
> + __declspec(align(64)) VUINT32 Zero[2][2];
> + __declspec(align(64)) VUINT32 SgnMask[2][2];
> + __declspec(align(64)) VUINT32 NanMask[2][2];
> + __declspec(align(64)) VUINT32 ep_coeff[6][2][2];
> +        /* scalar part follows */
> + __declspec(align(64)) VUINT32 dInfs[2][2];
> + __declspec(align(64)) VUINT32 dOnes[2][2];
> + __declspec(align(64)) VUINT32 dZeros[2][2];
> +} __svml_dacos_data_internal_t;
> +#endif
> +__svml_dacos_data_internal:
> + /*== SgnBit ==*/
> + .quad 0x8000000000000000, 0x8000000000000000
> + /*== OneHalf ==*/
> + .align 64
> + .quad 0x3fe0000000000000, 0x3fe0000000000000
> + /*== SmallNorm ==*/
> + .align 64
> + .quad 0x3000000000000000, 0x3000000000000000
> + /*== dRsqrtMsk ==*/
> + .align 64
> + .quad 0xffffff0000000000, 0xffffff0000000000
> + /*== MOne ==*/
> + .align 64
> + .quad 0xbff0000000000000, 0xbff0000000000000
> + /*== HalfMask ==*/
> + .align 64
> + .quad 0xfffffffffc000000, 0xfffffffffc000000
> + /*== Two ==*/
> + .align 64
> + .quad 0x4000000000000000, 0x4000000000000000
> + /*== sqrt_coeff[4] ==*/
> + .align 64
> + .quad 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
> + .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
> + .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
> + .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
> + /*== poly_coeff[12] ==*/
> + .align 64
> + .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
> + .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
> + .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
> + .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
> + .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
> + .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
> + .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
> + .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
> + .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
> + .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
> + .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
> + .quad 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
> + /*== PiL ==*/
> + .align 64
> + .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07
> + /*== PiH ==*/
> + .align 64
> + .quad 0x400921fb54442d18, 0x400921fb54442d18
> + /*== Pi2L ==*/
> + .align 64
> + .quad 0x3c91a62633145c07, 0x3c91a62633145c07
> + /*== Pi2H ==*/
> + .align 64
> + .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
> + /*== Zero ==*/
> + .align 64
> + .quad 0x0000000000000000, 0x0000000000000000
> + /*== sgn mask ==*/
> + .align 64
> + .quad 0x8000000000000000, 0x8000000000000000
> + /*== NaN mask ==*/
> + .align 64
> + .quad 0xfffc000000000000, 0xfffc000000000000
> + /*== ep_coeff[6] ==*/
> + .align 64
> + .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
> + .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
> + .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
> + .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
> + .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
> + .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
> + /* scalar part follows */
> + /*== dInfs = DP infinity, +/- ==*/
> + .align 64
> + .quad 0x7ff0000000000000, 0xfff0000000000000
> + /*== dOnes = DP one, +/- ==*/
> + .align 64
> + .quad 0x3ff0000000000000, 0xbff0000000000000
> + /*== dZeros = DP zero +/- ==*/
> + .align 64
> + .quad 0x0000000000000000, 0x8000000000000000
> + .align 64
> + .type __svml_dacos_data_internal,@object
> + .size __svml_dacos_data_internal,1472
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
> new file mode 100644
> index 0000000000..750f71c81c
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
> @@ -0,0 +1,20 @@
> +/* SSE version of vectorized acos, vector length is 4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define _ZGVdN4v_acos _ZGVdN4v_acos_sse_wrapper
> +#include "../svml_d_acos4_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
> new file mode 100644
> index 0000000000..6453e7ebe2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized acos, vector length is 4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define SYMBOL_NAME _ZGVdN4v_acos
> +#include "ifunc-mathvec-avx2.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVdN4v_acos, __GI__ZGVdN4v_acos, __redirect__ZGVdN4v_acos)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
> new file mode 100644
> index 0000000000..684d501a3d
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
> @@ -0,0 +1,368 @@
> +/* Function acos vectorized with AVX2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + https://www.gnu.org/licenses/. */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * SelMask = (|x| >= 0.5) ? 1 : 0;
> + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + */
> +
> +/* Offsets for data table __svml_dacos_data_internal
> + */
> +#define _SgnBit 0
> +#define _OneHalf 64
> +#define _SmallNorm 128
> +#define _dRsqrtMsk 192
> +#define _MOne 256
> +#define _HalfMask 320
> +#define _Two 384
> +#define _sqrt_coeff 448
> +#define _poly_coeff 576
> +#define _PiL 960
> +#define _PiH 1024
> +#define _Pi2L 1088
> +#define _Pi2H 1152
> +#define _Zero 1216
> +#define _SgnMask 1280
> +#define _NanMask 1344
> +#define _ep_coeff 1408
> +#define _dInfs 1600
> +#define _dOnes 1664
> +#define _dZeros 1728
> +
> +#include <sysdep.h>
> +
> + .text
> + .section .text.avx2,"ax",@progbits
> +ENTRY(_ZGVdN4v_acos_avx2)
> + pushq %rbp
> + cfi_def_cfa_offset(16)
> + movq %rsp, %rbp
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> + andq $-64, %rsp
> + subq $384, %rsp
> + vmovupd __svml_dacos_data_internal(%rip), %ymm6
> + vmovupd _OneHalf+__svml_dacos_data_internal(%rip), %ymm7
> + vmovups %ymm8, 96(%rsp)
> + vmovups %ymm10, 192(%rsp)
> + vmovups %ymm9, 160(%rsp)
> + vmovups %ymm11, 224(%rsp)
> + vmovups %ymm12, 256(%rsp)
> + vmovups %ymm13, 288(%rsp)
> + vmovups %ymm15, 352(%rsp)
> + vmovups %ymm14, 320(%rsp)
> + vmovapd %ymm0, %ymm5
> +
> +/* x = -|arg| */
> + vorpd %ymm5, %ymm6, %ymm4
> +
> +/* Y = 0.5 + 0.5*(-x) */
> + vfmadd231pd %ymm4, %ymm7, %ymm7
> +
> +/* x^2 */
> + vmulpd %ymm4, %ymm4, %ymm1
> +
> +/* NaN processed in special branch (so wind test passed) */
> + vcmpnge_uqpd _MOne+__svml_dacos_data_internal(%rip), %ymm4, %ymm0
> +
> +/* S ~ 2*sqrt(Y) */
> + vcmplt_oqpd _SmallNorm+__svml_dacos_data_internal(%rip), %ymm7, %ymm9
> + vaddpd %ymm7, %ymm7, %ymm13
> + vminpd %ymm7, %ymm1, %ymm2
> + vandpd %ymm5, %ymm6, %ymm3
> + vcvtpd2ps %ymm7, %xmm6
> + vcmpnlt_uqpd %ymm7, %ymm2, %ymm1
> + vmovupd _poly_coeff+192+__svml_dacos_data_internal(%rip), %ymm7
> + vrsqrtps %xmm6, %xmm8
> + vmovupd _poly_coeff+64+__svml_dacos_data_internal(%rip), %ymm6
> + vfmadd213pd _poly_coeff+224+__svml_dacos_data_internal(%rip), %ymm2, %ymm7
> + vcvtps2pd %xmm8, %ymm10
> + vfmadd213pd _poly_coeff+96+__svml_dacos_data_internal(%rip), %ymm2, %ymm6
> + vandnpd %ymm10, %ymm9, %ymm11
> + vmulpd %ymm11, %ymm11, %ymm12
> + vmulpd %ymm13, %ymm11, %ymm15
> + vmovupd _poly_coeff+128+__svml_dacos_data_internal(%rip), %ymm9
> + vmulpd %ymm2, %ymm2, %ymm11
> + vmovupd _poly_coeff+256+__svml_dacos_data_internal(%rip), %ymm10
> + vfmsub213pd _Two+__svml_dacos_data_internal(%rip), %ymm12, %ymm13
> + vmovupd _poly_coeff+320+__svml_dacos_data_internal(%rip), %ymm12
> + vfmadd213pd _poly_coeff+160+__svml_dacos_data_internal(%rip), %ymm2, %ymm9
> + vmulpd %ymm11, %ymm11, %ymm8
> + vfmadd213pd _poly_coeff+288+__svml_dacos_data_internal(%rip), %ymm2, %ymm10
> + vmulpd %ymm13, %ymm15, %ymm14
> + vfmadd213pd _poly_coeff+352+__svml_dacos_data_internal(%rip), %ymm2, %ymm12
> + vfmadd213pd %ymm7, %ymm11, %ymm9
> + vmovmskpd %ymm0, %edx
> + vmovupd _sqrt_coeff+__svml_dacos_data_internal(%rip), %ymm0
> + vfmadd213pd _sqrt_coeff+32+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
> + vfmadd213pd _sqrt_coeff+64+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
> + vfmadd213pd _sqrt_coeff+96+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
> +
> +/* polynomial */
> + vmovupd _poly_coeff+__svml_dacos_data_internal(%rip), %ymm13
> + vfnmadd213pd %ymm15, %ymm14, %ymm0
> + vfmadd213pd _poly_coeff+32+__svml_dacos_data_internal(%rip), %ymm2, %ymm13
> + vblendvpd %ymm1, %ymm0, %ymm4, %ymm4
> + vfmadd213pd %ymm6, %ymm11, %ymm13
> +
> +/* X<X^2 iff X<0 */
> + vcmplt_oqpd %ymm2, %ymm5, %ymm6
> + vfmadd213pd %ymm9, %ymm8, %ymm13
> + vfmadd213pd %ymm10, %ymm11, %ymm13
> + vfmadd213pd %ymm12, %ymm11, %ymm13
> + vmulpd %ymm13, %ymm2, %ymm14
> + vxorpd %ymm3, %ymm4, %ymm3
> + vandpd _PiH+__svml_dacos_data_internal(%rip), %ymm1, %ymm2
> + vfmadd213pd %ymm3, %ymm3, %ymm14
> + vandpd %ymm6, %ymm2, %ymm2
> + vandnpd _Pi2H+__svml_dacos_data_internal(%rip), %ymm1, %ymm7
> + vaddpd %ymm7, %ymm2, %ymm8
> + vaddpd %ymm14, %ymm8, %ymm0
> + testl %edx, %edx
> +
> +/* Go to special inputs processing branch */
> + jne L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> + vmovups 96(%rsp), %ymm8
> + cfi_restore(91)
> + vmovups 160(%rsp), %ymm9
> + cfi_restore(92)
> + vmovups 192(%rsp), %ymm10
> + cfi_restore(93)
> + vmovups 224(%rsp), %ymm11
> + cfi_restore(94)
> + vmovups 256(%rsp), %ymm12
> + cfi_restore(95)
> + vmovups 288(%rsp), %ymm13
> + cfi_restore(96)
> + vmovups 320(%rsp), %ymm14
> + cfi_restore(97)
> + vmovups 352(%rsp), %ymm15
> + cfi_restore(98)
> + movq %rbp, %rsp
> + popq %rbp
> + cfi_def_cfa(7, 8)
> + cfi_restore(6)
> + ret
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> + vmovupd %ymm5, 64(%rsp)
> + vmovupd %ymm0, 128(%rsp)
> +
> +/* Go to exit */
> + je L(EXIT)
> + xorl %eax, %eax
> + vzeroupper
> + movq %rsi, 8(%rsp)
> + movq %rdi, (%rsp)
> + movq %r12, 32(%rsp)
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
> + movl %eax, %r12d
> + movq %r13, 24(%rsp)
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
> + movl %edx, %r13d
> + movq %r14, 16(%rsp)
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> + btl %r12d, %r13d
> +
> +/* Call scalar math function */
> + jc L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> + incl %r12d
> + cmpl $4, %r12d
> +
> +/* Check bits in range mask */
> + jl L(RANGEMASK_CHECK)
> + movq 8(%rsp), %rsi
> + cfi_restore(4)
> + movq (%rsp), %rdi
> + cfi_restore(5)
> + movq 32(%rsp), %r12
> + cfi_restore(12)
> + movq 24(%rsp), %r13
> + cfi_restore(13)
> + movq 16(%rsp), %r14
> + cfi_restore(14)
> + vmovupd 128(%rsp), %ymm0
> +
> +/* Go to exit */
> + jmp L(EXIT)
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> + movl %r12d, %r14d
> + movsd 64(%rsp,%r14,8), %xmm0
> + call acos@PLT
> + movsd %xmm0, 128(%rsp,%r14,8)
> +
> +/* Process special inputs in loop */
> + jmp L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVdN4v_acos_avx2)
> +
> + .section .rodata, "a"
> + .align 64
> +
> +#ifdef __svml_dacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(64)) VUINT32 SgnBit[4][2];
> + __declspec(align(64)) VUINT32 OneHalf[4][2];
> + __declspec(align(64)) VUINT32 SmallNorm[4][2];
> + __declspec(align(64)) VUINT32 dRsqrtMsk[4][2];
> + __declspec(align(64)) VUINT32 MOne[4][2];
> + __declspec(align(64)) VUINT32 HalfMask[4][2];
> + __declspec(align(64)) VUINT32 Two[4][2];
> + __declspec(align(64)) VUINT32 sqrt_coeff[4][4][2];
> + __declspec(align(64)) VUINT32 poly_coeff[12][4][2];
> + __declspec(align(64)) VUINT32 PiL[4][2];
> + __declspec(align(64)) VUINT32 PiH[4][2];
> + __declspec(align(64)) VUINT32 Pi2L[4][2];
> + __declspec(align(64)) VUINT32 Pi2H[4][2];
> + __declspec(align(64)) VUINT32 Zero[4][2];
> + __declspec(align(64)) VUINT32 SgnMask[4][2];
> + __declspec(align(64)) VUINT32 NanMask[4][2];
> + __declspec(align(64)) VUINT32 ep_coeff[6][4][2];
> + /* scalar part follows */
> + __declspec(align(64)) VUINT32 dInfs[2][2];
> + __declspec(align(64)) VUINT32 dOnes[2][2];
> + __declspec(align(64)) VUINT32 dZeros[2][2];
> +} __svml_dacos_data_internal_t;
> +#endif
> +__svml_dacos_data_internal:
> + /*== SgnBit ==*/
> + .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
> + /*== OneHalf ==*/
> + .align 64
> + .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
> + /*== SmallNorm ==*/
> + .align 64
> + .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
> + /*== dRsqrtMsk ==*/
> + .align 64
> + .quad 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000
> + /*== MOne ==*/
> + .align 64
> + .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
> + /*== HalfMask ==*/
> + .align 64
> + .quad 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000
> + /*== Two ==*/
> + .align 64
> + .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
> + /*== sqrt_coeff[4] ==*/
> + .align 64
> + .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
> + .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
> + .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
> + .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
> + /*== poly_coeff[12] ==*/
> + .align 64
> + .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
> + .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
> + .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
> + .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
> + .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
> + .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
> + .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
> + .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
> + .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
> + .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
> + .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
> + .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
> + /*== PiL ==*/
> + .align 64
> + .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07
> + /*== PiH ==*/
> + .align 64
> + .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
> + /*== Pi2L ==*/
> + .align 64
> + .quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
> + /*== Pi2H ==*/
> + .align 64
> + .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
> + /*== Zero ==*/
> + .align 64
> + .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
> + /*== sgn mask ==*/
> + .align 64
> + .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
> + /*== NaN mask ==*/
> + .align 64
> + .quad 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000
> + /*== ep_coeff[6] ==*/
> + .align 64
> + .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
> + .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
> + .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
> + .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
> + .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
> + .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
> + /* scalar part follows */
> + /*== dInfs = DP infinity, +/- ==*/
> + .align 64
> + .quad 0x7ff0000000000000, 0xfff0000000000000
> + /*== dOnes = DP one, +/- ==*/
> + .align 64
> + .quad 0x3ff0000000000000, 0xbff0000000000000
> + /*== dZeros = DP zero +/- ==*/
> + .align 64
> + .quad 0x0000000000000000, 0x8000000000000000
> + .align 64
> + .type __svml_dacos_data_internal,@object
> + .size __svml_dacos_data_internal,1792
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
> new file mode 100644
> index 0000000000..4d64fd1c00
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
> @@ -0,0 +1,20 @@
> +/* AVX2 version of vectorized acos, vector length is 8.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define _ZGVeN8v_acos _ZGVeN8v_acos_avx2_wrapper
> +#include "../svml_d_acos8_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
> new file mode 100644
> index 0000000000..1e7d1865fb
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized acos, vector length is 8.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define SYMBOL_NAME _ZGVeN8v_acos
> +#include "ifunc-mathvec-avx512-skx.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVeN8v_acos, __GI__ZGVeN8v_acos, __redirect__ZGVeN8v_acos)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
> new file mode 100644
> index 0000000000..52832893ec
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
> @@ -0,0 +1,386 @@
> +/* Function acos vectorized with AVX-512.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + https://www.gnu.org/licenses/. */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * SelMask = (|x| >= 0.5) ? 1 : 0;
> + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + */
> +
> +/* Offsets for data table __svml_dacos_data_internal
> + */
> +#define _SgnBit 0
> +#define _OneHalf 64
> +#define _SmallNorm 128
> +#define _dRsqrtMsk 192
> +#define _MOne 256
> +#define _HalfMask 320
> +#define _Two 384
> +#define _sqrt_coeff_1 448
> +#define _sqrt_coeff_2 512
> +#define _sqrt_coeff_3 576
> +#define _sqrt_coeff_4 640
> +#define _poly_coeff_1 704
> +#define _poly_coeff_2 768
> +#define _poly_coeff_3 832
> +#define _poly_coeff_4 896
> +#define _poly_coeff_5 960
> +#define _poly_coeff_6 1024
> +#define _poly_coeff_7 1088
> +#define _poly_coeff_8 1152
> +#define _poly_coeff_9 1216
> +#define _poly_coeff_10 1280
> +#define _poly_coeff_11 1344
> +#define _poly_coeff_12 1408
> +#define _PiL 1472
> +#define _PiH 1536
> +#define _Pi2L 1600
> +#define _Pi2H 1664
> +#define _Zero 1728
> +#define _SgnMask 1792
> +#define _NanMask 1856
> +#define _ep_coeff_1 1920
> +#define _ep_coeff_2 1984
> +#define _ep_coeff_3 2048
> +#define _ep_coeff_4 2112
> +#define _ep_coeff_5 2176
> +#define _ep_coeff_6 2240
> +#define _dInfs 2304
> +#define _dOnes 2368
> +#define _dZeros 2432
> +
> +#include <sysdep.h>
> +
> + .text
> + .section .text.evex512,"ax",@progbits
> +ENTRY(_ZGVeN8v_acos_skx)
> + pushq %rbp
> + cfi_def_cfa_offset(16)
> + movq %rsp, %rbp
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> + andq $-64, %rsp
> + subq $256, %rsp
> + vmovups __svml_dacos_data_internal(%rip), %zmm7
> + vmovups _OneHalf+__svml_dacos_data_internal(%rip), %zmm8
> +
> +/* S ~ 2*sqrt(Y) */
> + vmovups _SmallNorm+__svml_dacos_data_internal(%rip), %zmm11
> + vmovups _Two+__svml_dacos_data_internal(%rip), %zmm14
> + vmovups _sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15
> + vmovups _sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2
> + vmovups _sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1
> + vmovups _MOne+__svml_dacos_data_internal(%rip), %zmm10
> + vmovaps %zmm0, %zmm6
> +
> +/* x = -|arg| */
> + vorpd %zmm6, %zmm7, %zmm5
> + vandpd %zmm6, %zmm7, %zmm4
> +
> +/* Y = 0.5 + 0.5*(-x) */
> + vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8
> +
> +/* x^2 */
> + vmulpd {rn-sae}, %zmm5, %zmm5, %zmm9
> + vrsqrt14pd %zmm8, %zmm12
> + vcmppd $17, {sae}, %zmm11, %zmm8, %k2
> + vcmppd $17, {sae}, %zmm10, %zmm5, %k0
> + vmovups _poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10
> + vmovups _poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11
> + vminpd {sae}, %zmm8, %zmm9, %zmm3
> + vmovups _poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9
> + vxorpd %zmm12, %zmm12, %zmm12{%k2}
> + vaddpd {rn-sae}, %zmm8, %zmm8, %zmm0
> + vcmppd $21, {sae}, %zmm8, %zmm3, %k1
> +
> +/* X<X^2 iff X<0 */
> + vcmppd $17, {sae}, %zmm3, %zmm6, %k3
> + vmulpd {rn-sae}, %zmm12, %zmm12, %zmm13
> + vmulpd {rn-sae}, %zmm12, %zmm0, %zmm7
> + vmovups _poly_coeff_4+__svml_dacos_data_internal(%rip), %zmm12
> +
> +/* polynomial */
> + vmovups _poly_coeff_1+__svml_dacos_data_internal(%rip), %zmm8
> + vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0
> + vmovups _sqrt_coeff_4+__svml_dacos_data_internal(%rip), %zmm13
> + vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12
> + vmovups _poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm9
> + vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2
> + vmovups _poly_coeff_9+__svml_dacos_data_internal(%rip), %zmm15
> + vmulpd {rn-sae}, %zmm0, %zmm7, %zmm14
> + vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2
> + vmovups _poly_coeff_2+__svml_dacos_data_internal(%rip), %zmm1
> + kmovw %k1, %eax
> + kmovw %k3, %ecx
> + kmovw %k0, %edx
> + vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2
> + vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1
> + vmovups _poly_coeff_10+__svml_dacos_data_internal(%rip), %zmm8
> + vmulpd {rn-sae}, %zmm3, %zmm3, %zmm0
> + vfnmadd213pd {rn-sae}, %zmm7, %zmm14, %zmm2
> + vmovups _poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm7
> + vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8
> + vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1
> + vblendmpd %zmm2, %zmm5, %zmm2{%k1}
> + vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7
> + vmovups _poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm10
> + vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
> + andl %eax, %ecx
> + vmovups _poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
> + kmovw %ecx, %k2
> + vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7
> + vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11
> + vmulpd {rn-sae}, %zmm0, %zmm0, %zmm10
> + vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1
> + vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1
> + vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
> + vmovups _Pi2H+__svml_dacos_data_internal(%rip), %zmm0
> + vmulpd {rn-sae}, %zmm3, %zmm1, %zmm1
> + vxorpd %zmm4, %zmm2, %zmm3
> + vxorpd %zmm0, %zmm0, %zmm0{%k1}
> + vfmadd213pd {rn-sae}, %zmm3, %zmm3, %zmm1
> + vorpd _PiH+__svml_dacos_data_internal(%rip), %zmm0, %zmm0{%k2}
> + vaddpd {rn-sae}, %zmm1, %zmm0, %zmm0
> + testl %edx, %edx
> +
> +/* Go to special inputs processing branch */
> + jne L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> + movq %rbp, %rsp
> + popq %rbp
> + cfi_def_cfa(7, 8)
> + cfi_restore(6)
> + ret
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> + vmovups %zmm6, 128(%rsp)
> + vmovups %zmm0, 192(%rsp)
> +
> +/* Go to exit */
> + je L(EXIT)
> + xorl %eax, %eax
> + vzeroupper
> + kmovw %k4, 24(%rsp)
> + kmovw %k5, 16(%rsp)
> + kmovw %k6, 8(%rsp)
> + kmovw %k7, (%rsp)
> + movq %rsi, 40(%rsp)
> + movq %rdi, 32(%rsp)
> + movq %r12, 64(%rsp)
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> + movl %eax, %r12d
> + movq %r13, 56(%rsp)
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
> + movl %edx, %r13d
> + movq %r14, 48(%rsp)
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> + btl %r12d, %r13d
> +
> +/* Call scalar math function */
> + jc L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> + incl %r12d
> + cmpl $8, %r12d
> +
> +/* Check bits in range mask */
> + jl L(RANGEMASK_CHECK)
> + kmovw 24(%rsp), %k4
> + cfi_restore(122)
> + kmovw 16(%rsp), %k5
> + cfi_restore(123)
> + kmovw 8(%rsp), %k6
> + cfi_restore(124)
> + kmovw (%rsp), %k7
> + cfi_restore(125)
> + vmovups 192(%rsp), %zmm0
> + movq 40(%rsp), %rsi
> + cfi_restore(4)
> + movq 32(%rsp), %rdi
> + cfi_restore(5)
> + movq 64(%rsp), %r12
> + cfi_restore(12)
> + movq 56(%rsp), %r13
> + cfi_restore(13)
> + movq 48(%rsp), %r14
> + cfi_restore(14)
> +
> +/* Go to exit */
> + jmp L(EXIT)
Is there any control flow from L(SPECIAL_VALUES_BRANCH) that doesn't go to exit?
If not, I think a lot of the save/restores are pretty unnecessary.
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> + movl %r12d, %r14d
> + movsd 128(%rsp,%r14,8), %xmm0
> + call acos@PLT
> + movsd %xmm0, 192(%rsp,%r14,8)
> +
> +/* Process special inputs in loop */
> + jmp L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVeN8v_acos_skx)
> +
> + .section .rodata, "a"
> + .align 64
> +
> +#ifdef __svml_dacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(64)) VUINT32 SgnBit[8][2];
> + __declspec(align(64)) VUINT32 OneHalf[8][2];
> + __declspec(align(64)) VUINT32 SmallNorm[8][2];
> + __declspec(align(64)) VUINT32 dRsqrtMsk[8][2];
> + __declspec(align(64)) VUINT32 MOne[8][2];
> + __declspec(align(64)) VUINT32 HalfMask[8][2];
> + __declspec(align(64)) VUINT32 Two[8][2];
> + __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2];
> + __declspec(align(64)) VUINT32 poly_coeff[12][8][2];
> + __declspec(align(64)) VUINT32 PiL[8][2];
> + __declspec(align(64)) VUINT32 PiH[8][2];
> + __declspec(align(64)) VUINT32 Pi2L[8][2];
> + __declspec(align(64)) VUINT32 Pi2H[8][2];
> + __declspec(align(64)) VUINT32 Zero[8][2];
> + __declspec(align(64)) VUINT32 SgnMask[8][2];
> + __declspec(align(64)) VUINT32 NanMask[8][2];
> + __declspec(align(64)) VUINT32 ep_coeff[6][8][2];
> + /* scalar part follow */
> + __declspec(align(64)) VUINT32 dInfs[2][2];
> + __declspec(align(64)) VUINT32 dOnes[2][2];
> + __declspec(align(64)) VUINT32 dZeros[2][2];
> +} __svml_dacos_data_internal_t;
> +#endif
> +__svml_dacos_data_internal:
> + /*== SgnBit ==*/
> + .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
> + /*== OneHalf ==*/
> + .align 64
> + .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
> + /*== SmallNorm ==*/
> + .align 64
> + .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
> + /*== dRsqrtMsk ==*/
> + .align 64
> + .quad 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000
> + /*== MOne ==*/
> + .align 64
> + .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
> + /*== HalfMask ==*/
> + .align 64
> + .quad 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000
> + /*== Two ==*/
> + .align 64
> + .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
> + /*== sqrt_coeff[4] ==*/
> + .align 64
> + .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
> + .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
> + .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
> + .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
> + /*== poly_coeff[12] ==*/
> + .align 64
> + .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
> + .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
> + .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
> + .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
> + .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
> + .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
> + .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
> + .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
> + .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
> + .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
> + .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
> + .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
> + /*== PiL ==*/
> + .align 64
> + .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07
> + /*== PiH ==*/
> + .align 64
> + .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
> + /*== Pi2L ==*/
> + .align 64
> + .quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
> + /*== Pi2H ==*/
> + .align 64
> + .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
> + /*== Zero ==*/
> + .align 64
> + .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
> + /*== sgn mask ==*/
> + .align 64
> + .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
> + /*== NaN mask ==*/
> + .align 64
> + .quad 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000
> + /*== ep_coeff[6] ==*/
> + .align 64
> + .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
> + .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
> + .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
> + .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
> + .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
> + .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
> + /* scalar part follows */
> + /*== dInfs = DP infinity, +/- ==*/
> + .align 64
> + .quad 0x7ff0000000000000, 0xfff0000000000000
> + /*== dOnes = DP one, +/- ==*/
> + .align 64
> + .quad 0x3ff0000000000000, 0xbff0000000000000
> + /*== dZeros = DP zero +/- ==*/
> + .align 64
> + .quad 0x0000000000000000, 0x8000000000000000
> + .align 64
> + .type __svml_dacos_data_internal,@object
> + .size __svml_dacos_data_internal,2496
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
> new file mode 100644
> index 0000000000..1ff0cfc8d5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
> @@ -0,0 +1,20 @@
> +/* AVX2 version of vectorized acosf.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define _ZGVeN16v_acosf _ZGVeN16v_acosf_avx2_wrapper
> +#include "../svml_s_acosf16_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
> new file mode 100644
> index 0000000000..fcf05782c5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized acosf, vector length is 16.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define SYMBOL_NAME _ZGVeN16v_acosf
> +#include "ifunc-mathvec-avx512-skx.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVeN16v_acosf, __GI__ZGVeN16v_acosf,
> + __redirect__ZGVeN16v_acosf)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
> new file mode 100644
> index 0000000000..d30b04a607
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
> @@ -0,0 +1,332 @@
> +/* Function acosf vectorized with AVX-512.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + https://www.gnu.org/licenses/. */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * SelMask = (|x| >= 0.5) ? 1 : 0;
> + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + *
> + */
> +
> +/* Offsets for data table __svml_sacos_data_internal
> + */
> +#define _SgnBit 0
> +#define _OneHalf 64
> +#define _sRsqrtMsk 128
> +#define _SmallNorm 192
> +#define _MOne 256
> +#define _HalfMask 320
> +#define _SQMask 384
> +#define _Two 448
> +#define _sqrt_coeff_1 512
> +#define _sqrt_coeff_2 576
> +#define _poly_coeff_1 640
> +#define _poly_coeff_2 704
> +#define _poly_coeff_3 768
> +#define _poly_coeff_4 832
> +#define _poly_coeff_5 896
> +#define _Pi2H 960
> +#define _Pi2L 1024
> +#define _PiH 1088
> +#define _PiL 1152
> +#define _Zero 1216
> +#define _SgnMask 1280
> +#define _NanMask 1344
> +#define _ep_coeff_1 1408
> +#define _ep_coeff_2 1472
> +#define _ep_coeff_3 1536
> +
> +#include <sysdep.h>
> +
> + .text
> + .section .text.exex512,"ax",@progbits
> +ENTRY(_ZGVeN16v_acosf_skx)
> + pushq %rbp
> + cfi_def_cfa_offset(16)
> + movq %rsp, %rbp
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> + andq $-64, %rsp
> + subq $256, %rsp
> + vmovups __svml_sacos_data_internal(%rip), %zmm5
> + vmovups _OneHalf+__svml_sacos_data_internal(%rip), %zmm6
> +
> +/* SQ ~ 2*sqrt(Y) */
> + vmovups _SmallNorm+__svml_sacos_data_internal(%rip), %zmm9
> + vmovups _MOne+__svml_sacos_data_internal(%rip), %zmm8
> + vmovups _Two+__svml_sacos_data_internal(%rip), %zmm12
> + vmovups _sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13
> + vmovaps %zmm0, %zmm4
> +
> +/* x = -|arg| */
> + vorps %zmm4, %zmm5, %zmm3
> + vandps %zmm4, %zmm5, %zmm2
> + vmovups _sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0
> +
> +/* Y = 0.5 + 0.5*(-x) */
> + vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6
> +
> +/* x^2 */
> + vmulps {rn-sae}, %zmm3, %zmm3, %zmm7
> + vrsqrt14ps %zmm6, %zmm10
> + vcmpps $17, {sae}, %zmm9, %zmm6, %k2
> + vcmpps $22, {sae}, %zmm3, %zmm8, %k0
> + vmovups _poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9
> + vminps {sae}, %zmm6, %zmm7, %zmm1
> + vmovups _poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7
> + vxorps %zmm10, %zmm10, %zmm10{%k2}
> + vaddps {rn-sae}, %zmm6, %zmm6, %zmm14
> + vmulps {rn-sae}, %zmm1, %zmm1, %zmm8
> + vmulps {rn-sae}, %zmm10, %zmm10, %zmm11
> + vmulps {rn-sae}, %zmm10, %zmm14, %zmm5
> + vcmpps $21, {sae}, %zmm6, %zmm1, %k1
> +
> +/* X<X^2 iff X<0 */
> + vcmpps $17, {sae}, %zmm1, %zmm4, %k3
> +
> +/* polynomial */
> + vmovups _poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6
> + vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14
> + vmovups _poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11
> + vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9
> + vmovups _poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10
> + vmovups _Pi2H+__svml_sacos_data_internal(%rip), %zmm12
> + vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0
> + vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11
> + vmulps {rn-sae}, %zmm14, %zmm5, %zmm15
> + vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11
> + vxorps %zmm12, %zmm12, %zmm12{%k1}
> + vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0
> + vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11
> + kmovw %k1, %eax
> + kmovw %k3, %ecx
> + kmovw %k0, %edx
> + vmulps {rn-sae}, %zmm1, %zmm11, %zmm13
> + vblendmps %zmm0, %zmm3, %zmm0{%k1}
> + vxorps %zmm2, %zmm0, %zmm1
> + andl %eax, %ecx
> + kmovw %ecx, %k2
> + vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13
> + vorps _PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k2}
> + vaddps {rn-sae}, %zmm13, %zmm12, %zmm0
> + testl %edx, %edx
> +
> +/* Go to special inputs processing branch */
> + jne L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> + movq %rbp, %rsp
> + popq %rbp
> + cfi_def_cfa(7, 8)
> + cfi_restore(6)
> + ret
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> + vmovups %zmm4, 128(%rsp)
> + vmovups %zmm0, 192(%rsp)
> +
> +/* Go to exit */
> + je L(EXIT)
> + xorl %eax, %eax
> + vzeroupper
> + kmovw %k4, 24(%rsp)
> + kmovw %k5, 16(%rsp)
> + kmovw %k6, 8(%rsp)
> + kmovw %k7, (%rsp)
> + movq %rsi, 40(%rsp)
> + movq %rdi, 32(%rsp)
> + movq %r12, 64(%rsp)
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> + movl %eax, %r12d
> + movq %r13, 56(%rsp)
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
> + movl %edx, %r13d
> + movq %r14, 48(%rsp)
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> + btl %r12d, %r13d
> +
> +/* Call scalar math function */
> + jc L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> + incl %r12d
> + cmpl $16, %r12d
> +
> +/* Check bits in range mask */
> + jl L(RANGEMASK_CHECK)
> + kmovw 24(%rsp), %k4
> + cfi_restore(122)
> + kmovw 16(%rsp), %k5
> + cfi_restore(123)
> + kmovw 8(%rsp), %k6
> + cfi_restore(124)
> + kmovw (%rsp), %k7
> + cfi_restore(125)
> + vmovups 192(%rsp), %zmm0
> + movq 40(%rsp), %rsi
> + cfi_restore(4)
> + movq 32(%rsp), %rdi
> + cfi_restore(5)
> + movq 64(%rsp), %r12
> + cfi_restore(12)
> + movq 56(%rsp), %r13
> + cfi_restore(13)
> + movq 48(%rsp), %r14
> + cfi_restore(14)
> +
> +/* Go to exit */
> + jmp L(EXIT)
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> + movl %r12d, %r14d
> + movss 128(%rsp,%r14,4), %xmm0
> + call acosf@PLT
> + movss %xmm0, 192(%rsp,%r14,4)
> +
> +/* Process special inputs in loop */
> + jmp L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVeN16v_acosf_skx)
> +
> + .section .rodata, "a"
> + .align 64
> +
> +#ifdef __svml_sacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(64)) VUINT32 SgnBit[16][1];
> + __declspec(align(64)) VUINT32 OneHalf[16][1];
> + __declspec(align(64)) VUINT32 sRsqrtMsk[16][1];
> + __declspec(align(64)) VUINT32 SmallNorm[16][1];
> + __declspec(align(64)) VUINT32 MOne[16][1];
> + __declspec(align(64)) VUINT32 HalfMask[16][1];
> + __declspec(align(64)) VUINT32 SQMask[16][1];
> + __declspec(align(64)) VUINT32 Two[16][1];
> + __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1];
> + __declspec(align(64)) VUINT32 poly_coeff[5][16][1];
> + __declspec(align(64)) VUINT32 Pi2H[16][1];
> + __declspec(align(64)) VUINT32 Pi2L[16][1];
> + __declspec(align(64)) VUINT32 PiH[16][1];
> + __declspec(align(64)) VUINT32 PiL[16][1];
> + __declspec(align(64)) VUINT32 Zero[16][1];
> + __declspec(align(64)) VUINT32 SgnMask[16][1];
> + __declspec(align(64)) VUINT32 NanMask[16][1];
> + __declspec(align(64)) VUINT32 ep_coeff[3][16][1];
> +} __svml_sacos_data_internal_t;
> +#endif
> +__svml_sacos_data_internal:
> + /*== SgnBit ==*/
> + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> + /*== OneHalf ==*/
> + .align 64
> + .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> + /*== sRsqrtMsk ==*/
> + .align 64
> + .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
> + /*== SmallNorm ==*/
> + .align 64
> + .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
> + /*== MOne ==*/
> + .align 64
> + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
> + /*== HalfMask ==*/
> + .align 64
> + .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
> + /*== SQMask ==*/
> + .align 64
> + .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
> + /*== Two ==*/
> + .align 64
> + .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
> + /*== sqrt_coeff[2] ==*/
> + .align 64
> + .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
> + .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
> + /*== poly_coeff[5] ==*/
> + .align 64
> + .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
> + .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
> + .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
> + .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
> + .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
> + /*== Pi2H ==*/
> + .align 64
> + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
> + /*== Pi2L ==*/
> + .align 64
> + .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
> + /*== PiH ==*/
> + .align 64
> + .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
> + /*== PiL ==*/
> + .align 64
> + .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
> + /*== zero ==*/
> + .align 64
> + .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
> + /*== sgn mask ==*/
> + .align 64
> + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> + /*== nan mask ==*/
> + .align 64
> + .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
> + /*== ep_coeff[3] ==*/
> + .align 64
> + .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
> + .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
> + .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
> + .align 64
> + .type __svml_sacos_data_internal,@object
> + .size __svml_sacos_data_internal,1600
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
> new file mode 100644
> index 0000000000..f94b3eb01a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
> @@ -0,0 +1,20 @@
> +/* SSE2 version of vectorized acosf, vector length is 4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define _ZGVbN4v_acosf _ZGVbN4v_acosf_sse2
> +#include "../svml_s_acosf4_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
> new file mode 100644
> index 0000000000..6f9a5c1082
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized acosf, vector length is 4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define SYMBOL_NAME _ZGVbN4v_acosf
> +#include "ifunc-mathvec-sse4_1.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVbN4v_acosf, __GI__ZGVbN4v_acosf,
> + __redirect__ZGVbN4v_acosf)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
> new file mode 100644
> index 0000000000..acfdc348aa
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
> @@ -0,0 +1,351 @@
> +/* Function acosf vectorized with SSE4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + https://www.gnu.org/licenses/. */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * SelMask = (|x| >= 0.5) ? 1 : 0;
> + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + *
> + */
> +
> +/* Offsets for data table __svml_sacos_data_internal
> + */
> +#define _SgnBit 0
> +#define _OneHalf 64
> +#define _sRsqrtMsk 128
> +#define _SmallNorm 192
> +#define _MOne 256
> +#define _HalfMask 320
> +#define _SQMask 384
> +#define _Two 448
> +#define _sqrt_coeff 512
> +#define _poly_coeff 576
> +#define _Pi2H 704
> +#define _Pi2L 768
> +#define _PiH 832
> +#define _PiL 896
> +#define _Zero 960
> +#define _SgnMask 1024
> +#define _NanMask 1088
> +#define _ep_coeff 1152
> +
> +#include <sysdep.h>
> +
> + .text
> + .section .text.sse4,"ax",@progbits
> +ENTRY(_ZGVbN4v_acosf_sse4)
> + pushq %rbp
> + cfi_def_cfa_offset(16)
> + movq %rsp, %rbp
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> + andq $-64, %rsp
> + subq $320, %rsp
> + movaps %xmm0, %xmm6
> +
> +/*
> + * 2*sqrt(X) ~ Sh - Sl (to 24+ bits)
> + * SQ ~ 2*sqrt(X)
> + */
> + movups __svml_sacos_data_internal(%rip), %xmm5
> + movups _OneHalf+__svml_sacos_data_internal(%rip), %xmm0
> +
> +/* x = -|arg| */
> + movaps %xmm5, %xmm7
> + orps %xmm6, %xmm7
> +
> +/* Y = 0.5 + 0.5*(-x) */
> + movaps %xmm0, %xmm2
> + mulps %xmm7, %xmm2
> +
> +/* x^2 */
> + movaps %xmm7, %xmm3
> + mulps %xmm7, %xmm3
> + addps %xmm2, %xmm0
> + movups _MOne+__svml_sacos_data_internal(%rip), %xmm4
> + andps %xmm6, %xmm5
> + cmpnleps %xmm7, %xmm4
> + minps %xmm0, %xmm3
> +
> +/* SQ ~ 2*sqrt(Y) */
> + rsqrtps %xmm0, %xmm1
> + movmskps %xmm4, %edx
> + movaps %xmm0, %xmm4
> + movaps %xmm3, %xmm2
> + movups %xmm8, 160(%rsp)
> + cmpltps _SmallNorm+__svml_sacos_data_internal(%rip), %xmm4
> + cmpnltps %xmm0, %xmm2
> + addps %xmm0, %xmm0
> + andnps %xmm1, %xmm4
> + movaps %xmm4, %xmm8
> + mulps %xmm4, %xmm8
> + mulps %xmm0, %xmm4
> + mulps %xmm8, %xmm0
> + movups _sqrt_coeff+__svml_sacos_data_internal(%rip), %xmm1
> +
> +/* polynomial */
> + movups _poly_coeff+__svml_sacos_data_internal(%rip), %xmm8
> + mulps %xmm3, %xmm8
> + subps _Two+__svml_sacos_data_internal(%rip), %xmm0
> + mulps %xmm0, %xmm1
> + addps _poly_coeff+16+__svml_sacos_data_internal(%rip), %xmm8
> + mulps %xmm4, %xmm0
> + addps _sqrt_coeff+16+__svml_sacos_data_internal(%rip), %xmm1
> + mulps %xmm0, %xmm1
> + movaps %xmm3, %xmm0
> + mulps %xmm3, %xmm0
> + subps %xmm1, %xmm4
> + mulps %xmm0, %xmm8
> + movups _poly_coeff+32+__svml_sacos_data_internal(%rip), %xmm1
> + andps %xmm2, %xmm4
> + mulps %xmm3, %xmm1
> + movups _PiH+__svml_sacos_data_internal(%rip), %xmm0
> + andps %xmm2, %xmm0
> + addps _poly_coeff+48+__svml_sacos_data_internal(%rip), %xmm1
> + addps %xmm8, %xmm1
> +
> +/* X<X^2 iff X<0 */
> + movaps %xmm6, %xmm8
> + cmpltps %xmm3, %xmm8
> + mulps %xmm3, %xmm1
> + andps %xmm8, %xmm0
> + movaps %xmm2, %xmm8
> + andnps %xmm7, %xmm8
> + addps _poly_coeff+64+__svml_sacos_data_internal(%rip), %xmm1
> + mulps %xmm3, %xmm1
> + orps %xmm4, %xmm8
> + pxor %xmm5, %xmm8
> + movaps %xmm2, %xmm3
> + mulps %xmm8, %xmm1
> + andnps _Pi2H+__svml_sacos_data_internal(%rip), %xmm3
> + addps %xmm1, %xmm8
> + addps %xmm3, %xmm0
> + addps %xmm8, %xmm0
> + testl %edx, %edx
> +
> +/* Go to special inputs processing branch */
> + jne L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> + movups 160(%rsp), %xmm8
> + cfi_restore(25)
> + movq %rbp, %rsp
> + popq %rbp
> + cfi_def_cfa(7, 8)
> + cfi_restore(6)
> + ret
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> + movups %xmm6, 192(%rsp)
> + movups %xmm0, 256(%rsp)
> + xorl %eax, %eax
> + movups %xmm9, 96(%rsp)
> + movups %xmm10, 80(%rsp)
> + movups %xmm11, 64(%rsp)
> + movups %xmm12, 48(%rsp)
> + movups %xmm13, 32(%rsp)
> + movups %xmm14, 16(%rsp)
> + movups %xmm15, (%rsp)
> + movq %rsi, 120(%rsp)
> + movq %rdi, 112(%rsp)
> + movq %r12, 144(%rsp)
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> + movl %eax, %r12d
> + movq %r13, 136(%rsp)
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> + movl %edx, %r13d
> + movq %r14, 128(%rsp)
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> + btl %r12d, %r13d
> +
> +/* Call scalar math function */
> + jc L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> + incl %r12d
> + cmpl $4, %r12d
> +
> +/* Check bits in range mask */
> + jl L(RANGEMASK_CHECK)
> + movups 96(%rsp), %xmm9
> + cfi_restore(26)
> + movups 80(%rsp), %xmm10
> + cfi_restore(27)
> + movups 64(%rsp), %xmm11
> + cfi_restore(28)
> + movups 48(%rsp), %xmm12
> + cfi_restore(29)
> + movups 32(%rsp), %xmm13
> + cfi_restore(30)
> + movups 16(%rsp), %xmm14
> + cfi_restore(31)
> + movups (%rsp), %xmm15
> + cfi_restore(32)
> + movq 120(%rsp), %rsi
> + cfi_restore(4)
> + movq 112(%rsp), %rdi
> + cfi_restore(5)
> + movq 144(%rsp), %r12
> + cfi_restore(12)
> + movq 136(%rsp), %r13
> + cfi_restore(13)
> + movq 128(%rsp), %r14
> + cfi_restore(14)
> + movups 256(%rsp), %xmm0
> +
> +/* Go to exit */
> + jmp L(EXIT)
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> + movl %r12d, %r14d
> + movss 192(%rsp,%r14,4), %xmm0
> + call acosf@PLT
> + movss %xmm0, 256(%rsp,%r14,4)
> +
> +/* Process special inputs in loop */
> + jmp L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVbN4v_acosf_sse4)
> +
> + .section .rodata, "a"
> + .align 64
> +
> +#ifdef __svml_sacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(64)) VUINT32 SgnBit[4][1];
> + __declspec(align(64)) VUINT32 OneHalf[4][1];
> + __declspec(align(64)) VUINT32 sRsqrtMsk[4][1];
> + __declspec(align(64)) VUINT32 SmallNorm[4][1];
> + __declspec(align(64)) VUINT32 MOne[4][1];
> + __declspec(align(64)) VUINT32 HalfMask[4][1];
> + __declspec(align(64)) VUINT32 SQMask[4][1];
> + __declspec(align(64)) VUINT32 Two[4][1];
> + __declspec(align(64)) VUINT32 sqrt_coeff[2][4][1];
> + __declspec(align(64)) VUINT32 poly_coeff[5][4][1];
> + __declspec(align(64)) VUINT32 Pi2H[4][1];
> + __declspec(align(64)) VUINT32 Pi2L[4][1];
> + __declspec(align(64)) VUINT32 PiH[4][1];
> + __declspec(align(64)) VUINT32 PiL[4][1];
> + __declspec(align(64)) VUINT32 Zero[4][1];
> + __declspec(align(64)) VUINT32 SgnMask[4][1];
> + __declspec(align(64)) VUINT32 NanMask[4][1];
> + __declspec(align(64)) VUINT32 ep_coeff[3][4][1];
> +} __svml_sacos_data_internal_t;
> +#endif
> +__svml_sacos_data_internal:
Can the xmm/ymm/zmm versions of a function share the same internal data table?
> + /*== SgnBit ==*/
> + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
> + /*== OneHalf ==*/
> + .align 64
I think `.align 32` would suffice here, since each entry is only 16 bytes for the SSE4 version? (Note the `_OneHalf`/`_sRsqrtMsk`/... offset macros above assume 64-byte spacing, so they would need to be updated in step with any alignment change.)
> + .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> + /*== sRsqrtMsk ==*/
> + .align 64
> + .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
> + /*== SmallNorm ==*/
> + .align 64
> + .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
> + /*== MOne ==*/
> + .align 64
> + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
> + /*== HalfMask ==*/
> + .align 64
> + .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
> + /*== SQMask ==*/
> + .align 64
> + .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
> + /*== Two ==*/
> + .align 64
> + .long 0x40000000, 0x40000000, 0x40000000, 0x40000000
> + /*== sqrt_coeff[2] ==*/
> + .align 64
> + .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
> + .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
> + /*== poly_coeff[5] ==*/
> + .align 64
> + .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
> + .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
> + .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
> + .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
> + .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
> + /*== Pi2H ==*/
> + .align 64
> + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
> + /*== Pi2L ==*/
> + .align 64
> + .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
> + /*== PiH ==*/
> + .align 64
> + .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
> + /*== PiL ==*/
> + .align 64
> + .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
> + /*== zero ==*/
> + .align 64
> + .long 0x00000000, 0x00000000, 0x00000000, 0x00000000
> + /*== sgn mask ==*/
> + .align 64
> + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
> + /*== nan mask ==*/
> + .align 64
> + .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
> + /*== ep_coeff[3] ==*/
> + .align 64
> + .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
> + .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
> + .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
> + .align 64
> + .type __svml_sacos_data_internal,@object
> + .size __svml_sacos_data_internal,1216
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
> new file mode 100644
> index 0000000000..583ef54fee
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
> @@ -0,0 +1,20 @@
> +/* SSE version of vectorized acosf, vector length is 8.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define _ZGVdN8v_acosf _ZGVdN8v_acosf_sse_wrapper
> +#include "../svml_s_acosf8_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
> new file mode 100644
> index 0000000000..dd360a9479
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized acosf, vector length is 8.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define SYMBOL_NAME _ZGVdN8v_acosf
> +#include "ifunc-mathvec-avx2.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVdN8v_acosf, __GI__ZGVdN8v_acosf,
> + __redirect__ZGVdN8v_acosf)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
> new file mode 100644
> index 0000000000..6d800f9aa4
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
> @@ -0,0 +1,332 @@
> +/* Function acosf vectorized with AVX2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + https://www.gnu.org/licenses/. */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * SelMask = (|x| >= 0.5) ? 1 : 0;
> + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + *
> + */
> +
> +/* Offsets for data table __svml_sacos_data_internal
> + */
> +#define _SgnBit 0
> +#define _OneHalf 64
> +#define _sRsqrtMsk 128
> +#define _SmallNorm 192
> +#define _MOne 256
> +#define _HalfMask 320
> +#define _SQMask 384
> +#define _Two 448
> +#define _sqrt_coeff 512
> +#define _poly_coeff 576
> +#define _Pi2H 768
> +#define _Pi2L 832
> +#define _PiH 896
> +#define _PiL 960
> +#define _Zero 1024
> +#define _SgnMask 1088
> +#define _NanMask 1152
> +#define _ep_coeff 1216
> +
> +#include <sysdep.h>
> +
> + .text
> + .section .text.avx2,"ax",@progbits
> +ENTRY(_ZGVdN8v_acosf_avx2)
> + pushq %rbp
> + cfi_def_cfa_offset(16)
> + movq %rsp, %rbp
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> + andq $-64, %rsp
> + subq $384, %rsp
> +
> +/*
> + * 2*sqrt(X) ~ Sh - Sl (to 24+ bits)
> + * SQ ~ 2*sqrt(X)
> + */
> + vmovups __svml_sacos_data_internal(%rip), %ymm6
> + vmovups _OneHalf+__svml_sacos_data_internal(%rip), %ymm7
> + vmovups %ymm8, 288(%rsp)
> + vmovups %ymm15, 352(%rsp)
> + vmovups %ymm9, 96(%rsp)
> + vmovups _poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm15
> + vmovups %ymm10, 160(%rsp)
> + vmovups %ymm11, 192(%rsp)
> + vmovups %ymm14, 320(%rsp)
> + vmovups %ymm13, 256(%rsp)
> + vmovups %ymm12, 224(%rsp)
> + vmovaps %ymm0, %ymm5
> +
> +/* x = -|arg| */
> + vorps %ymm5, %ymm6, %ymm4
> +
> +/* Y = 0.5 + 0.5*(-x) */
> + vfmadd231ps %ymm4, %ymm7, %ymm7
> +
> +/* x^2 */
> + vmulps %ymm4, %ymm4, %ymm1
> + vcmpnge_uqps _MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm0
> +
> +/* SQ ~ 2*sqrt(Y) */
> + vaddps %ymm7, %ymm7, %ymm11
> + vminps %ymm7, %ymm1, %ymm2
> + vrsqrtps %ymm7, %ymm8
> + vfmadd213ps _poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm15
> + vmulps %ymm2, %ymm2, %ymm14
> + vcmpnlt_uqps %ymm7, %ymm2, %ymm1
> + vandps %ymm5, %ymm6, %ymm3
> + vcmplt_oqps _SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm6
> + vandnps %ymm8, %ymm6, %ymm9
> +
> +/* polynomial */
> + vmovups _poly_coeff+__svml_sacos_data_internal(%rip), %ymm6
> + vmulps %ymm9, %ymm9, %ymm10
> + vmulps %ymm11, %ymm9, %ymm13
> + vfmadd213ps _poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm6
> + vfmsub213ps _Two+__svml_sacos_data_internal(%rip), %ymm10, %ymm11
> + vfmadd213ps %ymm15, %ymm14, %ymm6
> + vmulps %ymm11, %ymm13, %ymm12
> + vfmadd213ps _poly_coeff+128+__svml_sacos_data_internal(%rip), %ymm2, %ymm6
> + vmovmskps %ymm0, %edx
> + vmovups _sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0
> + vmulps %ymm6, %ymm2, %ymm9
> +
> +/* X<X^2 iff X<0 */
> + vcmplt_oqps %ymm2, %ymm5, %ymm6
> + vfmadd213ps _sqrt_coeff+32+__svml_sacos_data_internal(%rip), %ymm11, %ymm0
> + vfnmadd213ps %ymm13, %ymm12, %ymm0
> + vblendvps %ymm1, %ymm0, %ymm4, %ymm4
> + vxorps %ymm3, %ymm4, %ymm3
> + vandps _PiH+__svml_sacos_data_internal(%rip), %ymm1, %ymm2
> + vfmadd213ps %ymm3, %ymm3, %ymm9
> + vandps %ymm6, %ymm2, %ymm2
> + vandnps _Pi2H+__svml_sacos_data_internal(%rip), %ymm1, %ymm7
> + vaddps %ymm7, %ymm2, %ymm8
> + vaddps %ymm9, %ymm8, %ymm0
> + testl %edx, %edx
> +
> +/* Go to special inputs processing branch */
> + jne L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> + vmovups 288(%rsp), %ymm8
> + cfi_restore(91)
> + vmovups 96(%rsp), %ymm9
> + cfi_restore(92)
> + vmovups 160(%rsp), %ymm10
> + cfi_restore(93)
> + vmovups 192(%rsp), %ymm11
> + cfi_restore(94)
> + vmovups 224(%rsp), %ymm12
> + cfi_restore(95)
> + vmovups 256(%rsp), %ymm13
> + cfi_restore(96)
> + vmovups 320(%rsp), %ymm14
> + cfi_restore(97)
> + vmovups 352(%rsp), %ymm15
> + cfi_restore(98)
> + movq %rbp, %rsp
> + popq %rbp
> + cfi_def_cfa(7, 8)
> + cfi_restore(6)
> + ret
> + cfi_def_cfa(6, 16)
> + cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> + vmovups %ymm5, 64(%rsp)
> + vmovups %ymm0, 128(%rsp)
> +
> +/* Go to exit */
> + je L(EXIT)
> + xorl %eax, %eax
> + vzeroupper
> + movq %rsi, 8(%rsp)
> + movq %rdi, (%rsp)
> + movq %r12, 32(%rsp)
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
> + movl %eax, %r12d
> + movq %r13, 24(%rsp)
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
> + movl %edx, %r13d
> + movq %r14, 16(%rsp)
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> + btl %r12d, %r13d
> +
> +/* Call scalar math function */
> + jc L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> + incl %r12d
> + cmpl $8, %r12d
> +
> +/* Check bits in range mask */
> + jl L(RANGEMASK_CHECK)
> + movq 8(%rsp), %rsi
> + cfi_restore(4)
> + movq (%rsp), %rdi
> + cfi_restore(5)
> + movq 32(%rsp), %r12
> + cfi_restore(12)
> + movq 24(%rsp), %r13
> + cfi_restore(13)
> + movq 16(%rsp), %r14
> + cfi_restore(14)
> + vmovups 128(%rsp), %ymm0
> +
> +/* Go to exit */
> + jmp L(EXIT)
> + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
> + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus) */
> + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> + movl %r12d, %r14d
> + movss 64(%rsp,%r14,4), %xmm0
> + call acosf@PLT
> + movss %xmm0, 128(%rsp,%r14,4)
> +
> +/* Process special inputs in loop */
> + jmp L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVdN8v_acosf_avx2)
> +
> + .section .rodata, "a"
> + .align 64
> +
> +#ifdef __svml_sacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(64)) VUINT32 SgnBit[8][1];
> + __declspec(align(64)) VUINT32 OneHalf[8][1];
> + __declspec(align(64)) VUINT32 sRsqrtMsk[8][1];
> + __declspec(align(64)) VUINT32 SmallNorm[8][1];
> + __declspec(align(64)) VUINT32 MOne[8][1];
> + __declspec(align(64)) VUINT32 HalfMask[8][1];
> + __declspec(align(64)) VUINT32 SQMask[8][1];
> + __declspec(align(64)) VUINT32 Two[8][1];
> + __declspec(align(64)) VUINT32 sqrt_coeff[2][8][1];
> + __declspec(align(64)) VUINT32 poly_coeff[5][8][1];
> + __declspec(align(64)) VUINT32 Pi2H[8][1];
> + __declspec(align(64)) VUINT32 Pi2L[8][1];
> + __declspec(align(64)) VUINT32 PiH[8][1];
> + __declspec(align(64)) VUINT32 PiL[8][1];
> + __declspec(align(64)) VUINT32 Zero[8][1];
> + __declspec(align(64)) VUINT32 SgnMask[8][1];
> + __declspec(align(64)) VUINT32 NanMask[8][1];
> + __declspec(align(64)) VUINT32 ep_coeff[3][8][1];
> +} __svml_sacos_data_internal_t;
> +#endif
> +__svml_sacos_data_internal:
> + /*== SgnBit ==*/
> + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
Might be worth generating some of these constants at runtime instead of loading them from .rodata.
E.g. 0x80000000 in every 32-bit lane can be built as:
vpcmpeqd %ymm0, %ymm0, %ymm0
vpslld $31, %ymm0, %ymmDST
The same all-ones register from `vpcmpeqd` could be reused to derive many of these masks.
> + /*== OneHalf ==*/
> + .align 64
> + .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> + /*== sRsqrtMsk ==*/
> + .align 64
> + .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
> + /*== SmallNorm ==*/
> + .align 64
> + .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
> + /*== MOne ==*/
> + .align 64
> + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
> + /*== HalfMask ==*/
> + .align 64
> + .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
> + /*== SQMask ==*/
> + .align 64
> + .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
> + /*== Two ==*/
> + .align 64
> + .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
> + /*== sqrt_coeff[2] ==*/
> + .align 64
> + .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
> + .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
> + /*== poly_coeff[5] ==*/
> + .align 64
> + .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
> + .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
> + .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
> + .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
> + .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
> + /*== Pi2H ==*/
> + .align 64
> + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
> + /*== Pi2L ==*/
> + .align 64
> + .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
> + /*== PiH ==*/
> + .align 64
> + .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
> + /*== PiL ==*/
> + .align 64
> + .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
> + /*== zero ==*/
> + .align 64
> + .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
> + /*== sgn mask ==*/
> + .align 64
> + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> + /*== nan mask ==*/
> + .align 64
> + .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
> + /*== ep_coeff[3] ==*/
> + .align 64
> + .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
> + .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
> + .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
> + .align 64
> + .type __svml_sacos_data_internal,@object
> + .size __svml_sacos_data_internal,1344
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos2_core.S b/sysdeps/x86_64/fpu/svml_d_acos2_core.S
> new file mode 100644
> index 0000000000..9656478b2d
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos2_core.S
> @@ -0,0 +1,29 @@
> +/* Function acos vectorized with SSE2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVbN2v_acos)
> +WRAPPER_IMPL_SSE2 acos
> +END (_ZGVbN2v_acos)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVbN2v_acos)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core.S b/sysdeps/x86_64/fpu/svml_d_acos4_core.S
> new file mode 100644
> index 0000000000..e99cb4ae78
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos4_core.S
> @@ -0,0 +1,29 @@
> +/* Function acos vectorized with AVX2, wrapper version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVdN4v_acos)
> +WRAPPER_IMPL_AVX _ZGVbN2v_acos
> +END (_ZGVdN4v_acos)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVdN4v_acos)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
> new file mode 100644
> index 0000000000..7cbcbc965c
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
> @@ -0,0 +1,25 @@
> +/* Function acos vectorized in AVX ISA as wrapper to SSE4 ISA version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVcN4v_acos)
> +WRAPPER_IMPL_AVX _ZGVbN2v_acos
> +END (_ZGVcN4v_acos)
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos8_core.S b/sysdeps/x86_64/fpu/svml_d_acos8_core.S
> new file mode 100644
> index 0000000000..e26b30d81a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos8_core.S
> @@ -0,0 +1,25 @@
> +/* Function acos vectorized with AVX-512, wrapper to AVX2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVeN8v_acos)
> +WRAPPER_IMPL_AVX512 _ZGVdN4v_acos
> +END (_ZGVeN8v_acos)
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf16_core.S b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S
> new file mode 100644
> index 0000000000..70e046d492
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S
> @@ -0,0 +1,25 @@
> +/* Function acosf vectorized with AVX-512. Wrapper to AVX2 version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVeN16v_acosf)
> +WRAPPER_IMPL_AVX512 _ZGVdN8v_acosf
> +END (_ZGVeN16v_acosf)
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf4_core.S b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S
> new file mode 100644
> index 0000000000..36354b32b5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S
> @@ -0,0 +1,29 @@
> +/* Function acosf vectorized with SSE2, wrapper version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVbN4v_acosf)
> +WRAPPER_IMPL_SSE2 acosf
> +END (_ZGVbN4v_acosf)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVbN4v_acosf)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S
> new file mode 100644
> index 0000000000..f08864a511
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S
> @@ -0,0 +1,29 @@
> +/* Function acosf vectorized with AVX2, wrapper version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVdN8v_acosf)
> +WRAPPER_IMPL_AVX _ZGVbN4v_acosf
> +END (_ZGVdN8v_acosf)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVdN8v_acosf)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
> new file mode 100644
> index 0000000000..f3ed4d8e78
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
> @@ -0,0 +1,25 @@
> +/* Function acosf vectorized in AVX ISA as wrapper to SSE4 ISA version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVcN8v_acosf)
> +WRAPPER_IMPL_AVX _ZGVbN4v_acosf
> +END (_ZGVcN8v_acosf)
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
> new file mode 100644
> index 0000000000..4f74b4260a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-acos.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
> new file mode 100644
> index 0000000000..4f74b4260a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-acos.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
> new file mode 100644
> index 0000000000..4f74b4260a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-acos.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c
> new file mode 100644
> index 0000000000..e38b8ce821
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c
> @@ -0,0 +1,3 @@
> +#define LIBMVEC_TYPE double
> +#define LIBMVEC_FUNC acos
> +#include "test-vector-abi-arg1.h"
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> index ed932fc98d..0abc7d2021 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
> VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
> VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
>
> #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> index 3a6e37044f..dda093b914 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> @@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin)
> VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
> VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
>
> #ifndef __ILP32__
> # define VEC_INT_TYPE __m256i
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> index 99db4e7616..f3230463bb 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin)
> VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
> VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
>
> #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> index 251d429ac0..cf9f52faf0 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin)
> VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
> VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
>
> #ifndef __ILP32__
> # define VEC_INT_TYPE __m512i
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
> new file mode 100644
> index 0000000000..1e6474dfa2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-acosf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
> new file mode 100644
> index 0000000000..1e6474dfa2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-acosf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
> new file mode 100644
> index 0000000000..1e6474dfa2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-acosf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
> new file mode 100644
> index 0000000000..fb47f974fd
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
> @@ -0,0 +1,3 @@
> +#define LIBMVEC_TYPE float
> +#define LIBMVEC_FUNC acosf
> +#include "test-vector-abi-arg1.h"
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> index c1d14cd79e..abbd3ed870 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf)
> VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
> VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
>
> #define VEC_INT_TYPE __m512i
>
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> index d23c372060..8a24027952 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
> VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
> VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
>
> #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> index 3152cffb0c..aff0442606 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> @@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf)
> VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
> VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
>
> /* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
> #undef VECTOR_WRAPPER_fFF
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> index a8492abfef..913584d111 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf)
> VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
> VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
>
> #define VEC_INT_TYPE __m128i
>
> --
> 2.31.1
>
* Noah Goldstein via Libc-alpha:
> Is there a control flow from L(SPECIAL_VALUES_BRANCH) that doesn't go to exit?
> If not I think a lot of the save/restores are pretty unnecessary.
It depends on the vector calling convention. It seems to be different
from the regular psABI calling convention, even for GPRs.
Thanks,
Florian
On Wed, Dec 15, 2021 at 1:57 PM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein via Libc-alpha:
>
> > Is there a control flow from L(SPECIAL_VALUES_BRANCH) that doesn't go to exit?
> > If not I think a lot of the save/restores are pretty unnecessary.
>
> It depends on the vector calling convention. It seems to be different
> from the regular psABI calling convention, even for GPRs.
Unless I'm missing something this isn't a function (nor globally
visible). It's internal
and just about everything that is saved/restored has already been clobbered by
acos/acosf.
The save/restore is for restoring state internal to acos/acosf. I think
that the amount
of state that is being preserved is unnecessarily large.
>
> Thanks,
> Florian
>
On Wed, Dec 15, 2021 at 12:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Dec 15, 2021 at 1:57 PM Florian Weimer <fweimer@redhat.com> wrote:
> >
> > * Noah Goldstein via Libc-alpha:
> >
> > > Is there a control flow from L(SPECIAL_VALUES_BRANCH) that doesn't go to exit?
> > > If not I think a lot of the save/restores are pretty unnecessary.
> >
> > It depends on the vector calling convention. It seems to be different
> > from the regular psABI calling convention, even for GPRs.
>
> Unless I'm missing something this isn't a function (nor globally
> visible). It's internal
> and just about everything that is saved/restored has already been clobbered by
> acos/acosf.
>
> The save/restore is for restoring state internal to acos/acosf. Think
> that the amount
> of state that is being preserved is unnecessarily large.
Thank you so much, we are working on this and will update in next version.
>
> >
> > Thanks,
> > Florian
> >
@@ -98,4 +98,15 @@
#define __DECL_SIMD_powf32x
#define __DECL_SIMD_powf64x
#define __DECL_SIMD_powf128x
+
+#define __DECL_SIMD_acos
+#define __DECL_SIMD_acosf
+#define __DECL_SIMD_acosl
+#define __DECL_SIMD_acosf16
+#define __DECL_SIMD_acosf32
+#define __DECL_SIMD_acosf64
+#define __DECL_SIMD_acosf128
+#define __DECL_SIMD_acosf32x
+#define __DECL_SIMD_acosf64x
+#define __DECL_SIMD_acosf128x
#endif
@@ -50,7 +50,7 @@
/* Trigonometric functions. */
/* Arc cosine of X. */
-__MATHCALL (acos,, (_Mdouble_ __x));
+__MATHCALL_VEC (acos,, (_Mdouble_ __x));
/* Arc sine of X. */
__MATHCALL (asin,, (_Mdouble_ __x));
/* Arc tangent of X. */
@@ -46,3 +46,11 @@ GLIBC_2.22 _ZGVeN8v_log F
GLIBC_2.22 _ZGVeN8v_sin F
GLIBC_2.22 _ZGVeN8vv_pow F
GLIBC_2.22 _ZGVeN8vvv_sincos F
+GLIBC_2.35 _ZGVbN2v_acos F
+GLIBC_2.35 _ZGVbN4v_acosf F
+GLIBC_2.35 _ZGVcN4v_acos F
+GLIBC_2.35 _ZGVcN8v_acosf F
+GLIBC_2.35 _ZGVdN4v_acos F
+GLIBC_2.35 _ZGVdN8v_acosf F
+GLIBC_2.35 _ZGVeN16v_acosf F
+GLIBC_2.35 _ZGVeN8v_acos F
@@ -58,6 +58,10 @@
# define __DECL_SIMD_pow __DECL_SIMD_x86_64
# undef __DECL_SIMD_powf
# define __DECL_SIMD_powf __DECL_SIMD_x86_64
+# undef __DECL_SIMD_acos
+# define __DECL_SIMD_acos __DECL_SIMD_x86_64
+# undef __DECL_SIMD_acosf
+# define __DECL_SIMD_acosf __DECL_SIMD_x86_64
# endif
#endif
@@ -28,6 +28,8 @@
!GCC$ builtin (expf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (pow) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
+!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
+!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@@ -41,3 +43,5 @@
!GCC$ builtin (expf) attributes simd (notinbranch) if('x32')
!GCC$ builtin (pow) attributes simd (notinbranch) if('x32')
!GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
+!GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
+!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
@@ -22,6 +22,7 @@ postclean-generated += libmvec.mk
# Define for both math and mathvec directories.
libmvec-funcs = \
+ acos \
cos \
exp \
log \
@@ -12,5 +12,9 @@ libmvec {
_ZGVbN4v_expf; _ZGVcN8v_expf; _ZGVdN8v_expf; _ZGVeN16v_expf;
_ZGVbN4vv_powf; _ZGVcN8vv_powf; _ZGVdN8vv_powf; _ZGVeN16vv_powf;
_ZGVbN4vvv_sincosf; _ZGVcN8vvv_sincosf; _ZGVdN8vvv_sincosf; _ZGVeN16vvv_sincosf;
+}
+ GLIBC_2.35 {
+ _ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
+ _ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
}
}
@@ -25,6 +25,26 @@ float: 1
float128: 1
ldouble: 2
+Function: "acos_vlen16":
+float: 1
+
+Function: "acos_vlen2":
+double: 1
+
+Function: "acos_vlen4":
+double: 1
+float: 2
+
+Function: "acos_vlen4_avx2":
+double: 1
+
+Function: "acos_vlen8":
+double: 1
+float: 2
+
+Function: "acos_vlen8_avx2":
+float: 1
+
Function: "acosh":
double: 2
float: 2
new file mode 100644
@@ -0,0 +1,39 @@
+/* Common definition for libmathvec ifunc selections optimized with
+ AVX512.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+#undef PASTER2
+#define PASTER2(x,y) x##_##y
+
+extern void REDIRECT_NAME (void);
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_wrapper) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (skx) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, MathVec_Prefer_No_AVX512)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ))
+ return OPTIMIZE (skx);
+
+ return OPTIMIZE (avx2_wrapper);
+}
new file mode 100644
@@ -0,0 +1,20 @@
+/* SSE2 version of vectorized acos, vector length is 2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVbN2v_acos _ZGVbN2v_acos_sse2
+#include "../svml_d_acos2_core.S"
new file mode 100644
@@ -0,0 +1,27 @@
+/* Multiple versions of vectorized acos, vector length is 2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVbN2v_acos
+#include "ifunc-mathvec-sse4_1.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVbN2v_acos, __GI__ZGVbN2v_acos, __redirect__ZGVbN2v_acos)
+ __attribute__ ((visibility ("hidden")));
+#endif
new file mode 100644
@@ -0,0 +1,399 @@
+/* Function acos vectorized with SSE4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ https://www.gnu.org/licenses/. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * SelMask = (|x| >= 0.5) ? 1 : 0;
+ * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
+
+/* Offsets for data table __svml_dacos_data_internal
+ */
+#define _SgnBit 0
+#define _OneHalf 64
+#define _SmallNorm 128
+#define _dRsqrtMsk 192
+#define _MOne 256
+#define _HalfMask 320
+#define _Two 384
+#define _sqrt_coeff 448
+#define _poly_coeff 512
+#define _PiL 704
+#define _PiH 768
+#define _Pi2L 832
+#define _Pi2H 896
+#define _Zero 960
+#define _SgnMask 1024
+#define _NanMask 1088
+#define _ep_coeff 1152
+#define _dInfs 1280
+#define _dOnes 1344
+#define _dZeros 1408
+
+#include <sysdep.h>
+
+ .text
+ .section .text.sse4,"ax",@progbits
+ENTRY(_ZGVbN2v_acos_sse4)
+ pushq %rbp
+ cfi_def_cfa_offset(16)
+ movq %rsp, %rbp
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+ andq $-64, %rsp
+ subq $256, %rsp
+ movups __svml_dacos_data_internal(%rip), %xmm3
+ movups _OneHalf+__svml_dacos_data_internal(%rip), %xmm6
+
+/* x = -|arg| */
+ movaps %xmm3, %xmm2
+ orps %xmm0, %xmm2
+
+/* Y = 0.5 + 0.5*(-x) */
+ movaps %xmm6, %xmm4
+ mulpd %xmm2, %xmm4
+ addpd %xmm4, %xmm6
+
+/* S ~ 2*sqrt(Y) */
+ cvtpd2ps %xmm6, %xmm7
+
+/* NaN processed in special branch (so wind test passed) */
+ movups _MOne+__svml_dacos_data_internal(%rip), %xmm1
+
+/* x^2 */
+ movaps %xmm2, %xmm5
+ cmpnlepd %xmm2, %xmm1
+ mulpd %xmm2, %xmm5
+ movmskpd %xmm1, %edx
+ movlhps %xmm7, %xmm7
+ andps %xmm0, %xmm3
+ movups %xmm8, 144(%rsp)
+ rsqrtps %xmm7, %xmm1
+ minpd %xmm6, %xmm5
+ cvtps2pd %xmm1, %xmm8
+ movaps %xmm6, %xmm1
+ movaps %xmm5, %xmm4
+ cmpltpd _SmallNorm+__svml_dacos_data_internal(%rip), %xmm1
+ cmpnltpd %xmm6, %xmm4
+ addpd %xmm6, %xmm6
+ andnps %xmm8, %xmm1
+ movups %xmm14, 176(%rsp)
+ movaps %xmm1, %xmm14
+ mulpd %xmm1, %xmm14
+ mulpd %xmm6, %xmm1
+ mulpd %xmm14, %xmm6
+ subpd _Two+__svml_dacos_data_internal(%rip), %xmm6
+ movups %xmm15, 160(%rsp)
+ movaps %xmm6, %xmm8
+ movups _sqrt_coeff+__svml_dacos_data_internal(%rip), %xmm15
+ mulpd %xmm6, %xmm15
+ mulpd %xmm1, %xmm8
+ addpd _sqrt_coeff+16+__svml_dacos_data_internal(%rip), %xmm15
+ mulpd %xmm6, %xmm15
+ addpd _sqrt_coeff+32+__svml_dacos_data_internal(%rip), %xmm15
+ mulpd %xmm15, %xmm6
+ addpd _sqrt_coeff+48+__svml_dacos_data_internal(%rip), %xmm6
+ mulpd %xmm6, %xmm8
+
+/* polynomial */
+ movups _poly_coeff+__svml_dacos_data_internal(%rip), %xmm6
+ movaps %xmm5, %xmm15
+ mulpd %xmm5, %xmm6
+ mulpd %xmm5, %xmm15
+ addpd _poly_coeff+16+__svml_dacos_data_internal(%rip), %xmm6
+ subpd %xmm8, %xmm1
+ mulpd %xmm15, %xmm6
+ movups _poly_coeff+32+__svml_dacos_data_internal(%rip), %xmm14
+ andps %xmm4, %xmm1
+ mulpd %xmm5, %xmm14
+ movups _poly_coeff+64+__svml_dacos_data_internal(%rip), %xmm7
+ mulpd %xmm5, %xmm7
+ addpd _poly_coeff+48+__svml_dacos_data_internal(%rip), %xmm14
+ addpd _poly_coeff+80+__svml_dacos_data_internal(%rip), %xmm7
+ addpd %xmm6, %xmm14
+ mulpd %xmm15, %xmm7
+ movups _poly_coeff+96+__svml_dacos_data_internal(%rip), %xmm8
+ movaps %xmm15, %xmm6
+ mulpd %xmm5, %xmm8
+ mulpd %xmm15, %xmm6
+ addpd _poly_coeff+112+__svml_dacos_data_internal(%rip), %xmm8
+ mulpd %xmm6, %xmm14
+ addpd %xmm7, %xmm8
+ movups _poly_coeff+128+__svml_dacos_data_internal(%rip), %xmm7
+ mulpd %xmm5, %xmm7
+ addpd %xmm14, %xmm8
+ addpd _poly_coeff+144+__svml_dacos_data_internal(%rip), %xmm7
+ mulpd %xmm15, %xmm8
+ movups _poly_coeff+160+__svml_dacos_data_internal(%rip), %xmm6
+
+/* X<X^2 iff X<0 */
+ movaps %xmm0, %xmm14
+ addpd %xmm8, %xmm7
+ cmpltpd %xmm5, %xmm14
+ mulpd %xmm5, %xmm6
+ mulpd %xmm7, %xmm15
+ addpd _poly_coeff+176+__svml_dacos_data_internal(%rip), %xmm6
+ addpd %xmm15, %xmm6
+ mulpd %xmm5, %xmm6
+ movaps %xmm4, %xmm7
+ movaps %xmm4, %xmm5
+ andnps %xmm2, %xmm7
+ orps %xmm1, %xmm7
+ pxor %xmm3, %xmm7
+ mulpd %xmm7, %xmm6
+ movups _PiH+__svml_dacos_data_internal(%rip), %xmm8
+ andps %xmm4, %xmm8
+ andnps _Pi2H+__svml_dacos_data_internal(%rip), %xmm5
+ andps %xmm14, %xmm8
+ addpd %xmm5, %xmm8
+ addpd %xmm6, %xmm7
+ addpd %xmm7, %xmm8
+ testl %edx, %edx
+
+/* Go to special inputs processing branch */
+ jne L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+ movups 176(%rsp), %xmm14
+ cfi_restore(31)
+ movaps %xmm8, %xmm0
+ movups 144(%rsp), %xmm8
+ cfi_restore(25)
+ movups 160(%rsp), %xmm15
+ cfi_restore(32)
+ movq %rbp, %rsp
+ popq %rbp
+ cfi_def_cfa(7, 8)
+ cfi_restore(6)
+ ret
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+ movups %xmm0, 128(%rsp)
+ movups %xmm8, 192(%rsp)
+ xorl %eax, %eax
+ movups %xmm9, 64(%rsp)
+ movups %xmm10, 48(%rsp)
+ movups %xmm11, 32(%rsp)
+ movups %xmm12, 16(%rsp)
+ movups %xmm13, (%rsp)
+ movq %rsi, 88(%rsp)
+ movq %rdi, 80(%rsp)
+ movq %r12, 112(%rsp)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -144; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x70, 0xff, 0xff, 0xff, 0x22
+ movl %eax, %r12d
+ movq %r13, 104(%rsp)
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -152; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x68, 0xff, 0xff, 0xff, 0x22
+ movl %edx, %r13d
+ movq %r14, 96(%rsp)
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -160; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x60, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+ btl %r12d, %r13d
+
+/* Call scalar math function */
+ jc L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+ incl %r12d
+ cmpl $2, %r12d
+
+/* Check bits in range mask */
+ jl L(RANGEMASK_CHECK)
+ movups 64(%rsp), %xmm9
+ cfi_restore(26)
+ movups 48(%rsp), %xmm10
+ cfi_restore(27)
+ movups 32(%rsp), %xmm11
+ cfi_restore(28)
+ movups 16(%rsp), %xmm12
+ cfi_restore(29)
+ movups (%rsp), %xmm13
+ cfi_restore(30)
+ movq 88(%rsp), %rsi
+ cfi_restore(4)
+ movq 80(%rsp), %rdi
+ cfi_restore(5)
+ movq 112(%rsp), %r12
+ cfi_restore(12)
+ movq 104(%rsp), %r13
+ cfi_restore(13)
+ movq 96(%rsp), %r14
+ cfi_restore(14)
+ movups 192(%rsp), %xmm8
+
+/* Go to exit */
+ jmp L(EXIT)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -144; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x70, 0xff, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -152; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x68, 0xff, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -160; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x60, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+ movl %r12d, %r14d
+ movsd 128(%rsp,%r14,8), %xmm0
+ call acos@PLT
+ movsd %xmm0, 192(%rsp,%r14,8)
+
+/* Process special inputs in loop */
+ jmp L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVbN2v_acos_sse4)
+
+ .section .rodata, "a"
+ .align 64
+
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(64)) VUINT32 SgnBit[2][2];
+ __declspec(align(64)) VUINT32 OneHalf[2][2];
+ __declspec(align(64)) VUINT32 SmallNorm[2][2];
+ __declspec(align(64)) VUINT32 dRsqrtMsk[2][2];
+ __declspec(align(64)) VUINT32 MOne[2][2];
+ __declspec(align(64)) VUINT32 HalfMask[2][2];
+ __declspec(align(64)) VUINT32 Two[2][2];
+ __declspec(align(64)) VUINT32 sqrt_coeff[4][2][2];
+ __declspec(align(64)) VUINT32 poly_coeff[12][2][2];
+ __declspec(align(64)) VUINT32 PiL[2][2];
+ __declspec(align(64)) VUINT32 PiH[2][2];
+ __declspec(align(64)) VUINT32 Pi2L[2][2];
+ __declspec(align(64)) VUINT32 Pi2H[2][2];
+ __declspec(align(64)) VUINT32 Zero[2][2];
+ __declspec(align(64)) VUINT32 SgnMask[2][2];
+ __declspec(align(64)) VUINT32 NanMask[2][2];
+ __declspec(align(64)) VUINT32 ep_coeff[6][2][2];
+ /* scalar part follow */
+ __declspec(align(64)) VUINT32 dInfs[2][2];
+ __declspec(align(64)) VUINT32 dOnes[2][2];
+ __declspec(align(64)) VUINT32 dZeros[2][2];
+} __svml_dacos_data_internal_t;
+#endif
+__svml_dacos_data_internal:
+ /*== SgnBit ==*/
+ .quad 0x8000000000000000, 0x8000000000000000
+ /*== OneHalf ==*/
+ .align 64
+ .quad 0x3fe0000000000000, 0x3fe0000000000000
+ /*== SmallNorm ==*/
+ .align 64
+ .quad 0x3000000000000000, 0x3000000000000000
+ /*== dRsqrtMsk ==*/
+ .align 64
+ .quad 0xffffff0000000000, 0xffffff0000000000
+ /*== MOne ==*/
+ .align 64
+ .quad 0xbff0000000000000, 0xbff0000000000000
+ /*== HalfMask ==*/
+ .align 64
+ .quad 0xfffffffffc000000, 0xfffffffffc000000
+ /*== Two ==*/
+ .align 64
+ .quad 0x4000000000000000, 0x4000000000000000
+ /*== sqrt_coeff[4] ==*/
+ .align 64
+ .quad 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+ .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+ .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+ .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+ /*== poly_coeff[12] ==*/
+ .align 64
+ .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+ .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+ .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+ .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+ .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+ .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+ .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+ .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+ .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+ .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+ .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+ .quad 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+ /*== PiL ==*/
+ .align 64
+ .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07
+ /*== PiH ==*/
+ .align 64
+ .quad 0x400921fb54442d18, 0x400921fb54442d18
+ /*== Pi2L ==*/
+ .align 64
+ .quad 0x3c91a62633145c07, 0x3c91a62633145c07
+ /*== Pi2H ==*/
+ .align 64
+ .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
+ /*== Zero ==*/
+ .align 64
+ .quad 0x0000000000000000, 0x0000000000000000
+ /*== sgn mask ==*/
+ .align 64
+ .quad 0x8000000000000000, 0x8000000000000000
+ /*== NaN mask ==*/
+ .align 64
+ .quad 0xfffc000000000000, 0xfffc000000000000
+ /*== ep_coeff[6] ==*/
+ .align 64
+ .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
+ .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
+ .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
+ .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
+ .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
+ .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
+ /* scalar part follows */
+ /*== dInfs = DP infinity, +/- ==*/
+ .align 64
+ .quad 0x7ff0000000000000, 0xfff0000000000000
+ /*== dOnes = DP one, +/- ==*/
+ .align 64
+ .quad 0x3ff0000000000000, 0xbff0000000000000
+ /*== dZeros = DP zero +/- ==*/
+ .align 64
+ .quad 0x0000000000000000, 0x8000000000000000
+ .align 64
+ .type __svml_dacos_data_internal,@object
+ .size __svml_dacos_data_internal,1472
new file mode 100644
@@ -0,0 +1,20 @@
+/* SSE version of vectorized acos, vector length is 4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVdN4v_acos _ZGVdN4v_acos_sse_wrapper
+#include "../svml_d_acos4_core.S"
new file mode 100644
@@ -0,0 +1,27 @@
+/* Multiple versions of vectorized acos, vector length is 4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVdN4v_acos
+#include "ifunc-mathvec-avx2.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVdN4v_acos, __GI__ZGVdN4v_acos, __redirect__ZGVdN4v_acos)
+ __attribute__ ((visibility ("hidden")));
+#endif
new file mode 100644
@@ -0,0 +1,368 @@
+/* Function acos vectorized with AVX2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ &lt;https://www.gnu.org/licenses/&gt;. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * SelMask = (|x| >= 0.5) ? 1 : 0;
+ * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
+
+/* Offsets for data table __svml_dacos_data_internal
+ */
+#define _SgnBit 0
+#define _OneHalf 64
+#define _SmallNorm 128
+#define _dRsqrtMsk 192
+#define _MOne 256
+#define _HalfMask 320
+#define _Two 384
+#define _sqrt_coeff 448
+#define _poly_coeff 576
+#define _PiL 960
+#define _PiH 1024
+#define _Pi2L 1088
+#define _Pi2H 1152
+#define _Zero 1216
+#define _SgnMask 1280
+#define _NanMask 1344
+#define _ep_coeff 1408
+#define _dInfs 1600
+#define _dOnes 1664
+#define _dZeros 1728
+
+#include <sysdep.h>
+
+ .text
+ .section .text.avx2,"ax",@progbits
+ENTRY(_ZGVdN4v_acos_avx2)
+ pushq %rbp
+ cfi_def_cfa_offset(16)
+ movq %rsp, %rbp
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+ andq $-64, %rsp
+ subq $384, %rsp
+ vmovupd __svml_dacos_data_internal(%rip), %ymm6
+ vmovupd _OneHalf+__svml_dacos_data_internal(%rip), %ymm7
+ vmovups %ymm8, 96(%rsp)
+ vmovups %ymm10, 192(%rsp)
+ vmovups %ymm9, 160(%rsp)
+ vmovups %ymm11, 224(%rsp)
+ vmovups %ymm12, 256(%rsp)
+ vmovups %ymm13, 288(%rsp)
+ vmovups %ymm15, 352(%rsp)
+ vmovups %ymm14, 320(%rsp)
+ vmovapd %ymm0, %ymm5
+
+/* x = -|arg| */
+ vorpd %ymm5, %ymm6, %ymm4
+
+/* Y = 0.5 + 0.5*(-x) */
+ vfmadd231pd %ymm4, %ymm7, %ymm7
+
+/* x^2 */
+ vmulpd %ymm4, %ymm4, %ymm1
+
+/* NaN processed in special branch (so wind test passed) */
+ vcmpnge_uqpd _MOne+__svml_dacos_data_internal(%rip), %ymm4, %ymm0
+
+/* S ~ 2*sqrt(Y) */
+ vcmplt_oqpd _SmallNorm+__svml_dacos_data_internal(%rip), %ymm7, %ymm9
+ vaddpd %ymm7, %ymm7, %ymm13
+ vminpd %ymm7, %ymm1, %ymm2
+ vandpd %ymm5, %ymm6, %ymm3
+ vcvtpd2ps %ymm7, %xmm6
+ vcmpnlt_uqpd %ymm7, %ymm2, %ymm1
+ vmovupd _poly_coeff+192+__svml_dacos_data_internal(%rip), %ymm7
+ vrsqrtps %xmm6, %xmm8
+ vmovupd _poly_coeff+64+__svml_dacos_data_internal(%rip), %ymm6
+ vfmadd213pd _poly_coeff+224+__svml_dacos_data_internal(%rip), %ymm2, %ymm7
+ vcvtps2pd %xmm8, %ymm10
+ vfmadd213pd _poly_coeff+96+__svml_dacos_data_internal(%rip), %ymm2, %ymm6
+ vandnpd %ymm10, %ymm9, %ymm11
+ vmulpd %ymm11, %ymm11, %ymm12
+ vmulpd %ymm13, %ymm11, %ymm15
+ vmovupd _poly_coeff+128+__svml_dacos_data_internal(%rip), %ymm9
+ vmulpd %ymm2, %ymm2, %ymm11
+ vmovupd _poly_coeff+256+__svml_dacos_data_internal(%rip), %ymm10
+ vfmsub213pd _Two+__svml_dacos_data_internal(%rip), %ymm12, %ymm13
+ vmovupd _poly_coeff+320+__svml_dacos_data_internal(%rip), %ymm12
+ vfmadd213pd _poly_coeff+160+__svml_dacos_data_internal(%rip), %ymm2, %ymm9
+ vmulpd %ymm11, %ymm11, %ymm8
+ vfmadd213pd _poly_coeff+288+__svml_dacos_data_internal(%rip), %ymm2, %ymm10
+ vmulpd %ymm13, %ymm15, %ymm14
+ vfmadd213pd _poly_coeff+352+__svml_dacos_data_internal(%rip), %ymm2, %ymm12
+ vfmadd213pd %ymm7, %ymm11, %ymm9
+ vmovmskpd %ymm0, %edx
+ vmovupd _sqrt_coeff+__svml_dacos_data_internal(%rip), %ymm0
+ vfmadd213pd _sqrt_coeff+32+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
+ vfmadd213pd _sqrt_coeff+64+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
+ vfmadd213pd _sqrt_coeff+96+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
+
+/* polynomial */
+ vmovupd _poly_coeff+__svml_dacos_data_internal(%rip), %ymm13
+ vfnmadd213pd %ymm15, %ymm14, %ymm0
+ vfmadd213pd _poly_coeff+32+__svml_dacos_data_internal(%rip), %ymm2, %ymm13
+ vblendvpd %ymm1, %ymm0, %ymm4, %ymm4
+ vfmadd213pd %ymm6, %ymm11, %ymm13
+
+/* X<X^2 iff X<0 */
+ vcmplt_oqpd %ymm2, %ymm5, %ymm6
+ vfmadd213pd %ymm9, %ymm8, %ymm13
+ vfmadd213pd %ymm10, %ymm11, %ymm13
+ vfmadd213pd %ymm12, %ymm11, %ymm13
+ vmulpd %ymm13, %ymm2, %ymm14
+ vxorpd %ymm3, %ymm4, %ymm3
+ vandpd _PiH+__svml_dacos_data_internal(%rip), %ymm1, %ymm2
+ vfmadd213pd %ymm3, %ymm3, %ymm14
+ vandpd %ymm6, %ymm2, %ymm2
+ vandnpd _Pi2H+__svml_dacos_data_internal(%rip), %ymm1, %ymm7
+ vaddpd %ymm7, %ymm2, %ymm8
+ vaddpd %ymm14, %ymm8, %ymm0
+ testl %edx, %edx
+
+/* Go to special inputs processing branch */
+ jne L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+ vmovups 96(%rsp), %ymm8
+ cfi_restore(91)
+ vmovups 160(%rsp), %ymm9
+ cfi_restore(92)
+ vmovups 192(%rsp), %ymm10
+ cfi_restore(93)
+ vmovups 224(%rsp), %ymm11
+ cfi_restore(94)
+ vmovups 256(%rsp), %ymm12
+ cfi_restore(95)
+ vmovups 288(%rsp), %ymm13
+ cfi_restore(96)
+ vmovups 320(%rsp), %ymm14
+ cfi_restore(97)
+ vmovups 352(%rsp), %ymm15
+ cfi_restore(98)
+ movq %rbp, %rsp
+ popq %rbp
+ cfi_def_cfa(7, 8)
+ cfi_restore(6)
+ ret
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+ vmovupd %ymm5, 64(%rsp)
+ vmovupd %ymm0, 128(%rsp)
+
+/* Go to exit */
+ je L(EXIT)
+ xorl %eax, %eax
+ vzeroupper
+ movq %rsi, 8(%rsp)
+ movq %rdi, (%rsp)
+ movq %r12, 32(%rsp)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
+ movl %eax, %r12d
+ movq %r13, 24(%rsp)
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
+ movl %edx, %r13d
+ movq %r14, 16(%rsp)
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+ btl %r12d, %r13d
+
+/* Call scalar math function */
+ jc L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+ incl %r12d
+ cmpl $4, %r12d
+
+/* Check bits in range mask */
+ jl L(RANGEMASK_CHECK)
+ movq 8(%rsp), %rsi
+ cfi_restore(4)
+ movq (%rsp), %rdi
+ cfi_restore(5)
+ movq 32(%rsp), %r12
+ cfi_restore(12)
+ movq 24(%rsp), %r13
+ cfi_restore(13)
+ movq 16(%rsp), %r14
+ cfi_restore(14)
+ vmovupd 128(%rsp), %ymm0
+
+/* Go to exit */
+ jmp L(EXIT)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+ movl %r12d, %r14d
+ movsd 64(%rsp,%r14,8), %xmm0
+ call acos@PLT
+ movsd %xmm0, 128(%rsp,%r14,8)
+
+/* Process special inputs in loop */
+ jmp L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVdN4v_acos_avx2)
+
+ .section .rodata, "a"
+ .align 64
+
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(64)) VUINT32 SgnBit[4][2];
+ __declspec(align(64)) VUINT32 OneHalf[4][2];
+ __declspec(align(64)) VUINT32 SmallNorm[4][2];
+ __declspec(align(64)) VUINT32 dRsqrtMsk[4][2];
+ __declspec(align(64)) VUINT32 MOne[4][2];
+ __declspec(align(64)) VUINT32 HalfMask[4][2];
+ __declspec(align(64)) VUINT32 Two[4][2];
+ __declspec(align(64)) VUINT32 sqrt_coeff[4][4][2];
+ __declspec(align(64)) VUINT32 poly_coeff[12][4][2];
+ __declspec(align(64)) VUINT32 PiL[4][2];
+ __declspec(align(64)) VUINT32 PiH[4][2];
+ __declspec(align(64)) VUINT32 Pi2L[4][2];
+ __declspec(align(64)) VUINT32 Pi2H[4][2];
+ __declspec(align(64)) VUINT32 Zero[4][2];
+ __declspec(align(64)) VUINT32 SgnMask[4][2];
+ __declspec(align(64)) VUINT32 NanMask[4][2];
+ __declspec(align(64)) VUINT32 ep_coeff[6][4][2];
+ /* scalar part follows */
+ __declspec(align(64)) VUINT32 dInfs[2][2];
+ __declspec(align(64)) VUINT32 dOnes[2][2];
+ __declspec(align(64)) VUINT32 dZeros[2][2];
+} __svml_dacos_data_internal_t;
+#endif
+__svml_dacos_data_internal:
+ /*== SgnBit ==*/
+ .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+ /*== OneHalf ==*/
+ .align 64
+ .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
+ /*== SmallNorm ==*/
+ .align 64
+ .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
+ /*== dRsqrtMsk ==*/
+ .align 64
+ .quad 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000
+ /*== MOne ==*/
+ .align 64
+ .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
+ /*== HalfMask ==*/
+ .align 64
+ .quad 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000
+ /*== Two ==*/
+ .align 64
+ .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
+ /*== sqrt_coeff[4] ==*/
+ .align 64
+ .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+ .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+ .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+ .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+ /*== poly_coeff[12] ==*/
+ .align 64
+ .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+ .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+ .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+ .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+ .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+ .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+ .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+ .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+ .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+ .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+ .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+ .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+ /*== PiL ==*/
+ .align 64
+ .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07
+ /*== PiH ==*/
+ .align 64
+ .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
+ /*== Pi2L ==*/
+ .align 64
+ .quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
+ /*== Pi2H ==*/
+ .align 64
+ .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
+ /*== Zero ==*/
+ .align 64
+ .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ /*== sgn mask ==*/
+ .align 64
+ .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+ /*== NaN mask ==*/
+ .align 64
+ .quad 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000
+ /*== ep_coeff[6] ==*/
+ .align 64
+ .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
+ .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
+ .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
+ .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
+ .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
+ .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
+ /* scalar part follows */
+ /*== dInfs = DP infinity, +/- ==*/
+ .align 64
+ .quad 0x7ff0000000000000, 0xfff0000000000000
+ /*== dOnes = DP one, +/- ==*/
+ .align 64
+ .quad 0x3ff0000000000000, 0xbff0000000000000
+ /*== dZeros = DP zero +/- ==*/
+ .align 64
+ .quad 0x0000000000000000, 0x8000000000000000
+ .align 64
+ .type __svml_dacos_data_internal,@object
+ .size __svml_dacos_data_internal,1792
new file mode 100644
@@ -0,0 +1,20 @@
+/* AVX2 version of vectorized acos, vector length is 8.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVeN8v_acos _ZGVeN8v_acos_avx2_wrapper
+#include "../svml_d_acos8_core.S"
new file mode 100644
@@ -0,0 +1,27 @@
+/* Multiple versions of vectorized acos, vector length is 8.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVeN8v_acos
+#include "ifunc-mathvec-avx512-skx.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVeN8v_acos, __GI__ZGVeN8v_acos, __redirect__ZGVeN8v_acos)
+ __attribute__ ((visibility ("hidden")));
+#endif
new file mode 100644
@@ -0,0 +1,386 @@
+/* Function acos vectorized with AVX-512.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ &lt;https://www.gnu.org/licenses/&gt;. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * SelMask = (|x| >= 0.5) ? 1 : 0;
+ * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
+
+/* Offsets for data table __svml_dacos_data_internal
+ */
+#define _SgnBit 0
+#define _OneHalf 64
+#define _SmallNorm 128
+#define _dRsqrtMsk 192
+#define _MOne 256
+#define _HalfMask 320
+#define _Two 384
+#define _sqrt_coeff_1 448
+#define _sqrt_coeff_2 512
+#define _sqrt_coeff_3 576
+#define _sqrt_coeff_4 640
+#define _poly_coeff_1 704
+#define _poly_coeff_2 768
+#define _poly_coeff_3 832
+#define _poly_coeff_4 896
+#define _poly_coeff_5 960
+#define _poly_coeff_6 1024
+#define _poly_coeff_7 1088
+#define _poly_coeff_8 1152
+#define _poly_coeff_9 1216
+#define _poly_coeff_10 1280
+#define _poly_coeff_11 1344
+#define _poly_coeff_12 1408
+#define _PiL 1472
+#define _PiH 1536
+#define _Pi2L 1600
+#define _Pi2H 1664
+#define _Zero 1728
+#define _SgnMask 1792
+#define _NanMask 1856
+#define _ep_coeff_1 1920
+#define _ep_coeff_2 1984
+#define _ep_coeff_3 2048
+#define _ep_coeff_4 2112
+#define _ep_coeff_5 2176
+#define _ep_coeff_6 2240
+#define _dInfs 2304
+#define _dOnes 2368
+#define _dZeros 2432
+
+#include <sysdep.h>
+
+ .text
+ .section .text.evex512,"ax",@progbits
+ENTRY(_ZGVeN8v_acos_skx)
+ pushq %rbp
+ cfi_def_cfa_offset(16)
+ movq %rsp, %rbp
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+ andq $-64, %rsp
+ subq $256, %rsp
+ vmovups __svml_dacos_data_internal(%rip), %zmm7
+ vmovups _OneHalf+__svml_dacos_data_internal(%rip), %zmm8
+
+/* S ~ 2*sqrt(Y) */
+ vmovups _SmallNorm+__svml_dacos_data_internal(%rip), %zmm11
+ vmovups _Two+__svml_dacos_data_internal(%rip), %zmm14
+ vmovups _sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15
+ vmovups _sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2
+ vmovups _sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1
+ vmovups _MOne+__svml_dacos_data_internal(%rip), %zmm10
+ vmovaps %zmm0, %zmm6
+
+/* x = -|arg| */
+ vorpd %zmm6, %zmm7, %zmm5
+ vandpd %zmm6, %zmm7, %zmm4
+
+/* Y = 0.5 + 0.5*(-x) */
+ vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8
+
+/* x^2 */
+ vmulpd {rn-sae}, %zmm5, %zmm5, %zmm9
+ vrsqrt14pd %zmm8, %zmm12
+ vcmppd $17, {sae}, %zmm11, %zmm8, %k2
+ vcmppd $17, {sae}, %zmm10, %zmm5, %k0
+ vmovups _poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10
+ vmovups _poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11
+ vminpd {sae}, %zmm8, %zmm9, %zmm3
+ vmovups _poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9
+ vxorpd %zmm12, %zmm12, %zmm12{%k2}
+ vaddpd {rn-sae}, %zmm8, %zmm8, %zmm0
+ vcmppd $21, {sae}, %zmm8, %zmm3, %k1
+
+/* X<X^2 iff X<0 */
+ vcmppd $17, {sae}, %zmm3, %zmm6, %k3
+ vmulpd {rn-sae}, %zmm12, %zmm12, %zmm13
+ vmulpd {rn-sae}, %zmm12, %zmm0, %zmm7
+ vmovups _poly_coeff_4+__svml_dacos_data_internal(%rip), %zmm12
+
+/* polynomial */
+ vmovups _poly_coeff_1+__svml_dacos_data_internal(%rip), %zmm8
+ vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0
+ vmovups _sqrt_coeff_4+__svml_dacos_data_internal(%rip), %zmm13
+ vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12
+ vmovups _poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm9
+ vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2
+ vmovups _poly_coeff_9+__svml_dacos_data_internal(%rip), %zmm15
+ vmulpd {rn-sae}, %zmm0, %zmm7, %zmm14
+ vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2
+ vmovups _poly_coeff_2+__svml_dacos_data_internal(%rip), %zmm1
+ kmovw %k1, %eax
+ kmovw %k3, %ecx
+ kmovw %k0, %edx
+ vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2
+ vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1
+ vmovups _poly_coeff_10+__svml_dacos_data_internal(%rip), %zmm8
+ vmulpd {rn-sae}, %zmm3, %zmm3, %zmm0
+ vfnmadd213pd {rn-sae}, %zmm7, %zmm14, %zmm2
+ vmovups _poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm7
+ vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8
+ vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1
+ vblendmpd %zmm2, %zmm5, %zmm2{%k1}
+ vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7
+ vmovups _poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm10
+ vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
+ andl %eax, %ecx
+ vmovups _poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
+ kmovw %ecx, %k2
+ vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7
+ vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11
+ vmulpd {rn-sae}, %zmm0, %zmm0, %zmm10
+ vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1
+ vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1
+ vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
+ vmovups _Pi2H+__svml_dacos_data_internal(%rip), %zmm0
+ vmulpd {rn-sae}, %zmm3, %zmm1, %zmm1
+ vxorpd %zmm4, %zmm2, %zmm3
+ vxorpd %zmm0, %zmm0, %zmm0{%k1}
+ vfmadd213pd {rn-sae}, %zmm3, %zmm3, %zmm1
+ vorpd _PiH+__svml_dacos_data_internal(%rip), %zmm0, %zmm0{%k2}
+ vaddpd {rn-sae}, %zmm1, %zmm0, %zmm0
+ testl %edx, %edx
+
+/* Go to special inputs processing branch */
+ jne L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+ movq %rbp, %rsp
+ popq %rbp
+ cfi_def_cfa(7, 8)
+ cfi_restore(6)
+ ret
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+ vmovups %zmm6, 128(%rsp)
+ vmovups %zmm0, 192(%rsp)
+
+/* Go to exit */
+ je L(EXIT)
+ xorl %eax, %eax
+ vzeroupper
+ kmovw %k4, 24(%rsp)
+ kmovw %k5, 16(%rsp)
+ kmovw %k6, 8(%rsp)
+ kmovw %k7, (%rsp)
+ movq %rsi, 40(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r12, 64(%rsp)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+ movl %eax, %r12d
+ movq %r13, 56(%rsp)
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
+ movl %edx, %r13d
+ movq %r14, 48(%rsp)
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+ btl %r12d, %r13d
+
+/* Call scalar math function */
+ jc L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+ incl %r12d
+ cmpl $8, %r12d
+
+/* Check bits in range mask */
+ jl L(RANGEMASK_CHECK)
+ kmovw 24(%rsp), %k4
+ cfi_restore(122)
+ kmovw 16(%rsp), %k5
+ cfi_restore(123)
+ kmovw 8(%rsp), %k6
+ cfi_restore(124)
+ kmovw (%rsp), %k7
+ cfi_restore(125)
+ vmovups 192(%rsp), %zmm0
+ movq 40(%rsp), %rsi
+ cfi_restore(4)
+ movq 32(%rsp), %rdi
+ cfi_restore(5)
+ movq 64(%rsp), %r12
+ cfi_restore(12)
+ movq 56(%rsp), %r13
+ cfi_restore(13)
+ movq 48(%rsp), %r14
+ cfi_restore(14)
+
+/* Go to exit */
+ jmp L(EXIT)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+ movl %r12d, %r14d
+ movsd 128(%rsp,%r14,8), %xmm0
+ call acos@PLT
+ movsd %xmm0, 192(%rsp,%r14,8)
+
+/* Process special inputs in loop */
+ jmp L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVeN8v_acos_skx)
+
+ .section .rodata, "a"
+ .align 64
+
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(64)) VUINT32 SgnBit[8][2];
+ __declspec(align(64)) VUINT32 OneHalf[8][2];
+ __declspec(align(64)) VUINT32 SmallNorm[8][2];
+ __declspec(align(64)) VUINT32 dRsqrtMsk[8][2];
+ __declspec(align(64)) VUINT32 MOne[8][2];
+ __declspec(align(64)) VUINT32 HalfMask[8][2];
+ __declspec(align(64)) VUINT32 Two[8][2];
+ __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2];
+ __declspec(align(64)) VUINT32 poly_coeff[12][8][2];
+ __declspec(align(64)) VUINT32 PiL[8][2];
+ __declspec(align(64)) VUINT32 PiH[8][2];
+ __declspec(align(64)) VUINT32 Pi2L[8][2];
+ __declspec(align(64)) VUINT32 Pi2H[8][2];
+ __declspec(align(64)) VUINT32 Zero[8][2];
+ __declspec(align(64)) VUINT32 SgnMask[8][2];
+ __declspec(align(64)) VUINT32 NanMask[8][2];
+ __declspec(align(64)) VUINT32 ep_coeff[6][8][2];
+ /* scalar part follows */
+ __declspec(align(64)) VUINT32 dInfs[2][2];
+ __declspec(align(64)) VUINT32 dOnes[2][2];
+ __declspec(align(64)) VUINT32 dZeros[2][2];
+} __svml_dacos_data_internal_t;
+#endif
+__svml_dacos_data_internal:
+ /*== SgnBit ==*/
+ .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+ /*== OneHalf ==*/
+ .align 64
+ .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
+ /*== SmallNorm ==*/
+ .align 64
+ .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
+ /*== dRsqrtMsk ==*/
+ .align 64
+ .quad 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000
+ /*== MOne ==*/
+ .align 64
+ .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
+ /*== HalfMask ==*/
+ .align 64
+ .quad 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000
+ /*== Two ==*/
+ .align 64
+ .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
+ /*== sqrt_coeff[4] ==*/
+ .align 64
+ .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+ .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+ .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+ .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+ /*== poly_coeff[12] ==*/
+ .align 64
+ .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+ .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+ .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+ .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+ .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+ .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+ .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+ .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+ .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+ .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+ .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+ .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+ /*== PiL ==*/
+ .align 64
+ .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07
+ /*== PiH ==*/
+ .align 64
+ .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
+ /*== Pi2L ==*/
+ .align 64
+ .quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
+ /*== Pi2H ==*/
+ .align 64
+ .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
+ /*== Zero ==*/
+ .align 64
+ .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ /*== sgn mask ==*/
+ .align 64
+ .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+ /*== NaN mask ==*/
+ .align 64
+ .quad 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000
+ /*== ep_coeff[6] ==*/
+ .align 64
+ .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
+ .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
+ .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
+ .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
+ .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
+ .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
+ /* scalar part follows */
+ /*== dInfs = DP infinity, +/- ==*/
+ .align 64
+ .quad 0x7ff0000000000000, 0xfff0000000000000
+ /*== dOnes = DP one, +/- ==*/
+ .align 64
+ .quad 0x3ff0000000000000, 0xbff0000000000000
+ /*== dZeros = DP zero +/- ==*/
+ .align 64
+ .quad 0x0000000000000000, 0x8000000000000000
+ .align 64
+ .type __svml_dacos_data_internal,@object
+ .size __svml_dacos_data_internal,2496
new file mode 100644
@@ -0,0 +1,20 @@
+/* AVX2 version of vectorized acosf.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVeN16v_acosf _ZGVeN16v_acosf_avx2_wrapper
+#include "../svml_s_acosf16_core.S"
new file mode 100644
@@ -0,0 +1,28 @@
+/* Multiple versions of vectorized acosf, vector length is 16.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVeN16v_acosf
+#include "ifunc-mathvec-avx512-skx.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); /* IFUNC: resolve at load time to the SKX (AVX-512) variant or the AVX2 wrapper */
+
+#ifdef SHARED /* hidden alias so intra-libmvec callers bypass the PLT in the shared build */
+__hidden_ver1 (_ZGVeN16v_acosf, __GI__ZGVeN16v_acosf,
+ __redirect__ZGVeN16v_acosf)
+ __attribute__ ((visibility ("hidden")));
+#endif
new file mode 100644
@@ -0,0 +1,332 @@
+/* Function acosf vectorized with AVX-512.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ &lt;https://www.gnu.org/licenses/&gt;. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * SelMask = (|x| >= 0.5) ? 1 : 0;
+ * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ *
+ */
+
+/* Offsets for data table __svml_sacos_data_internal
+ */
+#define _SgnBit 0
+#define _OneHalf 64
+#define _sRsqrtMsk 128
+#define _SmallNorm 192
+#define _MOne 256
+#define _HalfMask 320
+#define _SQMask 384
+#define _Two 448
+#define _sqrt_coeff_1 512
+#define _sqrt_coeff_2 576
+#define _poly_coeff_1 640
+#define _poly_coeff_2 704
+#define _poly_coeff_3 768
+#define _poly_coeff_4 832
+#define _poly_coeff_5 896
+#define _Pi2H 960
+#define _Pi2L 1024
+#define _PiH 1088
+#define _PiL 1152
+#define _Zero 1216
+#define _SgnMask 1280
+#define _NanMask 1344
+#define _ep_coeff_1 1408
+#define _ep_coeff_2 1472
+#define _ep_coeff_3 1536
+
+#include <sysdep.h>
+
+ .text
+ .section .text.evex512,"ax",@progbits
+ENTRY(_ZGVeN16v_acosf_skx)
+ pushq %rbp
+ cfi_def_cfa_offset(16)
+ movq %rsp, %rbp
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+ andq $-64, %rsp
+ subq $256, %rsp
+ vmovups __svml_sacos_data_internal(%rip), %zmm5
+ vmovups _OneHalf+__svml_sacos_data_internal(%rip), %zmm6
+
+/* SQ ~ 2*sqrt(Y) */
+ vmovups _SmallNorm+__svml_sacos_data_internal(%rip), %zmm9
+ vmovups _MOne+__svml_sacos_data_internal(%rip), %zmm8
+ vmovups _Two+__svml_sacos_data_internal(%rip), %zmm12
+ vmovups _sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13
+ vmovaps %zmm0, %zmm4
+
+/* x = -|arg| (zmm5 = sign-bit mask) */
+ vorps %zmm4, %zmm5, %zmm3
+ vandps %zmm4, %zmm5, %zmm2
+ vmovups _sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0
+
+/* Y = 0.5 + 0.5*(-x) */
+ vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6
+
+/* x^2 */
+ vmulps {rn-sae}, %zmm3, %zmm3, %zmm7
+ vrsqrt14ps %zmm6, %zmm10
+ vcmpps $17, {sae}, %zmm9, %zmm6, %k2 /* k2: Y < SmallNorm (LT_OQ) */
+ vcmpps $22, {sae}, %zmm3, %zmm8, %k0 /* k0: -1 > -|x| or NaN (NLE_UQ) => |x| > 1 or NaN, special-input mask */
+ vmovups _poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9
+ vminps {sae}, %zmm6, %zmm7, %zmm1
+ vmovups _poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7
+ vxorps %zmm10, %zmm10, %zmm10{%k2}
+ vaddps {rn-sae}, %zmm6, %zmm6, %zmm14
+ vmulps {rn-sae}, %zmm1, %zmm1, %zmm8
+ vmulps {rn-sae}, %zmm10, %zmm10, %zmm11
+ vmulps {rn-sae}, %zmm10, %zmm14, %zmm5
+ vcmpps $21, {sae}, %zmm6, %zmm1, %k1 /* k1: min(x^2,Y) >= Y (NLT_UQ) => SelMask, |x| >= 0.5 */
+
+/* X<X^2 iff X<0 */
+ vcmpps $17, {sae}, %zmm1, %zmm4, %k3
+
+/* polynomial */
+ vmovups _poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6
+ vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14
+ vmovups _poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11
+ vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9
+ vmovups _poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10
+ vmovups _Pi2H+__svml_sacos_data_internal(%rip), %zmm12
+ vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0
+ vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11
+ vmulps {rn-sae}, %zmm14, %zmm5, %zmm15
+ vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11
+ vxorps %zmm12, %zmm12, %zmm12{%k1}
+ vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0
+ vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11
+ kmovw %k1, %eax
+ kmovw %k3, %ecx
+ kmovw %k0, %edx
+ vmulps {rn-sae}, %zmm1, %zmm11, %zmm13
+ vblendmps %zmm0, %zmm3, %zmm0{%k1}
+ vxorps %zmm2, %zmm0, %zmm1
+ andl %eax, %ecx
+ kmovw %ecx, %k2
+ vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13
+ vorps _PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k2}
+ vaddps {rn-sae}, %zmm13, %zmm12, %zmm0
+ testl %edx, %edx
+
+/* Go to special inputs processing branch */
+ jne L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+ movq %rbp, %rsp
+ popq %rbp
+ cfi_def_cfa(7, 8)
+ cfi_restore(6)
+ ret
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs (|x| > 1 or NaN lanes, flagged in edx)
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+ vmovups %zmm4, 128(%rsp)
+ vmovups %zmm0, 192(%rsp)
+
+/* Go to exit */
+ je L(EXIT) /* NOTE(review): appears unreachable - reached only via jne above and vmovups does not modify ZF */
+ xorl %eax, %eax
+ vzeroupper
+ kmovw %k4, 24(%rsp)
+ kmovw %k5, 16(%rsp)
+ kmovw %k6, 8(%rsp)
+ kmovw %k7, (%rsp)
+ movq %rsi, 40(%rsp)
+ movq %rdi, 32(%rsp)
+ movq %r12, 64(%rsp)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+ movl %eax, %r12d
+ movq %r13, 56(%rsp)
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
+ movl %edx, %r13d
+ movq %r14, 48(%rsp)
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check: r12d = lane index, r13d = special-lane bitmask
+ */
+
+L(RANGEMASK_CHECK):
+ btl %r12d, %r13d
+
+/* Call scalar math function */
+ jc L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+ incl %r12d
+ cmpl $16, %r12d
+
+/* Check bits in range mask */
+ jl L(RANGEMASK_CHECK)
+ kmovw 24(%rsp), %k4
+ cfi_restore(122)
+ kmovw 16(%rsp), %k5
+ cfi_restore(123)
+ kmovw 8(%rsp), %k6
+ cfi_restore(124)
+ kmovw (%rsp), %k7
+ cfi_restore(125)
+ vmovups 192(%rsp), %zmm0
+ movq 40(%rsp), %rsi
+ cfi_restore(4)
+ movq 32(%rsp), %rdi
+ cfi_restore(5)
+ movq 64(%rsp), %r12
+ cfi_restore(12)
+ movq 56(%rsp), %r13
+ cfi_restore(13)
+ movq 48(%rsp), %r14
+ cfi_restore(14)
+
+/* Go to exit */
+ jmp L(EXIT)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+ movl %r12d, %r14d
+ movss 128(%rsp,%r14,4), %xmm0
+ call acosf@PLT
+ movss %xmm0, 192(%rsp,%r14,4)
+
+/* Process special inputs in loop */
+ jmp L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVeN16v_acosf_skx)
+
+ .section .rodata, "a"
+ .align 64
+
+#ifdef __svml_sacos_data_internal_typedef /* dead code: macro is never defined; the typedef only documents the table layout */
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(64)) VUINT32 SgnBit[16][1];
+ __declspec(align(64)) VUINT32 OneHalf[16][1];
+ __declspec(align(64)) VUINT32 sRsqrtMsk[16][1];
+ __declspec(align(64)) VUINT32 SmallNorm[16][1];
+ __declspec(align(64)) VUINT32 MOne[16][1];
+ __declspec(align(64)) VUINT32 HalfMask[16][1];
+ __declspec(align(64)) VUINT32 SQMask[16][1];
+ __declspec(align(64)) VUINT32 Two[16][1];
+ __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1];
+ __declspec(align(64)) VUINT32 poly_coeff[5][16][1];
+ __declspec(align(64)) VUINT32 Pi2H[16][1];
+ __declspec(align(64)) VUINT32 Pi2L[16][1];
+ __declspec(align(64)) VUINT32 PiH[16][1];
+ __declspec(align(64)) VUINT32 PiL[16][1];
+ __declspec(align(64)) VUINT32 Zero[16][1];
+ __declspec(align(64)) VUINT32 SgnMask[16][1];
+ __declspec(align(64)) VUINT32 NanMask[16][1];
+ __declspec(align(64)) VUINT32 ep_coeff[3][16][1];
+} __svml_sacos_data_internal_t;
+#endif
+__svml_sacos_data_internal:
+ /*== SgnBit: sign-bit mask ==*/
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+ /*== OneHalf = 0.5f ==*/
+ .align 64
+ .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+ /*== sRsqrtMsk ==*/
+ .align 64
+ .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
+ /*== SmallNorm = 0x1p-32f, rsqrt-input underflow threshold ==*/
+ .align 64
+ .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+ /*== MOne = -1.0f, domain-boundary constant ==*/
+ .align 64
+ .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+ /*== HalfMask ==*/
+ .align 64
+ .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
+ /*== SQMask ==*/
+ .align 64
+ .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
+ /*== Two = 2.0f ==*/
+ .align 64
+ .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
+ /*== sqrt_coeff[2], stored highest-order first ==*/
+ .align 64
+ .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
+ .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
+ /*== poly_coeff[5], stored highest-order first ==*/
+ .align 64
+ .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
+ .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
+ .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
+ .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
+ .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
+ /*== Pi2H = pi/2, high part ==*/
+ .align 64
+ .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+ /*== Pi2L = pi/2, low correction part ==*/
+ .align 64
+ .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
+ /*== PiH = pi, high part ==*/
+ .align 64
+ .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+ /*== PiL = pi, low correction part ==*/
+ .align 64
+ .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
+ /*== zero ==*/
+ .align 64
+ .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
+ /*== sgn mask ==*/
+ .align 64
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+ /*== nan mask ==*/
+ .align 64
+ .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
+ /*== ep_coeff[3], stored highest-order first ==*/
+ .align 64
+ .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
+ .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
+ .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
+ .align 64
+ .type __svml_sacos_data_internal,@object
+ .size __svml_sacos_data_internal,1600
new file mode 100644
@@ -0,0 +1,20 @@
+/* SSE2 version of vectorized acosf, vector length is 4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVbN4v_acosf _ZGVbN4v_acosf_sse2
+#include "../svml_s_acosf4_core.S"
new file mode 100644
@@ -0,0 +1,28 @@
+/* Multiple versions of vectorized acosf, vector length is 4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVbN4v_acosf
+#include "ifunc-mathvec-sse4_1.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); /* IFUNC: resolve at load time to the SSE4.1 variant or the SSE2 fallback */
+
+#ifdef SHARED /* hidden alias so intra-libmvec callers bypass the PLT in the shared build */
+__hidden_ver1 (_ZGVbN4v_acosf, __GI__ZGVbN4v_acosf,
+ __redirect__ZGVbN4v_acosf)
+ __attribute__ ((visibility ("hidden")));
+#endif
new file mode 100644
@@ -0,0 +1,351 @@
+/* Function acosf vectorized with SSE4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ &lt;https://www.gnu.org/licenses/&gt;. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * SelMask = (|x| >= 0.5) ? 1 : 0;
+ * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ *
+ */
+
+/* Offsets for data table __svml_sacos_data_internal
+ */
+#define _SgnBit 0
+#define _OneHalf 64
+#define _sRsqrtMsk 128
+#define _SmallNorm 192
+#define _MOne 256
+#define _HalfMask 320
+#define _SQMask 384
+#define _Two 448
+#define _sqrt_coeff 512
+#define _poly_coeff 576
+#define _Pi2H 704
+#define _Pi2L 768
+#define _PiH 832
+#define _PiL 896
+#define _Zero 960
+#define _SgnMask 1024
+#define _NanMask 1088
+#define _ep_coeff 1152
+
+#include <sysdep.h>
+
+ .text
+ .section .text.sse4,"ax",@progbits
+ENTRY(_ZGVbN4v_acosf_sse4)
+ pushq %rbp
+ cfi_def_cfa_offset(16)
+ movq %rsp, %rbp
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+ andq $-64, %rsp
+ subq $320, %rsp
+ movaps %xmm0, %xmm6
+
+/*
+ * 2*sqrt(X) ~ Sh - Sl (to 24+ bits)
+ * SQ ~ 2*sqrt(X)
+ */
+ movups __svml_sacos_data_internal(%rip), %xmm5
+ movups _OneHalf+__svml_sacos_data_internal(%rip), %xmm0
+
+/* x = -|arg| (xmm5 = sign-bit mask) */
+ movaps %xmm5, %xmm7
+ orps %xmm6, %xmm7
+
+/* Y = 0.5 + 0.5*(-x) */
+ movaps %xmm0, %xmm2
+ mulps %xmm7, %xmm2
+
+/* x^2 */
+ movaps %xmm7, %xmm3
+ mulps %xmm7, %xmm3
+ addps %xmm2, %xmm0
+ movups _MOne+__svml_sacos_data_internal(%rip), %xmm4
+ andps %xmm6, %xmm5
+ cmpnleps %xmm7, %xmm4 /* special-input lanes: -1 > -|x| or NaN, i.e. |x| > 1 or NaN */
+ minps %xmm0, %xmm3
+
+/* SQ ~ 2*sqrt(Y) */
+ rsqrtps %xmm0, %xmm1
+ movmskps %xmm4, %edx
+ movaps %xmm0, %xmm4
+ movaps %xmm3, %xmm2
+ movups %xmm8, 160(%rsp)
+ cmpltps _SmallNorm+__svml_sacos_data_internal(%rip), %xmm4 /* zero out rsqrt result where Y underflows */
+ cmpnltps %xmm0, %xmm2 /* xmm2 = SelMask: min(x^2,Y) >= Y, i.e. |x| >= 0.5 */
+ addps %xmm0, %xmm0
+ andnps %xmm1, %xmm4
+ movaps %xmm4, %xmm8
+ mulps %xmm4, %xmm8
+ mulps %xmm0, %xmm4
+ mulps %xmm8, %xmm0
+ movups _sqrt_coeff+__svml_sacos_data_internal(%rip), %xmm1
+
+/* polynomial */
+ movups _poly_coeff+__svml_sacos_data_internal(%rip), %xmm8
+ mulps %xmm3, %xmm8
+ subps _Two+__svml_sacos_data_internal(%rip), %xmm0
+ mulps %xmm0, %xmm1
+ addps _poly_coeff+16+__svml_sacos_data_internal(%rip), %xmm8
+ mulps %xmm4, %xmm0
+ addps _sqrt_coeff+16+__svml_sacos_data_internal(%rip), %xmm1
+ mulps %xmm0, %xmm1
+ movaps %xmm3, %xmm0
+ mulps %xmm3, %xmm0
+ subps %xmm1, %xmm4
+ mulps %xmm0, %xmm8
+ movups _poly_coeff+32+__svml_sacos_data_internal(%rip), %xmm1
+ andps %xmm2, %xmm4
+ mulps %xmm3, %xmm1
+ movups _PiH+__svml_sacos_data_internal(%rip), %xmm0
+ andps %xmm2, %xmm0
+ addps _poly_coeff+48+__svml_sacos_data_internal(%rip), %xmm1
+ addps %xmm8, %xmm1
+
+/* X<X^2 iff X<0 */
+ movaps %xmm6, %xmm8
+ cmpltps %xmm3, %xmm8
+ mulps %xmm3, %xmm1
+ andps %xmm8, %xmm0
+ movaps %xmm2, %xmm8
+ andnps %xmm7, %xmm8
+ addps _poly_coeff+64+__svml_sacos_data_internal(%rip), %xmm1
+ mulps %xmm3, %xmm1
+ orps %xmm4, %xmm8
+ pxor %xmm5, %xmm8
+ movaps %xmm2, %xmm3
+ mulps %xmm8, %xmm1
+ andnps _Pi2H+__svml_sacos_data_internal(%rip), %xmm3
+ addps %xmm1, %xmm8
+ addps %xmm3, %xmm0
+ addps %xmm8, %xmm0
+ testl %edx, %edx
+
+/* Go to special inputs processing branch */
+ jne L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+ movups 160(%rsp), %xmm8
+ cfi_restore(25)
+ movq %rbp, %rsp
+ popq %rbp
+ cfi_def_cfa(7, 8)
+ cfi_restore(6)
+ ret
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs (|x| > 1 or NaN lanes, flagged in edx)
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+ movups %xmm6, 192(%rsp)
+ movups %xmm0, 256(%rsp)
+ xorl %eax, %eax
+ movups %xmm9, 96(%rsp)
+ movups %xmm10, 80(%rsp)
+ movups %xmm11, 64(%rsp)
+ movups %xmm12, 48(%rsp)
+ movups %xmm13, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 120(%rsp)
+ movq %rdi, 112(%rsp)
+ movq %r12, 144(%rsp)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+ movl %eax, %r12d
+ movq %r13, 136(%rsp)
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+ movl %edx, %r13d
+ movq %r14, 128(%rsp)
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check: r12d = lane index, r13d = special-lane bitmask
+ */
+
+L(RANGEMASK_CHECK):
+ btl %r12d, %r13d
+
+/* Call scalar math function */
+ jc L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+ incl %r12d
+ cmpl $4, %r12d
+
+/* Check bits in range mask */
+ jl L(RANGEMASK_CHECK)
+ movups 96(%rsp), %xmm9
+ cfi_restore(26)
+ movups 80(%rsp), %xmm10
+ cfi_restore(27)
+ movups 64(%rsp), %xmm11
+ cfi_restore(28)
+ movups 48(%rsp), %xmm12
+ cfi_restore(29)
+ movups 32(%rsp), %xmm13
+ cfi_restore(30)
+ movups 16(%rsp), %xmm14
+ cfi_restore(31)
+ movups (%rsp), %xmm15
+ cfi_restore(32)
+ movq 120(%rsp), %rsi
+ cfi_restore(4)
+ movq 112(%rsp), %rdi
+ cfi_restore(5)
+ movq 144(%rsp), %r12
+ cfi_restore(12)
+ movq 136(%rsp), %r13
+ cfi_restore(13)
+ movq 128(%rsp), %r14
+ cfi_restore(14)
+ movups 256(%rsp), %xmm0
+
+/* Go to exit */
+ jmp L(EXIT)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+ movl %r12d, %r14d
+ movss 192(%rsp,%r14,4), %xmm0
+ call acosf@PLT
+ movss %xmm0, 256(%rsp,%r14,4)
+
+/* Process special inputs in loop */
+ jmp L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVbN4v_acosf_sse4)
+
+ .section .rodata, "a"
+ .align 64
+
+#ifdef __svml_sacos_data_internal_typedef /* dead code: macro is never defined; the typedef only documents the table layout */
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(64)) VUINT32 SgnBit[4][1];
+ __declspec(align(64)) VUINT32 OneHalf[4][1];
+ __declspec(align(64)) VUINT32 sRsqrtMsk[4][1];
+ __declspec(align(64)) VUINT32 SmallNorm[4][1];
+ __declspec(align(64)) VUINT32 MOne[4][1];
+ __declspec(align(64)) VUINT32 HalfMask[4][1];
+ __declspec(align(64)) VUINT32 SQMask[4][1];
+ __declspec(align(64)) VUINT32 Two[4][1];
+ __declspec(align(64)) VUINT32 sqrt_coeff[2][4][1];
+ __declspec(align(64)) VUINT32 poly_coeff[5][4][1];
+ __declspec(align(64)) VUINT32 Pi2H[4][1];
+ __declspec(align(64)) VUINT32 Pi2L[4][1];
+ __declspec(align(64)) VUINT32 PiH[4][1];
+ __declspec(align(64)) VUINT32 PiL[4][1];
+ __declspec(align(64)) VUINT32 Zero[4][1];
+ __declspec(align(64)) VUINT32 SgnMask[4][1];
+ __declspec(align(64)) VUINT32 NanMask[4][1];
+ __declspec(align(64)) VUINT32 ep_coeff[3][4][1];
+} __svml_sacos_data_internal_t;
+#endif
+__svml_sacos_data_internal:
+ /*== SgnBit: sign-bit mask ==*/
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+ /*== OneHalf = 0.5f ==*/
+ .align 64
+ .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+ /*== sRsqrtMsk ==*/
+ .align 64
+ .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
+ /*== SmallNorm = 0x1p-32f, rsqrt-input underflow threshold ==*/
+ .align 64
+ .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+ /*== MOne = -1.0f, domain-boundary constant ==*/
+ .align 64
+ .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+ /*== HalfMask ==*/
+ .align 64
+ .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
+ /*== SQMask ==*/
+ .align 64
+ .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
+ /*== Two = 2.0f ==*/
+ .align 64
+ .long 0x40000000, 0x40000000, 0x40000000, 0x40000000
+ /*== sqrt_coeff[2], stored highest-order first ==*/
+ .align 64
+ .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
+ .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
+ /*== poly_coeff[5], stored highest-order first ==*/
+ .align 64
+ .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
+ .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
+ .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
+ .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
+ .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
+ /*== Pi2H = pi/2, high part ==*/
+ .align 64
+ .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+ /*== Pi2L = pi/2, low correction part ==*/
+ .align 64
+ .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
+ /*== PiH = pi, high part ==*/
+ .align 64
+ .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+ /*== PiL = pi, low correction part ==*/
+ .align 64
+ .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
+ /*== zero ==*/
+ .align 64
+ .long 0x00000000, 0x00000000, 0x00000000, 0x00000000
+ /*== sgn mask ==*/
+ .align 64
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+ /*== nan mask ==*/
+ .align 64
+ .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
+ /*== ep_coeff[3], stored highest-order first ==*/
+ .align 64
+ .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
+ .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
+ .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
+ .align 64
+ .type __svml_sacos_data_internal,@object
+ .size __svml_sacos_data_internal,1216
new file mode 100644
@@ -0,0 +1,20 @@
+/* SSE version of vectorized acosf, vector length is 8.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVdN8v_acosf _ZGVdN8v_acosf_sse_wrapper
+#include "../svml_s_acosf8_core.S"
new file mode 100644
@@ -0,0 +1,28 @@
+/* Multiple versions of vectorized acosf, vector length is 8.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVdN8v_acosf
+#include "ifunc-mathvec-avx2.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); /* IFUNC: resolve at load time to the AVX2 variant or the SSE wrapper */
+
+#ifdef SHARED /* hidden alias so intra-libmvec callers bypass the PLT in the shared build */
+__hidden_ver1 (_ZGVdN8v_acosf, __GI__ZGVdN8v_acosf,
+ __redirect__ZGVdN8v_acosf)
+ __attribute__ ((visibility ("hidden")));
+#endif
new file mode 100644
@@ -0,0 +1,332 @@
+/* Function acosf vectorized with AVX2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * SelMask = (|x| >= 0.5) ? 1 : 0;
+ * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ *
+ */
+
+/* Offsets for data table __svml_sacos_data_internal
+ */
+#define _SgnBit 0
+#define _OneHalf 64
+#define _sRsqrtMsk 128
+#define _SmallNorm 192
+#define _MOne 256
+#define _HalfMask 320
+#define _SQMask 384
+#define _Two 448
+#define _sqrt_coeff 512
+#define _poly_coeff 576
+#define _Pi2H 768
+#define _Pi2L 832
+#define _PiH 896
+#define _PiL 960
+#define _Zero 1024
+#define _SgnMask 1088
+#define _NanMask 1152
+#define _ep_coeff 1216
+
+#include <sysdep.h>
+
+ .text
+ .section .text.avx2,"ax",@progbits
+ENTRY(_ZGVdN8v_acosf_avx2)
+ pushq %rbp
+ cfi_def_cfa_offset(16)
+ movq %rsp, %rbp
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+ andq $-64, %rsp
+ subq $384, %rsp
+
+/*
+ * 2*sqrt(X) ~ Sh - Sl (to 24+ bits)
+ * SQ ~ 2*sqrt(X)
+ */
+ vmovups __svml_sacos_data_internal(%rip), %ymm6
+ vmovups _OneHalf+__svml_sacos_data_internal(%rip), %ymm7
+ vmovups %ymm8, 288(%rsp)
+ vmovups %ymm15, 352(%rsp)
+ vmovups %ymm9, 96(%rsp)
+ vmovups _poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm15
+ vmovups %ymm10, 160(%rsp)
+ vmovups %ymm11, 192(%rsp)
+ vmovups %ymm14, 320(%rsp)
+ vmovups %ymm13, 256(%rsp)
+ vmovups %ymm12, 224(%rsp)
+ vmovaps %ymm0, %ymm5
+
+/* x = -|arg| */
+ vorps %ymm5, %ymm6, %ymm4
+
+/* Y = 0.5 + 0.5*(-x) */
+ vfmadd231ps %ymm4, %ymm7, %ymm7
+
+/* x^2 */
+ vmulps %ymm4, %ymm4, %ymm1
+ vcmpnge_uqps _MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm0
+
+/* SQ ~ 2*sqrt(Y) */
+ vaddps %ymm7, %ymm7, %ymm11
+ vminps %ymm7, %ymm1, %ymm2
+ vrsqrtps %ymm7, %ymm8
+ vfmadd213ps _poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm15
+ vmulps %ymm2, %ymm2, %ymm14
+ vcmpnlt_uqps %ymm7, %ymm2, %ymm1
+ vandps %ymm5, %ymm6, %ymm3
+ vcmplt_oqps _SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm6
+ vandnps %ymm8, %ymm6, %ymm9
+
+/* polynomial */
+ vmovups _poly_coeff+__svml_sacos_data_internal(%rip), %ymm6
+ vmulps %ymm9, %ymm9, %ymm10
+ vmulps %ymm11, %ymm9, %ymm13
+ vfmadd213ps _poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm6
+ vfmsub213ps _Two+__svml_sacos_data_internal(%rip), %ymm10, %ymm11
+ vfmadd213ps %ymm15, %ymm14, %ymm6
+ vmulps %ymm11, %ymm13, %ymm12
+ vfmadd213ps _poly_coeff+128+__svml_sacos_data_internal(%rip), %ymm2, %ymm6
+ vmovmskps %ymm0, %edx
+ vmovups _sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0
+ vmulps %ymm6, %ymm2, %ymm9
+
+/* X<X^2 iff X<0 */
+ vcmplt_oqps %ymm2, %ymm5, %ymm6
+ vfmadd213ps _sqrt_coeff+32+__svml_sacos_data_internal(%rip), %ymm11, %ymm0
+ vfnmadd213ps %ymm13, %ymm12, %ymm0
+ vblendvps %ymm1, %ymm0, %ymm4, %ymm4
+ vxorps %ymm3, %ymm4, %ymm3
+ vandps _PiH+__svml_sacos_data_internal(%rip), %ymm1, %ymm2
+ vfmadd213ps %ymm3, %ymm3, %ymm9
+ vandps %ymm6, %ymm2, %ymm2
+ vandnps _Pi2H+__svml_sacos_data_internal(%rip), %ymm1, %ymm7
+ vaddps %ymm7, %ymm2, %ymm8
+ vaddps %ymm9, %ymm8, %ymm0
+ testl %edx, %edx
+
+/* Go to special inputs processing branch */
+ jne L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+ vmovups 288(%rsp), %ymm8
+ cfi_restore(91)
+ vmovups 96(%rsp), %ymm9
+ cfi_restore(92)
+ vmovups 160(%rsp), %ymm10
+ cfi_restore(93)
+ vmovups 192(%rsp), %ymm11
+ cfi_restore(94)
+ vmovups 224(%rsp), %ymm12
+ cfi_restore(95)
+ vmovups 256(%rsp), %ymm13
+ cfi_restore(96)
+ vmovups 320(%rsp), %ymm14
+ cfi_restore(97)
+ vmovups 352(%rsp), %ymm15
+ cfi_restore(98)
+ movq %rbp, %rsp
+ popq %rbp
+ cfi_def_cfa(7, 8)
+ cfi_restore(6)
+ ret
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+ vmovups %ymm5, 64(%rsp)
+ vmovups %ymm0, 128(%rsp)
+
+/* Go to exit */
+ je L(EXIT)
+ xorl %eax, %eax
+ vzeroupper
+ movq %rsi, 8(%rsp)
+ movq %rdi, (%rsp)
+ movq %r12, 32(%rsp)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
+ movl %eax, %r12d
+ movq %r13, 24(%rsp)
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
+ movl %edx, %r13d
+ movq %r14, 16(%rsp)
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+ btl %r12d, %r13d
+
+/* Call scalar math function */
+ jc L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+ incl %r12d
+ cmpl $8, %r12d
+
+/* Check bits in range mask */
+ jl L(RANGEMASK_CHECK)
+ movq 8(%rsp), %rsi
+ cfi_restore(4)
+ movq (%rsp), %rdi
+ cfi_restore(5)
+ movq 32(%rsp), %r12
+ cfi_restore(12)
+ movq 24(%rsp), %r13
+ cfi_restore(13)
+ movq 16(%rsp), %r14
+ cfi_restore(14)
+ vmovups 128(%rsp), %ymm0
+
+/* Go to exit */
+ jmp L(EXIT)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+ movl %r12d, %r14d
+ movss 64(%rsp,%r14,4), %xmm0
+ call acosf@PLT
+ movss %xmm0, 128(%rsp,%r14,4)
+
+/* Process special inputs in loop */
+ jmp L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVdN8v_acosf_avx2)
+
+ .section .rodata, "a"
+ .align 64
+
+#ifdef __svml_sacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(64)) VUINT32 SgnBit[8][1];
+ __declspec(align(64)) VUINT32 OneHalf[8][1];
+ __declspec(align(64)) VUINT32 sRsqrtMsk[8][1];
+ __declspec(align(64)) VUINT32 SmallNorm[8][1];
+ __declspec(align(64)) VUINT32 MOne[8][1];
+ __declspec(align(64)) VUINT32 HalfMask[8][1];
+ __declspec(align(64)) VUINT32 SQMask[8][1];
+ __declspec(align(64)) VUINT32 Two[8][1];
+ __declspec(align(64)) VUINT32 sqrt_coeff[2][8][1];
+ __declspec(align(64)) VUINT32 poly_coeff[5][8][1];
+ __declspec(align(64)) VUINT32 Pi2H[8][1];
+ __declspec(align(64)) VUINT32 Pi2L[8][1];
+ __declspec(align(64)) VUINT32 PiH[8][1];
+ __declspec(align(64)) VUINT32 PiL[8][1];
+ __declspec(align(64)) VUINT32 Zero[8][1];
+ __declspec(align(64)) VUINT32 SgnMask[8][1];
+ __declspec(align(64)) VUINT32 NanMask[8][1];
+ __declspec(align(64)) VUINT32 ep_coeff[3][8][1];
+} __svml_sacos_data_internal_t;
+#endif
+__svml_sacos_data_internal:
+ /*== SgnBit ==*/
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+ /*== OneHalf ==*/
+ .align 64
+ .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+ /*== sRsqrtMsk ==*/
+ .align 64
+ .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
+ /*== SmallNorm ==*/
+ .align 64
+ .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+ /*== MOne ==*/
+ .align 64
+ .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+ /*== HalfMask ==*/
+ .align 64
+ .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
+ /*== SQMask ==*/
+ .align 64
+ .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
+ /*== Two ==*/
+ .align 64
+ .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
+ /*== sqrt_coeff[2] ==*/
+ .align 64
+ .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
+ .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
+ /*== poly_coeff[5] ==*/
+ .align 64
+ .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
+ .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
+ .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
+ .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
+ .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
+ /*== Pi2H ==*/
+ .align 64
+ .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+ /*== Pi2L ==*/
+ .align 64
+ .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
+ /*== PiH ==*/
+ .align 64
+ .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+ /*== PiL ==*/
+ .align 64
+ .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
+ /*== Zero ==*/
+ .align 64
+ .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
+ /*== SgnMask ==*/
+ .align 64
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+ /*== NanMask ==*/
+ .align 64
+ .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
+ /*== ep_coeff[3] ==*/
+ .align 64
+ .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
+ .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
+ .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
+ .align 64
+ .type __svml_sacos_data_internal,@object
+ .size __svml_sacos_data_internal,1344
new file mode 100644
@@ -0,0 +1,29 @@
+/* Function acos vectorized with SSE2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVbN2v_acos)
+WRAPPER_IMPL_SSE2 acos
+END (_ZGVbN2v_acos)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVbN2v_acos)
+#endif
new file mode 100644
@@ -0,0 +1,29 @@
+/* Function acos vectorized with AVX2, wrapper version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVdN4v_acos)
+WRAPPER_IMPL_AVX _ZGVbN2v_acos
+END (_ZGVdN4v_acos)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVdN4v_acos)
+#endif
new file mode 100644
@@ -0,0 +1,25 @@
+/* Function acos vectorized in AVX ISA as wrapper to SSE4 ISA version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVcN4v_acos)
+WRAPPER_IMPL_AVX _ZGVbN2v_acos
+END (_ZGVcN4v_acos)
new file mode 100644
@@ -0,0 +1,25 @@
+/* Function acos vectorized with AVX-512, wrapper to AVX2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN8v_acos)
+WRAPPER_IMPL_AVX512 _ZGVdN4v_acos
+END (_ZGVeN8v_acos)
new file mode 100644
@@ -0,0 +1,25 @@
+/* Function acosf vectorized with AVX-512. Wrapper to AVX2 version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN16v_acosf)
+WRAPPER_IMPL_AVX512 _ZGVdN8v_acosf
+END (_ZGVeN16v_acosf)
new file mode 100644
@@ -0,0 +1,29 @@
+/* Function acosf vectorized with SSE2, wrapper version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVbN4v_acosf)
+WRAPPER_IMPL_SSE2 acosf
+END (_ZGVbN4v_acosf)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVbN4v_acosf)
+#endif
new file mode 100644
@@ -0,0 +1,29 @@
+/* Function acosf vectorized with AVX2, wrapper version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVdN8v_acosf)
+WRAPPER_IMPL_AVX _ZGVbN4v_acosf
+END (_ZGVdN8v_acosf)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVdN8v_acosf)
+#endif
new file mode 100644
@@ -0,0 +1,25 @@
+/* Function acosf vectorized in AVX ISA as wrapper to SSE4 ISA version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVcN8v_acosf)
+WRAPPER_IMPL_AVX _ZGVbN4v_acosf
+END (_ZGVcN8v_acosf)
new file mode 100644
@@ -0,0 +1 @@
+#include "test-double-libmvec-acos.c"
new file mode 100644
@@ -0,0 +1 @@
+#include "test-double-libmvec-acos.c"
new file mode 100644
@@ -0,0 +1 @@
+#include "test-double-libmvec-acos.c"
new file mode 100644
@@ -0,0 +1,3 @@
+#define LIBMVEC_TYPE double
+#define LIBMVEC_FUNC acos
+#include "test-vector-abi-arg1.h"
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
#define VEC_INT_TYPE __m128i
@@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin)
VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
#ifndef __ILP32__
# define VEC_INT_TYPE __m256i
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin)
VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
#define VEC_INT_TYPE __m128i
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin)
VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
#ifndef __ILP32__
# define VEC_INT_TYPE __m512i
new file mode 100644
@@ -0,0 +1 @@
+#include "test-float-libmvec-acosf.c"
new file mode 100644
@@ -0,0 +1 @@
+#include "test-float-libmvec-acosf.c"
new file mode 100644
@@ -0,0 +1 @@
+#include "test-float-libmvec-acosf.c"
new file mode 100644
@@ -0,0 +1,3 @@
+#define LIBMVEC_TYPE float
+#define LIBMVEC_FUNC acosf
+#include "test-vector-abi-arg1.h"
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf)
VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
#define VEC_INT_TYPE __m512i
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
#define VEC_INT_TYPE __m128i
@@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf)
VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
#undef VECTOR_WRAPPER_fFF
@@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf)
VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
#define VEC_INT_TYPE __m128i