[v5,01/18] x86-64: Add vector atan/atanf implementation to libmvec
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
Commit Message
Implement vectorized atan/atanf containing SSE, AVX, AVX2 and
AVX512 versions for libmvec as per vector ABI. It also contains
accuracy and ABI tests for vector atan/atanf with regenerated ulps.
---
bits/libm-simd-decl-stubs.h | 11 +
math/bits/mathcalls.h | 2 +-
.../unix/sysv/linux/x86_64/libmvec.abilist | 8 +
sysdeps/x86/fpu/bits/math-vector.h | 4 +
.../x86/fpu/finclude/math-vector-fortran.h | 4 +
sysdeps/x86_64/fpu/Makeconfig | 1 +
sysdeps/x86_64/fpu/Versions | 2 +
sysdeps/x86_64/fpu/libm-test-ulps | 20 ++
.../fpu/multiarch/svml_d_atan2_core-sse2.S | 20 ++
.../x86_64/fpu/multiarch/svml_d_atan2_core.c | 27 ++
.../fpu/multiarch/svml_d_atan2_core_sse4.S | 245 ++++++++++++++++++
.../fpu/multiarch/svml_d_atan4_core-sse.S | 20 ++
.../x86_64/fpu/multiarch/svml_d_atan4_core.c | 27 ++
.../fpu/multiarch/svml_d_atan4_core_avx2.S | 225 ++++++++++++++++
.../fpu/multiarch/svml_d_atan8_core-avx2.S | 20 ++
.../x86_64/fpu/multiarch/svml_d_atan8_core.c | 27 ++
.../fpu/multiarch/svml_d_atan8_core_avx512.S | 213 +++++++++++++++
.../fpu/multiarch/svml_s_atanf16_core-avx2.S | 20 ++
.../fpu/multiarch/svml_s_atanf16_core.c | 28 ++
.../multiarch/svml_s_atanf16_core_avx512.S | 174 +++++++++++++
.../fpu/multiarch/svml_s_atanf4_core-sse2.S | 20 ++
.../x86_64/fpu/multiarch/svml_s_atanf4_core.c | 28 ++
.../fpu/multiarch/svml_s_atanf4_core_sse4.S | 164 ++++++++++++
.../fpu/multiarch/svml_s_atanf8_core-sse.S | 20 ++
.../x86_64/fpu/multiarch/svml_s_atanf8_core.c | 28 ++
.../fpu/multiarch/svml_s_atanf8_core_avx2.S | 148 +++++++++++
sysdeps/x86_64/fpu/svml_d_atan2_core.S | 29 +++
sysdeps/x86_64/fpu/svml_d_atan4_core.S | 29 +++
sysdeps/x86_64/fpu/svml_d_atan4_core_avx.S | 25 ++
sysdeps/x86_64/fpu/svml_d_atan8_core.S | 25 ++
sysdeps/x86_64/fpu/svml_s_atanf16_core.S | 25 ++
sysdeps/x86_64/fpu/svml_s_atanf4_core.S | 29 +++
sysdeps/x86_64/fpu/svml_s_atanf8_core.S | 29 +++
sysdeps/x86_64/fpu/svml_s_atanf8_core_avx.S | 25 ++
.../x86_64/fpu/test-double-libmvec-atan-avx.c | 1 +
.../fpu/test-double-libmvec-atan-avx2.c | 1 +
.../fpu/test-double-libmvec-atan-avx512f.c | 1 +
sysdeps/x86_64/fpu/test-double-libmvec-atan.c | 3 +
.../x86_64/fpu/test-double-vlen2-wrappers.c | 1 +
.../fpu/test-double-vlen4-avx2-wrappers.c | 1 +
.../x86_64/fpu/test-double-vlen4-wrappers.c | 1 +
.../x86_64/fpu/test-double-vlen8-wrappers.c | 1 +
.../x86_64/fpu/test-float-libmvec-atanf-avx.c | 1 +
.../fpu/test-float-libmvec-atanf-avx2.c | 1 +
.../fpu/test-float-libmvec-atanf-avx512f.c | 1 +
sysdeps/x86_64/fpu/test-float-libmvec-atanf.c | 3 +
.../x86_64/fpu/test-float-vlen16-wrappers.c | 1 +
.../x86_64/fpu/test-float-vlen4-wrappers.c | 1 +
.../fpu/test-float-vlen8-avx2-wrappers.c | 1 +
.../x86_64/fpu/test-float-vlen8-wrappers.c | 1 +
50 files changed, 1741 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core-sse2.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core_sse4.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core-sse.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core_avx2.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core-avx2.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core_avx512.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core-avx2.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core-sse2.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core-sse.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S
create mode 100644 sysdeps/x86_64/fpu/svml_d_atan2_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_d_atan4_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_d_atan4_core_avx.S
create mode 100644 sysdeps/x86_64/fpu/svml_d_atan8_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_atanf16_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_atanf4_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_atanf8_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_atanf8_core_avx.S
create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-atan-avx.c
create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-atan-avx2.c
create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-atan-avx512f.c
create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-atan.c
create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx.c
create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx2.c
create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx512f.c
create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-atanf.c
Comments
On Tue, Dec 28, 2021 at 10:39:43PM -0800, Sunil K Pandey wrote:
> Implement vectorized atan/atanf containing SSE, AVX, AVX2 and
> AVX512 versions for libmvec as per vector ABI. It also contains
> accuracy and ABI tests for vector atan/atanf with regenerated ulps.
> ---
> bits/libm-simd-decl-stubs.h | 11 +
> math/bits/mathcalls.h | 2 +-
> .../unix/sysv/linux/x86_64/libmvec.abilist | 8 +
> sysdeps/x86/fpu/bits/math-vector.h | 4 +
> .../x86/fpu/finclude/math-vector-fortran.h | 4 +
> sysdeps/x86_64/fpu/Makeconfig | 1 +
> sysdeps/x86_64/fpu/Versions | 2 +
> sysdeps/x86_64/fpu/libm-test-ulps | 20 ++
> .../fpu/multiarch/svml_d_atan2_core-sse2.S | 20 ++
> .../x86_64/fpu/multiarch/svml_d_atan2_core.c | 27 ++
> .../fpu/multiarch/svml_d_atan2_core_sse4.S | 245 ++++++++++++++++++
> .../fpu/multiarch/svml_d_atan4_core-sse.S | 20 ++
> .../x86_64/fpu/multiarch/svml_d_atan4_core.c | 27 ++
> .../fpu/multiarch/svml_d_atan4_core_avx2.S | 225 ++++++++++++++++
> .../fpu/multiarch/svml_d_atan8_core-avx2.S | 20 ++
> .../x86_64/fpu/multiarch/svml_d_atan8_core.c | 27 ++
> .../fpu/multiarch/svml_d_atan8_core_avx512.S | 213 +++++++++++++++
> .../fpu/multiarch/svml_s_atanf16_core-avx2.S | 20 ++
> .../fpu/multiarch/svml_s_atanf16_core.c | 28 ++
> .../multiarch/svml_s_atanf16_core_avx512.S | 174 +++++++++++++
> .../fpu/multiarch/svml_s_atanf4_core-sse2.S | 20 ++
> .../x86_64/fpu/multiarch/svml_s_atanf4_core.c | 28 ++
> .../fpu/multiarch/svml_s_atanf4_core_sse4.S | 164 ++++++++++++
> .../fpu/multiarch/svml_s_atanf8_core-sse.S | 20 ++
> .../x86_64/fpu/multiarch/svml_s_atanf8_core.c | 28 ++
> .../fpu/multiarch/svml_s_atanf8_core_avx2.S | 148 +++++++++++
> sysdeps/x86_64/fpu/svml_d_atan2_core.S | 29 +++
> sysdeps/x86_64/fpu/svml_d_atan4_core.S | 29 +++
> sysdeps/x86_64/fpu/svml_d_atan4_core_avx.S | 25 ++
> sysdeps/x86_64/fpu/svml_d_atan8_core.S | 25 ++
> sysdeps/x86_64/fpu/svml_s_atanf16_core.S | 25 ++
> sysdeps/x86_64/fpu/svml_s_atanf4_core.S | 29 +++
> sysdeps/x86_64/fpu/svml_s_atanf8_core.S | 29 +++
> sysdeps/x86_64/fpu/svml_s_atanf8_core_avx.S | 25 ++
> .../x86_64/fpu/test-double-libmvec-atan-avx.c | 1 +
> .../fpu/test-double-libmvec-atan-avx2.c | 1 +
> .../fpu/test-double-libmvec-atan-avx512f.c | 1 +
> sysdeps/x86_64/fpu/test-double-libmvec-atan.c | 3 +
> .../x86_64/fpu/test-double-vlen2-wrappers.c | 1 +
> .../fpu/test-double-vlen4-avx2-wrappers.c | 1 +
> .../x86_64/fpu/test-double-vlen4-wrappers.c | 1 +
> .../x86_64/fpu/test-double-vlen8-wrappers.c | 1 +
> .../x86_64/fpu/test-float-libmvec-atanf-avx.c | 1 +
> .../fpu/test-float-libmvec-atanf-avx2.c | 1 +
> .../fpu/test-float-libmvec-atanf-avx512f.c | 1 +
> sysdeps/x86_64/fpu/test-float-libmvec-atanf.c | 3 +
> .../x86_64/fpu/test-float-vlen16-wrappers.c | 1 +
> .../x86_64/fpu/test-float-vlen4-wrappers.c | 1 +
> .../fpu/test-float-vlen8-avx2-wrappers.c | 1 +
> .../x86_64/fpu/test-float-vlen8-wrappers.c | 1 +
> 50 files changed, 1741 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core-sse2.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core_sse4.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core-sse.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core_avx2.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core-avx2.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core_avx512.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core-avx2.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core-sse2.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core-sse.S
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core.c
> create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S
> create mode 100644 sysdeps/x86_64/fpu/svml_d_atan2_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_d_atan4_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_d_atan4_core_avx.S
> create mode 100644 sysdeps/x86_64/fpu/svml_d_atan8_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_s_atanf16_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_s_atanf4_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_s_atanf8_core.S
> create mode 100644 sysdeps/x86_64/fpu/svml_s_atanf8_core_avx.S
> create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-atan-avx.c
> create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-atan-avx2.c
> create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-atan-avx512f.c
> create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-atan.c
> create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx.c
> create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx2.c
> create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx512f.c
> create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-atanf.c
>
> diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h
> index 2ccdd1fc53..b4647ca918 100644
> --- a/bits/libm-simd-decl-stubs.h
> +++ b/bits/libm-simd-decl-stubs.h
> @@ -109,4 +109,15 @@
> #define __DECL_SIMD_acosf32x
> #define __DECL_SIMD_acosf64x
> #define __DECL_SIMD_acosf128x
> +
> +#define __DECL_SIMD_atan
> +#define __DECL_SIMD_atanf
> +#define __DECL_SIMD_atanl
> +#define __DECL_SIMD_atanf16
> +#define __DECL_SIMD_atanf32
> +#define __DECL_SIMD_atanf64
> +#define __DECL_SIMD_atanf128
> +#define __DECL_SIMD_atanf32x
> +#define __DECL_SIMD_atanf64x
> +#define __DECL_SIMD_atanf128x
> #endif
> diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
> index 2cc6654208..3e27c21f21 100644
> --- a/math/bits/mathcalls.h
> +++ b/math/bits/mathcalls.h
> @@ -54,7 +54,7 @@ __MATHCALL_VEC (acos,, (_Mdouble_ __x));
> /* Arc sine of X. */
> __MATHCALL (asin,, (_Mdouble_ __x));
> /* Arc tangent of X. */
> -__MATHCALL (atan,, (_Mdouble_ __x));
> +__MATHCALL_VEC (atan,, (_Mdouble_ __x));
> /* Arc tangent of Y/X. */
> __MATHCALL (atan2,, (_Mdouble_ __y, _Mdouble_ __x));
>
> diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> index b37b55777e..a93258db6f 100644
> --- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> @@ -47,10 +47,18 @@ GLIBC_2.22 _ZGVeN8v_sin F
> GLIBC_2.22 _ZGVeN8vv_pow F
> GLIBC_2.22 _ZGVeN8vvv_sincos F
> GLIBC_2.35 _ZGVbN2v_acos F
> +GLIBC_2.35 _ZGVbN2v_atan F
> GLIBC_2.35 _ZGVbN4v_acosf F
> +GLIBC_2.35 _ZGVbN4v_atanf F
> GLIBC_2.35 _ZGVcN4v_acos F
> +GLIBC_2.35 _ZGVcN4v_atan F
> GLIBC_2.35 _ZGVcN8v_acosf F
> +GLIBC_2.35 _ZGVcN8v_atanf F
> GLIBC_2.35 _ZGVdN4v_acos F
> +GLIBC_2.35 _ZGVdN4v_atan F
> GLIBC_2.35 _ZGVdN8v_acosf F
> +GLIBC_2.35 _ZGVdN8v_atanf F
> GLIBC_2.35 _ZGVeN16v_acosf F
> +GLIBC_2.35 _ZGVeN16v_atanf F
> GLIBC_2.35 _ZGVeN8v_acos F
> +GLIBC_2.35 _ZGVeN8v_atan F
> diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h
> index dabb74cbb9..1c0e5c5e35 100644
> --- a/sysdeps/x86/fpu/bits/math-vector.h
> +++ b/sysdeps/x86/fpu/bits/math-vector.h
> @@ -62,6 +62,10 @@
> # define __DECL_SIMD_acos __DECL_SIMD_x86_64
> # undef __DECL_SIMD_acosf
> # define __DECL_SIMD_acosf __DECL_SIMD_x86_64
> +# undef __DECL_SIMD_atan
> +# define __DECL_SIMD_atan __DECL_SIMD_x86_64
> +# undef __DECL_SIMD_atanf
> +# define __DECL_SIMD_atanf __DECL_SIMD_x86_64
>
> # endif
> #endif
> diff --git a/sysdeps/x86/fpu/finclude/math-vector-fortran.h b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> index 4bcbd1fbce..ddcccb11d7 100644
> --- a/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> +++ b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> @@ -30,6 +30,8 @@
> !GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
> !GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
> !GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
> +!GCC$ builtin (atan) attributes simd (notinbranch) if('x86_64')
> +!GCC$ builtin (atanf) attributes simd (notinbranch) if('x86_64')
>
> !GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
> !GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
> @@ -45,3 +47,5 @@
> !GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
> !GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
> !GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
> +!GCC$ builtin (atan) attributes simd (notinbranch) if('x32')
> +!GCC$ builtin (atanf) attributes simd (notinbranch) if('x32')
> diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig
> index 7acf1f306c..dae0887f13 100644
> --- a/sysdeps/x86_64/fpu/Makeconfig
> +++ b/sysdeps/x86_64/fpu/Makeconfig
> @@ -23,6 +23,7 @@ postclean-generated += libmvec.mk
> # Define for both math and mathvec directories.
> libmvec-funcs = \
> acos \
> + atan \
> cos \
> exp \
> log \
> diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions
> index 2985fe7ca7..424f6d526e 100644
> --- a/sysdeps/x86_64/fpu/Versions
> +++ b/sysdeps/x86_64/fpu/Versions
> @@ -15,6 +15,8 @@ libmvec {
> }
> GLIBC_2.35 {
> _ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
> + _ZGVbN2v_atan; _ZGVcN4v_atan; _ZGVdN4v_atan; _ZGVeN8v_atan;
> _ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
> + _ZGVbN4v_atanf; _ZGVcN8v_atanf; _ZGVdN8v_atanf; _ZGVeN16v_atanf;
> }
> }
> diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
> index 6c12976c82..2e64e59803 100644
> --- a/sysdeps/x86_64/fpu/libm-test-ulps
> +++ b/sysdeps/x86_64/fpu/libm-test-ulps
> @@ -164,6 +164,26 @@ float: 2
> float128: 2
> ldouble: 1
>
> +Function: "atan_vlen16":
> +float: 1
> +
> +Function: "atan_vlen2":
> +double: 1
> +
> +Function: "atan_vlen4":
> +double: 1
> +float: 1
> +
> +Function: "atan_vlen4_avx2":
> +double: 1
> +
> +Function: "atan_vlen8":
> +double: 1
> +float: 1
> +
> +Function: "atan_vlen8_avx2":
> +float: 1
> +
> Function: "atanh":
> double: 2
> float: 2
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core-sse2.S
> new file mode 100644
> index 0000000000..115e5223aa
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core-sse2.S
> @@ -0,0 +1,20 @@
> +/* SSE2 version of vectorized atan, vector length is 2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define _ZGVbN2v_atan _ZGVbN2v_atan_sse2
> +#include "../svml_d_atan2_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core.c
> new file mode 100644
> index 0000000000..93f079ffcb
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized atan, vector length is 2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define SYMBOL_NAME _ZGVbN2v_atan
> +#include "ifunc-mathvec-sse4_1.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVbN2v_atan, __GI__ZGVbN2v_atan, __redirect__ZGVbN2v_atan)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core_sse4.S
> new file mode 100644
> index 0000000000..f0ad036b9e
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan2_core_sse4.S
> @@ -0,0 +1,245 @@
> +/* Function atan vectorized with SSE4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
> + * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
> + * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
> + * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
> + * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
> + * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
> + *
> + */
> +
> +/* Offsets for data table __svml_datan_data_internal_avx512
> + */
> +#define AbsMask 0
> +#define Shifter 16
> +#define MaxThreshold 32
> +#define MOne 48
> +#define One 64
> +#define LargeX 80
> +#define Zero 96
> +#define Tbl_H 112
> +#define Tbl_L 368
> +#define dIndexMed 624
> +#define Pi2 640
> +#define Pi2_low 656
> +#define coeff 672
> +
> +#include <sysdep.h>
> +
> + .text
> + .section .text.sse4,"ax",@progbits
> +ENTRY(_ZGVbN2v_atan_sse4)
> + lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rcx
> + movups __svml_datan_data_internal_avx512(%rip), %xmm4
> + movups Shifter+__svml_datan_data_internal_avx512(%rip), %xmm3
> + andps %xmm0, %xmm4
> + movaps %xmm3, %xmm12
> + movaps %xmm4, %xmm5
> + addpd %xmm4, %xmm12
> + movaps %xmm12, %xmm7
> +
> +/*
> + * table lookup sequence
> + * VPERMUTE not available
> + */
> + movaps %xmm12, %xmm10
> + subpd %xmm3, %xmm7
> + subpd %xmm7, %xmm5
> + mulpd %xmm4, %xmm7
> + movups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %xmm2
> + psllq $3, %xmm10
> +
> +/* saturate X range */
> + movups LargeX+__svml_datan_data_internal_avx512(%rip), %xmm8
> + pxor %xmm4, %xmm0
> + cmplepd %xmm4, %xmm2
> + addpd One+__svml_datan_data_internal_avx512(%rip), %xmm7
> + minpd %xmm4, %xmm8
> + movups MOne+__svml_datan_data_internal_avx512(%rip), %xmm6
> + movaps %xmm2, %xmm1
> + movaps %xmm2, %xmm9
> + andnps %xmm5, %xmm1
> + andps %xmm2, %xmm6
> + andnps %xmm7, %xmm9
> + andps %xmm2, %xmm8
> + orps %xmm6, %xmm1
> + orps %xmm8, %xmm9
> +
> +/* R+Rl = DiffX/Y */
> + divpd %xmm9, %xmm1
> + pand .FLT_11(%rip), %xmm10
> +
> +/* set table value to Pi/2 for large X */
> + movups Pi2+__svml_datan_data_internal_avx512(%rip), %xmm4
> + movd %xmm10, %eax
> + andps %xmm2, %xmm4
> + pshufd $2, %xmm10, %xmm11
> + movaps %xmm2, %xmm10
> +
> +/* polynomial evaluation */
> + movaps %xmm1, %xmm2
> + mulpd %xmm1, %xmm2
> + movd %xmm11, %edx
> + movups coeff+__svml_datan_data_internal_avx512(%rip), %xmm5
> + movaps %xmm2, %xmm7
> + movups coeff+32+__svml_datan_data_internal_avx512(%rip), %xmm6
> + movaps %xmm2, %xmm9
> + mulpd %xmm2, %xmm5
> + mulpd %xmm2, %xmm7
> + addpd coeff+16+__svml_datan_data_internal_avx512(%rip), %xmm5
> + mulpd %xmm2, %xmm6
> + mulpd %xmm7, %xmm5
> + addpd coeff+48+__svml_datan_data_internal_avx512(%rip), %xmm6
> + mulpd %xmm1, %xmm9
> + addpd %xmm5, %xmm6
> + movups coeff+64+__svml_datan_data_internal_avx512(%rip), %xmm8
> + mulpd %xmm2, %xmm8
> + mulpd %xmm6, %xmm7
> + addpd coeff+80+__svml_datan_data_internal_avx512(%rip), %xmm8
> + addpd %xmm7, %xmm8
> + mulpd %xmm8, %xmm9
> + movups dIndexMed+__svml_datan_data_internal_avx512(%rip), %xmm14
> + cmplepd %xmm12, %xmm14
> + addpd %xmm9, %xmm1
> + movslq %eax, %rax
> + movaps %xmm14, %xmm3
> + movslq %edx, %rdx
> + movsd -128(%rax,%rcx), %xmm13
> + movsd (%rcx,%rax), %xmm15
> + movhpd -128(%rdx,%rcx), %xmm13
> + movhpd (%rcx,%rdx), %xmm15
> + andnps %xmm13, %xmm3
> + andps %xmm14, %xmm15
> + orps %xmm15, %xmm3
> + andnps %xmm3, %xmm10
> + orps %xmm4, %xmm10
> + addpd %xmm1, %xmm10
> + pxor %xmm10, %xmm0
> + ret
> +
> +END(_ZGVbN2v_atan_sse4)
> +
> + .section .rodata, "a"
> + .align 16
> +
> +#ifdef __svml_datan_data_internal_avx512_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(16)) VUINT32 AbsMask[2][2];
> + __declspec(align(16)) VUINT32 Shifter[2][2];
> + __declspec(align(16)) VUINT32 MaxThreshold[2][2];
> + __declspec(align(16)) VUINT32 MOne[2][2];
> + __declspec(align(16)) VUINT32 One[2][2];
> + __declspec(align(16)) VUINT32 LargeX[2][2];
> + __declspec(align(16)) VUINT32 Zero[2][2];
> + __declspec(align(16)) VUINT32 Tbl_H[32][2];
> + __declspec(align(16)) VUINT32 Tbl_L[32][2];
> + __declspec(align(16)) VUINT32 dIndexMed[2][2];
> + __declspec(align(16)) VUINT32 Pi2[2][2];
> + __declspec(align(16)) VUINT32 Pi2_low[2][2];
> + __declspec(align(16)) VUINT32 coeff[6][2][2];
> + } __svml_datan_data_internal_avx512;
> +#endif
> +__svml_datan_data_internal_avx512:
> + /*== AbsMask ==*/
> + .quad 0x7fffffffffffffff, 0x7fffffffffffffff
> + /*== Shifter ==*/
> + .align 16
> + .quad 0x4318000000000000, 0x4318000000000000
> + /*== MaxThreshold ==*/
> + .align 16
> + .quad 0x401f800000000000, 0x401f800000000000
> + /*== MOne ==*/
> + .align 16
> + .quad 0xbff0000000000000, 0xbff0000000000000
> + /*== One ==*/
> + .align 16
> + .quad 0x3ff0000000000000, 0x3ff0000000000000
> + /*== LargeX ==*/
> + .align 16
> + .quad 0x47f0000000000000, 0x47f0000000000000
> + /*== Zero ==*/
> + .align 16
> + .quad 0x0000000000000000, 0x0000000000000000
> + /*== Tbl_H ==*/
> + .align 16
> + .quad 0x0000000000000000, 0x3fcf5b75f92c80dd
> + .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
> + .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
> + .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
> + .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
> + .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
> + .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
> + .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
> + .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
> + .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
> + .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
> + .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
> + .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
> + .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
> + .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
> + .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
> + /*== Tbl_L ==*/
> + .align 16
> + .quad 0x0000000000000000, 0x3c68ab6e3cf7afbd
> + .quad 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
> + .quad 0x3c81a62633145c07, 0x3c80dae13ad18a6b
> + .quad 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
> + .quad 0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
> + .quad 0x3c96254cb03bb199, 0xbc812c77e8a80f5c
> + .quad 0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
> + .quad 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
> + .quad 0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
> + .quad 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
> + .quad 0xbc831151a43b51ca, 0xbc8487d50bceb1a5
> + .quad 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
> + .quad 0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
> + .quad 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
> + .quad 0xbc929c86447928e7, 0xbc8957a7170df016
> + .quad 0xbc7cbe1896221608, 0xbc9fda5797b32a0b
> + /*== dIndexMed ==*/
> + .align 16
> + .quad 0x4318000000000010, 0x4318000000000010
> + /*== Pi2 ==*/
> + .align 16
> + .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
> + /*== Pi2_low ==*/
> + .align 16
> + .quad 0x3c91a62633145c07, 0x3c91a62633145c07
> + /*== coeff6 ==*/
> + .align 16
> + .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
> + .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc
> + .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
> + .quad 0xbfc249248eef04da, 0xbfc249248eef04da
> + .quad 0x3fc999999998741e, 0x3fc999999998741e
> + .quad 0xbfd555555555554d, 0xbfd555555555554d
> + .align 16
> + .type __svml_datan_data_internal_avx512,@object
> + .size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
> + .align 16
> +
> +.FLT_11:
> + .long 0x00000078,0x00000000,0x00000078,0x00000000
> + .type .FLT_11,@object
> + .size .FLT_11,16
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core-sse.S
> new file mode 100644
> index 0000000000..79c48dbc91
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core-sse.S
> @@ -0,0 +1,20 @@
> +/* SSE version of vectorized atan, vector length is 4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define _ZGVdN4v_atan _ZGVdN4v_atan_sse_wrapper
> +#include "../svml_d_atan4_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core.c
> new file mode 100644
> index 0000000000..64ce66b9fd
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized atan, vector length is 4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define SYMBOL_NAME _ZGVdN4v_atan
> +#include "ifunc-mathvec-avx2.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVdN4v_atan, __GI__ZGVdN4v_atan, __redirect__ZGVdN4v_atan)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core_avx2.S
> new file mode 100644
> index 0000000000..50336514d7
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan4_core_avx2.S
> @@ -0,0 +1,225 @@
> +/* Function atan vectorized with AVX2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
> + * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
> + * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
> + * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
> + * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
> + * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
> + *
> + */
> +
> +/* Offsets for data table __svml_datan_data_internal_avx512
> + */
> +#define AbsMask 0
> +#define Shifter 32
> +#define MaxThreshold 64
> +#define MOne 96
> +#define One 128
> +#define LargeX 160
> +#define Zero 192
> +#define Tbl_H 224
> +#define Tbl_L 480
> +#define dIndexMed 736
> +#define Pi2 768
> +#define Pi2_low 800
> +#define coeff 832
> +
> +#include <sysdep.h>
> +
> + .text
> + .section .text.avx2,"ax",@progbits
> +/* atan of the four doubles in %ymm0; the result is returned in %ymm0.
> +   Writes %rax, %rcx, %rdx, %rsi, %rdi and %ymm1-%ymm15.  */
> +ENTRY(_ZGVdN4v_atan_avx2)
> +/* %rdi = Tbl_H + 128, so the first 16 entries are reached with a -128
> +   displacement and the last 16 entries with displacement 0.  */
> + lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rdi
> + vmovupd Shifter+__svml_datan_data_internal_avx512(%rip), %ymm4
> + vmovupd One+__svml_datan_data_internal_avx512(%rip), %ymm9
> +
> +/* saturate X range */
> + vmovupd LargeX+__svml_datan_data_internal_avx512(%rip), %ymm6
> +/* ymm7 = |x| (AbsMask sits at offset 0 of the table) */
> + vandpd __svml_datan_data_internal_avx512(%rip), %ymm0, %ymm7
> +/* ymm2 = |x| + Shifter; its low mantissa bits are used as table index */
> + vaddpd %ymm4, %ymm7, %ymm2
> +/* ymm3 = per-lane mask of |x| >= MaxThreshold ("large" lanes) */
> + vcmpge_oqpd MaxThreshold+__svml_datan_data_internal_avx512(%rip), %ymm7, %ymm3
> +/* ymm10 = min (LargeX, |x|) */
> + vminpd %ymm7, %ymm6, %ymm10
> +/* ymm5 = (|x| + Shifter) - Shifter; presumably |x| rounded to the
> +   table grid (the AVX-512 variant calls this "round to 2 bits after
> +   binary point") */
> + vsubpd %ymm4, %ymm2, %ymm5
> +
> +/*
> + * table lookup sequence
> + * VPERMUTE not available
> + */
> +/* index * 8 = byte offset of an 8-byte table entry */
> + vpsllq $3, %ymm2, %ymm13
> +/* ymm8 = DiffX = |x| - ymm5 */
> + vsubpd %ymm5, %ymm7, %ymm8
> +/* ymm2 = mask of lanes whose shifted value is >= dIndexMed; used below
> +   to pick the upper half of the table */
> + vcmpge_oqpd dIndexMed+__svml_datan_data_internal_avx512(%rip), %ymm2, %ymm2
> +/* ymm9 = Y = 1.0 + |x| * ymm5 */
> + vfmadd231pd %ymm7, %ymm5, %ymm9
> +/* keep only (index & 15) * 8 in each 64-bit lane */
> + vpand .FLT_11(%rip), %ymm13, %ymm14
> +/* large lanes: numerator = -1.0, denominator = min (LargeX, |x|) */
> + vblendvpd %ymm3, MOne+__svml_datan_data_internal_avx512(%rip), %ymm8, %ymm11
> + vblendvpd %ymm3, %ymm10, %ymm9, %ymm12
> +/* ymm1 = x ^ |x| = sign bit of x, re-applied to the result at the end */
> + vxorpd %ymm0, %ymm7, %ymm1
> +
> +/* R+Rl = DiffX/Y */
> + vdivpd %ymm12, %ymm11, %ymm0
> +/* Move the four byte offsets to %rax (lane 0), %rdx (lane 1),
> +   %rcx (lane 2) and %rsi (lane 3).  */
> + vextractf128 $1, %ymm14, %xmm4
> + vmovd %xmm14, %eax
> + vmovd %xmm4, %ecx
> + movslq %eax, %rax
> + vpextrd $2, %xmm14, %edx
> + movslq %ecx, %rcx
> + vpextrd $2, %xmm4, %esi
> + movslq %edx, %rdx
> + movslq %esi, %rsi
> +/* For every lane load both candidates: the entry from the first half
> +   of Tbl_H (-128 displacement) and the one from the second half.  */
> + vmovsd -128(%rax,%rdi), %xmm15
> + vmovsd (%rdi,%rax), %xmm7
> + vmovsd -128(%rcx,%rdi), %xmm5
> + vmovsd (%rdi,%rcx), %xmm9
> + vmovhpd -128(%rdx,%rdi), %xmm15, %xmm15
> + vmovhpd (%rdi,%rdx), %xmm7, %xmm8
> + vmovhpd -128(%rsi,%rdi), %xmm5, %xmm6
> + vmovhpd (%rdi,%rsi), %xmm9, %xmm10
> +
> +/* polynomial evaluation */
> +/* ymm5 = R^2, ymm4 = R^4, ymm6 (below) = R^3 */
> + vmulpd %ymm0, %ymm0, %ymm5
> + vmulpd %ymm5, %ymm5, %ymm4
> +/* ymm11 = first-half entries, ymm12 = second-half entries; ymm13 takes
> +   the second-half entry where the dIndexMed mask (ymm2) is set */
> + vinsertf128 $1, %xmm6, %ymm15, %ymm11
> + vinsertf128 $1, %xmm10, %ymm8, %ymm12
> + vblendvpd %ymm2, %ymm12, %ymm11, %ymm13
> + vmovupd coeff+__svml_datan_data_internal_avx512(%rip), %ymm8
> + vmovupd coeff+64+__svml_datan_data_internal_avx512(%rip), %ymm2
> + vmulpd %ymm5, %ymm0, %ymm6
> +/* the six coefficients are combined pairwise in R^2, then merged with R^4 */
> + vfmadd213pd coeff+32+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm8
> + vfmadd213pd coeff+96+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm2
> +
> +/* set table value to Pi/2 for large X */
> + vblendvpd %ymm3, Pi2+__svml_datan_data_internal_avx512(%rip), %ymm13, %ymm7
> + vmovupd coeff+128+__svml_datan_data_internal_avx512(%rip), %ymm3
> + vfmadd213pd %ymm2, %ymm4, %ymm8
> + vfmadd213pd coeff+160+__svml_datan_data_internal_avx512(%rip), %ymm3, %ymm5
> + vfmadd213pd %ymm5, %ymm4, %ymm8
> +/* ymm8 = R + R^3 * Poly (R^2) */
> + vfmadd213pd %ymm0, %ymm6, %ymm8
> +/* add the table value (or Pi/2), then restore the sign of x */
> + vaddpd %ymm8, %ymm7, %ymm0
> + vxorpd %ymm1, %ymm0, %ymm0
> + ret
> +
> +END(_ZGVdN4v_atan_avx2)
> +
> + .section .rodata, "a"
> + .align 32
> +
> +.FLT_11:
> + .long 0x00000078,0x00000000,0x00000078,0x00000000,0x00000078,0x00000000,0x00000078,0x00000000
> + .type .FLT_11,@object
> + .size .FLT_11,32
> + .align 32
> +
> +#ifdef __svml_datan_data_internal_avx512_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(32)) VUINT32 AbsMask[4][2];
> + __declspec(align(32)) VUINT32 Shifter[4][2];
> + __declspec(align(32)) VUINT32 MaxThreshold[4][2];
> + __declspec(align(32)) VUINT32 MOne[4][2];
> + __declspec(align(32)) VUINT32 One[4][2];
> + __declspec(align(32)) VUINT32 LargeX[4][2];
> + __declspec(align(32)) VUINT32 Zero[4][2];
> + __declspec(align(32)) VUINT32 Tbl_H[32][2];
> + __declspec(align(32)) VUINT32 Tbl_L[32][2];
> + __declspec(align(32)) VUINT32 dIndexMed[4][2];
> + __declspec(align(32)) VUINT32 Pi2[4][2];
> + __declspec(align(32)) VUINT32 Pi2_low[4][2];
> + __declspec(align(32)) VUINT32 coeff[6][4][2];
> + } __svml_datan_data_internal_avx512;
> +#endif
> +__svml_datan_data_internal_avx512:
> + /*== AbsMask ==*/
> + .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
> + /*== Shifter ==*/
> + .align 32
> + .quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
> + /*== MaxThreshold ==*/
> + .align 32
> + .quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
> + /*== MOne ==*/
> + .align 32
> + .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
> + /*== One ==*/
> + .align 32
> + .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
> + /*== LargeX ==*/
> + .align 32
> + .quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
> + /*== Zero ==*/
> + .align 32
> + .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
> + /*== Tbl_H ==*/
> + .align 32
> + .quad 0x0000000000000000, 0x3fcf5b75f92c80dd
> + .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
> + .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
> + .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
> + .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
> + .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
> + .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
> + .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
> + .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
> + .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
> + .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
> + .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
> + .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
> + .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
> + .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
> + .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
> + /*== Tbl_L ==*/
> + .align 32
> + .quad 0x0000000000000000, 0x3c68ab6e3cf7afbd
> + .quad 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
> + .quad 0x3c81a62633145c07, 0x3c80dae13ad18a6b
> + .quad 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
> + .quad 0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
> + .quad 0x3c96254cb03bb199, 0xbc812c77e8a80f5c
> + .quad 0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
> + .quad 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
> + .quad 0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
> + .quad 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
> + .quad 0xbc831151a43b51ca, 0xbc8487d50bceb1a5
> + .quad 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
> + .quad 0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
> + .quad 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
> + .quad 0xbc929c86447928e7, 0xbc8957a7170df016
> + .quad 0xbc7cbe1896221608, 0xbc9fda5797b32a0b
> + /*== dIndexMed ==*/
> + .align 32
> + .quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
> + /*== Pi2 ==*/
> + .align 32
> + .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
> + /*== Pi2_low ==*/
> + .align 32
> + .quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
> + /*== coeff6 ==*/
> + .align 32
> + .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
> + .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
> + .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
> + .quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
> + .quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
> + .quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
> + .align 32
> + .type __svml_datan_data_internal_avx512,@object
> + .size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core-avx2.S
> new file mode 100644
> index 0000000000..723734e10b
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core-avx2.S
> @@ -0,0 +1,20 @@
> +/* AVX2 version of vectorized atan, vector length is 8.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/* Rename _ZGVeN8v_atan for the duration of the include, so whatever
> +   ../svml_d_atan8_core.S defines under that name is emitted here as
> +   _ZGVeN8v_atan_avx2_wrapper.  */
> +#define _ZGVeN8v_atan _ZGVeN8v_atan_avx2_wrapper
> +#include "../svml_d_atan8_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core.c
> new file mode 100644
> index 0000000000..e97a41b6bc
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized atan, vector length is 8.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/* SYMBOL_NAME is defined before the selector header is included;
> +   presumably that header derives REDIRECT_NAME and IFUNC_SELECTOR from
> +   it -- confirm against ifunc-mathvec-avx512-skx.h.  */
> +#define SYMBOL_NAME _ZGVeN8v_atan
> +#include "ifunc-mathvec-avx512-skx.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +/* Only when SHARED is defined: the __GI__ZGVeN8v_atan name is declared
> +   with hidden visibility.  */
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVeN8v_atan, __GI__ZGVeN8v_atan, __redirect__ZGVeN8v_atan)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core_avx512.S
> new file mode 100644
> index 0000000000..fa6cb47308
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan8_core_avx512.S
> @@ -0,0 +1,213 @@
> +/* Function atan vectorized with AVX-512.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
> + * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
> + * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
> + * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
> + * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
> + * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
> + *
> + */
> +
> +/* Byte offsets of the fields of the data table
> +   __svml_datan_data_internal_avx512 (defined in this file's .rodata).
> +   Single-vector constants are spaced 64 bytes apart; Tbl_H spans
> +   256 bytes and coeff_1..coeff_6 are six consecutive 64-byte vectors.
> + */
> +#define AbsMask 0
> +#define Shifter 64
> +#define MaxThreshold 128
> +#define MOne 192
> +#define One 256
> +#define LargeX 320
> +#define Zero 384
> +#define Tbl_H 448
> +#define dIndexMed 704
> +#define Pi2 768
> +#define coeff_1 832
> +#define coeff_2 896
> +#define coeff_3 960
> +#define coeff_4 1024
> +#define coeff_5 1088
> +#define coeff_6 1152
> +
> +#include <sysdep.h>
> +
> + .text
> + .section .text.evex512,"ax",@progbits
> +/* atan of the eight doubles in %zmm0; the result is returned in %zmm0.
> +   Writes %zmm1-%zmm15 and the mask registers %k1 and %k2.  */
> +ENTRY(_ZGVeN8v_atan_skx)
> + vmovups Shifter+__svml_datan_data_internal_avx512(%rip), %zmm4
> + vmovups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %zmm3
> + vmovups One+__svml_datan_data_internal_avx512(%rip), %zmm9
> +
> +/* saturate X range */
> + vmovups LargeX+__svml_datan_data_internal_avx512(%rip), %zmm7
> +/* zmm8 = |x| (AbsMask sits at offset 0 of the table) */
> + vandpd __svml_datan_data_internal_avx512(%rip), %zmm0, %zmm8
> +
> +/* R+Rl = DiffX/Y */
> +/* zmm15 = 1.0, the starting value of the reciprocal error term below */
> + vbroadcastsd .FLT_10(%rip), %zmm15
> +/* zmm2 = |x| + Shifter; used as the permute index for the table lookup */
> + vaddpd {rn-sae}, %zmm4, %zmm8, %zmm2
> +/* zmm1 = x ^ |x| = sign bit of x, re-applied to the result at the end */
> + vxorpd %zmm0, %zmm8, %zmm1
> +/* k2 = lanes with |x| >= MaxThreshold (compare predicate 29, GE_OQ) */
> + vcmppd $29, {sae}, %zmm3, %zmm8, %k2
> +
> +/* round to 2 bits after binary point */
> +/* zmm6 = DiffX, the reduced remainder of |x| */
> + vreducepd $40, {sae}, %zmm8, %zmm6
> +/* zmm5 = (|x| + Shifter) - Shifter, the rounded |x| */
> + vsubpd {rn-sae}, %zmm4, %zmm2, %zmm5
> +
> +/*
> + * if|X|>=MaxThreshold, set DiffX=-1
> + * VMSUB(D, DiffX, LargeMask, Zero, One);
> + */
> + vblendmpd MOne+__svml_datan_data_internal_avx512(%rip), %zmm6, %zmm10{%k2}
> +/* zmm9 = Y = 1.0 + |x| * zmm5 */
> + vfmadd231pd {rn-sae}, %zmm8, %zmm5, %zmm9
> + vmovups dIndexMed+__svml_datan_data_internal_avx512(%rip), %zmm5
> +
> +/* table lookup sequence */
> + vmovups Tbl_H+__svml_datan_data_internal_avx512(%rip), %zmm6
> +/* The division is done on mantissas (zmm14 / zmm3) with the exponent
> +   difference (zmm11 - zmm12) applied afterwards by vscalefpd.  */
> + vgetmantpd $0, {sae}, %zmm10, %zmm14
> + vgetexppd {sae}, %zmm10, %zmm11
> + vmovups coeff_5+__svml_datan_data_internal_avx512(%rip), %zmm10
> +
> +/*
> + * if|X|>=MaxThreshold, set Y=X
> + * VMADD(D, Y, LargeMask, X, Zero);
> + */
> + vminpd {sae}, %zmm8, %zmm7, %zmm9{%k2}
> +/* k1 = lanes whose shifted value is >= dIndexMed (upper table half) */
> + vcmppd $29, {sae}, %zmm5, %zmm2, %k1
> + vmovups Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %zmm7
> + vmovups coeff_1+__svml_datan_data_internal_avx512(%rip), %zmm8
> + vgetmantpd $0, {sae}, %zmm9, %zmm3
> + vgetexppd {sae}, %zmm9, %zmm12
> + vmovups coeff_3+__svml_datan_data_internal_avx512(%rip), %zmm9
> +/* zmm6 = lookup in Tbl_H entries 0-15, zmm7 = lookup in entries 16-31,
> +   both indexed by zmm2 */
> + vpermt2pd Tbl_H+64+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm6
> + vsubpd {rn-sae}, %zmm12, %zmm11, %zmm4
> + vpermt2pd Tbl_H+192+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm7
> +/* zmm13 = approximate reciprocal of the denominator mantissa */
> + vrcp14pd %zmm3, %zmm13
> + vmovups coeff_4+__svml_datan_data_internal_avx512(%rip), %zmm12
> + vmovups coeff_6+__svml_datan_data_internal_avx512(%rip), %zmm11
> +/* zmm2 = table value: upper-half entry where k1 is set, else lower-half */
> + vblendmpd %zmm7, %zmm6, %zmm2{%k1}
> +/* zmm0 = first quotient estimate; zmm15 = e = 1 - denom * rcp;
> +   zmm3 = residual = numer - quotient * denom */
> + vmulpd {rn-sae}, %zmm13, %zmm14, %zmm0
> + vfnmadd231pd {rn-sae}, %zmm3, %zmm13, %zmm15
> + vfnmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm3
> +/* zmm15 = rcp * (1 + e + e^2), the refined reciprocal */
> + vfmadd213pd {rn-sae}, %zmm15, %zmm15, %zmm15
> + vfmadd213pd {rn-sae}, %zmm13, %zmm13, %zmm15
> +/* corrected mantissa quotient, then scaled by the exponent difference */
> + vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm3
> + vscalefpd {rn-sae}, %zmm4, %zmm3, %zmm0
> +
> +/* set table value to Pi/2 for large X */
> + vblendmpd Pi2+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm3{%k2}
> + vmovups coeff_2+__svml_datan_data_internal_avx512(%rip), %zmm2
> +
> +/* polynomial evaluation */
> +/* zmm14 = R^2, zmm13 = R^4, zmm15 = R^3 */
> + vmulpd {rn-sae}, %zmm0, %zmm0, %zmm14
> + vmulpd {rn-sae}, %zmm14, %zmm14, %zmm13
> + vmulpd {rn-sae}, %zmm0, %zmm14, %zmm15
> +/* the six coefficients are combined pairwise in R^2, then merged with R^4 */
> + vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm2
> + vfmadd231pd {rn-sae}, %zmm14, %zmm9, %zmm12
> + vfmadd213pd {rn-sae}, %zmm11, %zmm10, %zmm14
> + vfmadd213pd {rn-sae}, %zmm12, %zmm13, %zmm2
> + vfmadd213pd {rn-sae}, %zmm14, %zmm13, %zmm2
> +/* zmm2 = R + R^3 * Poly (R^2) */
> + vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm2
> +/* add the table value (or Pi/2), then restore the sign of x */
> + vaddpd {rn-sae}, %zmm3, %zmm2, %zmm0
> + vxorpd %zmm1, %zmm0, %zmm0
> + ret
> +
> +END(_ZGVeN8v_atan_skx)
> +
> + .section .rodata, "a"
> + .align 64
> +
> +#ifdef __svml_datan_data_internal_avx512_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(64)) VUINT32 AbsMask[8][2];
> + __declspec(align(64)) VUINT32 Shifter[8][2];
> + __declspec(align(64)) VUINT32 MaxThreshold[8][2];
> + __declspec(align(64)) VUINT32 MOne[8][2];
> + __declspec(align(64)) VUINT32 One[8][2];
> + __declspec(align(64)) VUINT32 LargeX[8][2];
> + __declspec(align(64)) VUINT32 Zero[8][2];
> + __declspec(align(64)) VUINT32 Tbl_H[32][2];
> + __declspec(align(64)) VUINT32 dIndexMed[8][2];
> + __declspec(align(64)) VUINT32 Pi2[8][2];
> + __declspec(align(64)) VUINT32 coeff[6][8][2];
> + } __svml_datan_data_internal_avx512;
> +#endif
> +__svml_datan_data_internal_avx512:
> + /*== AbsMask ==*/
> + .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
> + /*== Shifter ==*/
> + .align 64
> + .quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
> + /*== MaxThreshold ==*/
> + .align 64
> + .quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
> + /*== MOne ==*/
> + .align 64
> + .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
> + /*== One ==*/
> + .align 64
> + .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
> + /*== LargeX ==*/
> + .align 64
> + .quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
> + /*== Zero ==*/
> + .align 64
> + .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
> + /*== Tbl_H ==*/
> + .align 64
> + .quad 0x0000000000000000, 0x3fcf5b75f92c80dd
> + .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
> + .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
> + .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
> + .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
> + .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
> + .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
> + .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
> + .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
> + .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
> + .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
> + .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
> + .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
> + .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
> + .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
> + .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
> + /*== dIndexMed ==*/
> + .align 64
> + .quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
> + /*== Pi2 ==*/
> + .align 64
> + .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
> + /*== coeff6 ==*/
> + .align 64
> + .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
> + .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
> + .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
> + .quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
> + .quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
> + .quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
> + .align 64
> + .type __svml_datan_data_internal_avx512,@object
> + .size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
> + .align 8
> +
> +.FLT_10:
> + .long 0x00000000,0x3ff00000
> + .type .FLT_10,@object
> + .size .FLT_10,8
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core-avx2.S
> new file mode 100644
> index 0000000000..27623cdf16
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core-avx2.S
> @@ -0,0 +1,20 @@
> +/* AVX2 version of vectorized atanf, vector length is 16.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/* Rename _ZGVeN16v_atanf for the duration of the include, so whatever
> +   ../svml_s_atanf16_core.S defines under that name is emitted here as
> +   _ZGVeN16v_atanf_avx2_wrapper.  */
> +#define _ZGVeN16v_atanf _ZGVeN16v_atanf_avx2_wrapper
> +#include "../svml_s_atanf16_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core.c
> new file mode 100644
> index 0000000000..940de26615
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized atanf, vector length is 16.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/* SYMBOL_NAME is defined before the selector header is included;
> +   presumably that header derives REDIRECT_NAME and IFUNC_SELECTOR from
> +   it -- confirm against ifunc-mathvec-avx512-skx.h.  */
> +#define SYMBOL_NAME _ZGVeN16v_atanf
> +#include "ifunc-mathvec-avx512-skx.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +/* Only when SHARED is defined: the __GI__ZGVeN16v_atanf name is declared
> +   with hidden visibility.  */
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVeN16v_atanf, __GI__ZGVeN16v_atanf,
> + __redirect__ZGVeN16v_atanf)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
> new file mode 100644
> index 0000000000..4a37f03e69
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf16_core_avx512.S
> @@ -0,0 +1,174 @@
> +/* Function atanf vectorized with AVX-512.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
> + * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
> + * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
> + * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
> + * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
> + * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
> + *
> + */
> +
> +/* Byte offsets of the fields of the data table
> +   __svml_satan_data_internal_avx512 (defined in this file's .rodata).
> +   Single-vector constants are spaced 64 bytes apart; Tbl_H spans
> +   128 bytes and coeff_1..coeff_3 are three consecutive 64-byte vectors.
> + */
> +#define AbsMask 0
> +#define Shifter 64
> +#define MaxThreshold 128
> +#define MOne 192
> +#define One 256
> +#define LargeX 320
> +#define Zero 384
> +#define Tbl_H 448
> +#define Pi2 576
> +#define coeff_1 640
> +#define coeff_2 704
> +#define coeff_3 768
> +
> +#include <sysdep.h>
> +
> + .text
> +/* Same section name as the other AVX-512 (EVEX) libmvec routines.  */
> + .section .text.evex512,"ax",@progbits
> +/* atanf of the sixteen floats in %zmm0; the result is returned in %zmm0.
> +   Writes %zmm1-%zmm15 and the mask register %k1.  */
> +ENTRY(_ZGVeN16v_atanf_skx)
> +/* zmm7 = |x| (AbsMask sits at offset 0 of the table) */
> + vandps __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
> + vmovups MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
> + vmovups One+__svml_satan_data_internal_avx512(%rip), %zmm8
> +
> +/* round to 2 bits after binary point */
> +/* zmm5 = DiffX, the reduced remainder of |x| */
> + vreduceps $40, {sae}, %zmm7, %zmm5
> +
> +/* saturate X range */
> + vmovups LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
> + vmovups Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
> +/* k1 = lanes with |x| >= MaxThreshold (compare predicate 29, GE_OQ) */
> + vcmpps $29, {sae}, %zmm3, %zmm7, %k1
> +
> +/* table lookup sequence */
> + vmovups Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
> +/* zmm4 = |x| - DiffX, the rounded |x| */
> + vsubps {rn-sae}, %zmm5, %zmm7, %zmm4
> +/* zmm1 = |x| + Shifter; used as the permute index for the table lookup */
> + vaddps {rn-sae}, %zmm2, %zmm7, %zmm1
> +/* zmm0 = x ^ |x| = sign bit of x, re-applied to the result at the end */
> + vxorps %zmm0, %zmm7, %zmm0
> +/* zmm8 = Y = 1.0 + |x| * zmm4 */
> + vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
> + vmovups coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
> +
> +/* if|X|>=MaxThreshold, set DiffX=-1 */
> + vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
> + vmovups coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
> +
> +/* if|X|>=MaxThreshold, set Y=X */
> + vminps {sae}, %zmm7, %zmm6, %zmm8{%k1}
> +
> +/* R+Rl = DiffX/Y */
> +/* The division is done on mantissas (zmm12 / zmm15) with the exponent
> +   difference (zmm10 - zmm11) applied afterwards by vscalefps.  */
> + vgetmantps $0, {sae}, %zmm9, %zmm12
> + vgetexpps {sae}, %zmm9, %zmm10
> +/* zmm3 = lookup in the 32-entry Tbl_H (zmm3 holds entries 0-15, the
> +   memory operand entries 16-31), indexed by zmm1 */
> + vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
> + vgetmantps $0, {sae}, %zmm8, %zmm15
> + vgetexpps {sae}, %zmm8, %zmm11
> + vmovups coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
> +
> +/* set table value to Pi/2 for large X */
> + vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
> +/* zmm13 = approximate reciprocal of the denominator mantissa */
> + vrcp14ps %zmm15, %zmm13
> + vsubps {rn-sae}, %zmm11, %zmm10, %zmm2
> +/* zmm14 = first quotient estimate; zmm15 = residual = numer - q * denom;
> +   then one correction step q + residual * rcp */
> + vmulps {rn-sae}, %zmm13, %zmm12, %zmm14
> + vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
> + vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
> +/* zmm7 = R = mantissa quotient scaled by the exponent difference */
> + vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
> +
> +/* polynomial evaluation */
> +/* zmm8 = R^2, zmm6 = R^3 */
> + vmulps {rn-sae}, %zmm7, %zmm7, %zmm8
> + vmulps {rn-sae}, %zmm7, %zmm8, %zmm6
> +/* zmm8 = (coeff_1 * R^2 + coeff_2) * R^2 + coeff_3 */
> + vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
> + vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
> +/* zmm8 = R + R^3 * Poly (R^2) */
> + vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
> +/* add the table value (or Pi/2), then restore the sign of x */
> + vaddps {rn-sae}, %zmm9, %zmm8, %zmm10
> + vxorps %zmm0, %zmm10, %zmm0
> + ret
> +
> +END(_ZGVeN16v_atanf_skx)
> +
> + .section .rodata, "a"
> + .align 64
> +
> +#ifdef __svml_satan_data_internal_avx512_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(64)) VUINT32 AbsMask[16][1];
> + __declspec(align(64)) VUINT32 Shifter[16][1];
> + __declspec(align(64)) VUINT32 MaxThreshold[16][1];
> + __declspec(align(64)) VUINT32 MOne[16][1];
> + __declspec(align(64)) VUINT32 One[16][1];
> + __declspec(align(64)) VUINT32 LargeX[16][1];
> + __declspec(align(64)) VUINT32 Zero[16][1];
> + __declspec(align(64)) VUINT32 Tbl_H[32][1];
> + __declspec(align(64)) VUINT32 Pi2[16][1];
> + __declspec(align(64)) VUINT32 coeff[3][16][1];
> + } __svml_satan_data_internal_avx512;
> +#endif
> +__svml_satan_data_internal_avx512:
> + /*== AbsMask ==*/
> + .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> + /*== Shifter ==*/
> + .align 64
> + .long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
> + /*== MaxThreshold ==*/
> + .align 64
> + .long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
> + /*== MOne ==*/
> + .align 64
> + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
> + /*== One ==*/
> + .align 64
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> + /*== LargeX ==*/
> + .align 64
> + .long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
> + /*== Zero ==*/
> + .align 64
> + .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
> + /*== Tbl_H ==*/
> + .align 64
> + .long 0x00000000, 0x3e7adbb0
> + .long 0x3eed6338, 0x3f24bc7d
> + .long 0x3f490fdb, 0x3f6563e3
> + .long 0x3f7b985f, 0x3f869c79
> + .long 0x3f8db70d, 0x3f93877b
> + .long 0x3f985b6c, 0x3f9c6b53
> + .long 0x3f9fe0bb, 0x3fa2daa4
> + .long 0x3fa57088, 0x3fa7b46f
> + .long 0x3fa9b465, 0x3fab7b7a
> + .long 0x3fad1283, 0x3fae809e
> + .long 0x3fafcb99, 0x3fb0f836
> + .long 0x3fb20a6a, 0x3fb30581
> + .long 0x3fb3ec43, 0x3fb4c10a
> + .long 0x3fb585d7, 0x3fb63c64
> + .long 0x3fb6e62c, 0x3fb78478
> + .long 0x3fb81868, 0x3fb8a2f5
> + /*== Pi2 ==*/
> + .align 64
> + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
> + /*== coeff3 ==*/
> + .align 64
> + .long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
> + .long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
> + .long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
> + .align 64
> + .type __svml_satan_data_internal_avx512,@object
> + .size __svml_satan_data_internal_avx512,.-__svml_satan_data_internal_avx512
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core-sse2.S
> new file mode 100644
> index 0000000000..fe81170666
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core-sse2.S
> @@ -0,0 +1,20 @@
> +/* SSE2 version of vectorized atanf, vector length is 4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define _ZGVbN4v_atanf _ZGVbN4v_atanf_sse2
> +#include "../svml_s_atanf4_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core.c
> new file mode 100644
> index 0000000000..975ece6812
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized atanf, vector length is 4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define SYMBOL_NAME _ZGVbN4v_atanf
> +#include "ifunc-mathvec-sse4_1.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVbN4v_atanf, __GI__ZGVbN4v_atanf,
> + __redirect__ZGVbN4v_atanf)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
> new file mode 100644
> index 0000000000..c58a894e10
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf4_core_sse4.S
> @@ -0,0 +1,164 @@
> +/* Function atanf vectorized with SSE4.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
> + * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
> + * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
> + * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
> + * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
> + * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
> + *
> + */
> +
> +/* Offsets for data table __svml_satan_data_internal
> + */
> +#define _sSIGN_MASK 0
> +#define _sABS_MASK 16
> +#define _sONE 32
> +#define _sPIO2 48
> +#define _sPC8 64
> +#define _sPC7 80
> +#define _sPC6 96
> +#define _sPC5 112
> +#define _sPC4 128
> +#define _sPC3 144
> +#define _sPC2 160
> +#define _sPC1 176
> +#define _sPC0 192
> +
> +#include <sysdep.h>
> +
> + .text
> + .section .text.sse4,"ax",@progbits
> +ENTRY(_ZGVbN4v_atanf_sse4)
> +/*
> + * To use minps\maxps operations for argument reduction
> + * uncomment _AT_USEMINMAX_ definition
> + * Declarations
> + * Variables
> + * Constants
> + */
> + movups _sABS_MASK+__svml_satan_data_internal(%rip), %xmm2
> +
> +/*
> + * 1) If x>1, then r=-1/x, PIO2=Pi/2
> + * 2) If -1<=x<=1, then r=x, PIO2=0
> + * 3) If x<-1, then r=-1/x, PIO2=-Pi/2
> + */
> + movups _sONE+__svml_satan_data_internal(%rip), %xmm1
> + andps %xmm0, %xmm2
> + movaps %xmm2, %xmm9
> + movaps %xmm1, %xmm3
> + cmpleps %xmm1, %xmm9
> + maxps %xmm2, %xmm3
> + minps %xmm2, %xmm1
> + divps %xmm3, %xmm1
> + movups __svml_satan_data_internal(%rip), %xmm4
> + movaps %xmm9, %xmm10
> + andps %xmm4, %xmm0
> + andnps %xmm4, %xmm9
> + pxor %xmm0, %xmm9
> + pxor %xmm1, %xmm9
> +
> +/* Polynomial. */
> + movaps %xmm9, %xmm8
> + mulps %xmm9, %xmm8
> + movaps %xmm8, %xmm7
> + mulps %xmm8, %xmm7
> + movups _sPC8+__svml_satan_data_internal(%rip), %xmm6
> + mulps %xmm7, %xmm6
> + movups _sPC7+__svml_satan_data_internal(%rip), %xmm5
> + mulps %xmm7, %xmm5
> + addps _sPC6+__svml_satan_data_internal(%rip), %xmm6
> + mulps %xmm7, %xmm6
> + addps _sPC5+__svml_satan_data_internal(%rip), %xmm5
> + mulps %xmm7, %xmm5
> + addps _sPC4+__svml_satan_data_internal(%rip), %xmm6
> + mulps %xmm7, %xmm6
> + addps _sPC3+__svml_satan_data_internal(%rip), %xmm5
> + mulps %xmm5, %xmm7
> + addps _sPC2+__svml_satan_data_internal(%rip), %xmm6
> + mulps %xmm8, %xmm6
> + addps _sPC1+__svml_satan_data_internal(%rip), %xmm7
> + andnps _sPIO2+__svml_satan_data_internal(%rip), %xmm10
> + addps %xmm6, %xmm7
> + mulps %xmm7, %xmm8
> + pxor %xmm0, %xmm10
> + addps _sPC0+__svml_satan_data_internal(%rip), %xmm8
> +
> +/* Reconstruction. */
> + mulps %xmm8, %xmm9
> + addps %xmm9, %xmm10
> + movaps %xmm10, %xmm0
> + ret
> +
> +END(_ZGVbN4v_atanf_sse4)
> +
> + .section .rodata, "a"
> + .align 16
> +
> +#ifdef __svml_satan_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(16)) VUINT32 _sSIGN_MASK[4][1];
> + __declspec(align(16)) VUINT32 _sABS_MASK[4][1];
> + __declspec(align(16)) VUINT32 _sONE[4][1];
> + __declspec(align(16)) VUINT32 _sPIO2[4][1];
> + __declspec(align(16)) VUINT32 _sPC8[4][1];
> + __declspec(align(16)) VUINT32 _sPC7[4][1];
> + __declspec(align(16)) VUINT32 _sPC6[4][1];
> + __declspec(align(16)) VUINT32 _sPC5[4][1];
> + __declspec(align(16)) VUINT32 _sPC4[4][1];
> + __declspec(align(16)) VUINT32 _sPC3[4][1];
> + __declspec(align(16)) VUINT32 _sPC2[4][1];
> + __declspec(align(16)) VUINT32 _sPC1[4][1];
> + __declspec(align(16)) VUINT32 _sPC0[4][1];
> +} __svml_satan_data_internal;
> +#endif
> +__svml_satan_data_internal:
> + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 //_sSIGN_MASK
> + .align 16
> + .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF //_sABS_MASK
> + .align 16
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sONE
> + .align 16
> + .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB //_sPIO2
> + .align 16
> + .long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 //_sPC8
> + .align 16
> + .long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 //_sPC7
> + .align 16
> + .long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 //_sPC6
> + .align 16
> + .long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 //_sPC5
> + .align 16
> + .long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 //_sPC4
> + .align 16
> + .long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 //_sPC3
> + .align 16
> + .long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F //_sPC2
> + .align 16
> + .long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 //_sPC1
> + .align 16
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sPC0
> + .align 16
> + .type __svml_satan_data_internal,@object
> + .size __svml_satan_data_internal,.-__svml_satan_data_internal
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core-sse.S
> new file mode 100644
> index 0000000000..1652a8f5c6
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core-sse.S
> @@ -0,0 +1,20 @@
> +/* SSE version of vectorized atanf, vector length is 8.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define _ZGVdN8v_atanf _ZGVdN8v_atanf_sse_wrapper
> +#include "../svml_s_atanf8_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core.c
> new file mode 100644
> index 0000000000..733d8c3bc3
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized atanf, vector length is 8.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define SYMBOL_NAME _ZGVdN8v_atanf
> +#include "ifunc-mathvec-avx2.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVdN8v_atanf, __GI__ZGVdN8v_atanf,
> + __redirect__ZGVdN8v_atanf)
> + __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S
> new file mode 100644
> index 0000000000..e333f979c4
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanf8_core_avx2.S
> @@ -0,0 +1,148 @@
> +/* Function atanf vectorized with AVX2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
> + * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
> + * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
> + * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
> + * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
> + * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
> + *
> + */
> +
> +/* Offsets for data table __svml_satan_data_internal
> + */
> +#define _sSIGN_MASK 0
> +#define _sABS_MASK 32
> +#define _sONE 64
> +#define _sPIO2 96
> +#define _sPC8 128
> +#define _sPC7 160
> +#define _sPC6 192
> +#define _sPC5 224
> +#define _sPC4 256
> +#define _sPC3 288
> +#define _sPC2 320
> +#define _sPC1 352
> +#define _sPC0 384
> +
> +#include <sysdep.h>
> +
> + .text
> + .section .text.avx2,"ax",@progbits
> +ENTRY(_ZGVdN8v_atanf_avx2)
> +/*
> + * 1) If x>1, then r=-1/x, PIO2=Pi/2
> + * 2) If -1<=x<=1, then r=x, PIO2=0
> + * 3) If x<-1, then r=-1/x, PIO2=-Pi/2
> + */
> + vmovups _sONE+__svml_satan_data_internal(%rip), %ymm2
> + vmovups __svml_satan_data_internal(%rip), %ymm7
> + vmovups _sPC7+__svml_satan_data_internal(%rip), %ymm13
> +
> +/*
> + * To use minps\maxps operations for argument reduction
> + * uncomment _AT_USEMINMAX_ definition
> + * Declarations
> + * Variables
> + * Constants
> + */
> + vandps _sABS_MASK+__svml_satan_data_internal(%rip), %ymm0, %ymm3
> + vmaxps %ymm3, %ymm2, %ymm5
> + vminps %ymm3, %ymm2, %ymm4
> + vcmple_oqps %ymm2, %ymm3, %ymm6
> + vdivps %ymm5, %ymm4, %ymm11
> + vandps %ymm7, %ymm0, %ymm9
> + vandnps %ymm7, %ymm6, %ymm8
> + vxorps %ymm9, %ymm8, %ymm10
> + vxorps %ymm11, %ymm10, %ymm15
> +
> +/* Polynomial. */
> + vmulps %ymm15, %ymm15, %ymm14
> + vmovups _sPC8+__svml_satan_data_internal(%rip), %ymm0
> + vmulps %ymm14, %ymm14, %ymm12
> + vfmadd213ps _sPC6+__svml_satan_data_internal(%rip), %ymm12, %ymm0
> + vfmadd213ps _sPC5+__svml_satan_data_internal(%rip), %ymm12, %ymm13
> + vfmadd213ps _sPC4+__svml_satan_data_internal(%rip), %ymm12, %ymm0
> + vfmadd213ps _sPC3+__svml_satan_data_internal(%rip), %ymm12, %ymm13
> + vfmadd213ps _sPC2+__svml_satan_data_internal(%rip), %ymm12, %ymm0
> + vfmadd213ps _sPC1+__svml_satan_data_internal(%rip), %ymm12, %ymm13
> + vfmadd213ps %ymm13, %ymm14, %ymm0
> + vfmadd213ps _sPC0+__svml_satan_data_internal(%rip), %ymm14, %ymm0
> + vandnps _sPIO2+__svml_satan_data_internal(%rip), %ymm6, %ymm1
> + vxorps %ymm9, %ymm1, %ymm1
> +
> +/* Reconstruction. */
> + vfmadd213ps %ymm1, %ymm15, %ymm0
> + ret
> +
> +END(_ZGVdN8v_atanf_avx2)
> +
> + .section .rodata, "a"
> + .align 32
> +
> +#ifdef __svml_satan_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> + __declspec(align(32)) VUINT32 _sSIGN_MASK[8][1];
> + __declspec(align(32)) VUINT32 _sABS_MASK[8][1];
> + __declspec(align(32)) VUINT32 _sONE[8][1];
> + __declspec(align(32)) VUINT32 _sPIO2[8][1];
> + __declspec(align(32)) VUINT32 _sPC8[8][1];
> + __declspec(align(32)) VUINT32 _sPC7[8][1];
> + __declspec(align(32)) VUINT32 _sPC6[8][1];
> + __declspec(align(32)) VUINT32 _sPC5[8][1];
> + __declspec(align(32)) VUINT32 _sPC4[8][1];
> + __declspec(align(32)) VUINT32 _sPC3[8][1];
> + __declspec(align(32)) VUINT32 _sPC2[8][1];
> + __declspec(align(32)) VUINT32 _sPC1[8][1];
> + __declspec(align(32)) VUINT32 _sPC0[8][1];
> +} __svml_satan_data_internal;
> +#endif
> +__svml_satan_data_internal:
> + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 //_sSIGN_MASK
> + .align 32
> + .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF //_sABS_MASK
> + .align 32
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sONE
> + .align 32
> + .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB //_sPIO2
> + .align 32
> + .long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 //_sPC8
> + .align 32
> + .long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 //_sPC7
> + .align 32
> + .long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 //_sPC6
> + .align 32
> + .long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 //_sPC5
> + .align 32
> + .long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 //_sPC4
> + .align 32
> + .long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 //_sPC3
> + .align 32
> + .long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F //_sPC2
> + .align 32
> + .long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 //_sPC1
> + .align 32
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sPC0
> + .align 32
> + .type __svml_satan_data_internal,@object
> + .size __svml_satan_data_internal,.-__svml_satan_data_internal
> diff --git a/sysdeps/x86_64/fpu/svml_d_atan2_core.S b/sysdeps/x86_64/fpu/svml_d_atan2_core.S
> new file mode 100644
> index 0000000000..e86d5b7047
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_atan2_core.S
> @@ -0,0 +1,29 @@
> +/* Function atan vectorized with SSE2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVbN2v_atan)
> +WRAPPER_IMPL_SSE2 atan
> +END (_ZGVbN2v_atan)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVbN2v_atan)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_d_atan4_core.S b/sysdeps/x86_64/fpu/svml_d_atan4_core.S
> new file mode 100644
> index 0000000000..eb11fd2f17
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_atan4_core.S
> @@ -0,0 +1,29 @@
> +/* Function atan vectorized with AVX2, wrapper version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVdN4v_atan)
> +WRAPPER_IMPL_AVX _ZGVbN2v_atan
> +END (_ZGVdN4v_atan)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVdN4v_atan)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_d_atan4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_atan4_core_avx.S
> new file mode 100644
> index 0000000000..b83a4be33d
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_atan4_core_avx.S
> @@ -0,0 +1,25 @@
> +/* Function atan vectorized in AVX ISA as wrapper to SSE4 ISA version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVcN4v_atan)
> +WRAPPER_IMPL_AVX _ZGVbN2v_atan
> +END (_ZGVcN4v_atan)
> diff --git a/sysdeps/x86_64/fpu/svml_d_atan8_core.S b/sysdeps/x86_64/fpu/svml_d_atan8_core.S
> new file mode 100644
> index 0000000000..9685a32bdc
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_atan8_core.S
> @@ -0,0 +1,25 @@
> +/* Function atan vectorized with AVX-512, wrapper to AVX2.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVeN8v_atan)
> +WRAPPER_IMPL_AVX512 _ZGVdN4v_atan
> +END (_ZGVeN8v_atan)
> diff --git a/sysdeps/x86_64/fpu/svml_s_atanf16_core.S b/sysdeps/x86_64/fpu/svml_s_atanf16_core.S
> new file mode 100644
> index 0000000000..f82d2422ae
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_atanf16_core.S
> @@ -0,0 +1,25 @@
> +/* Function atanf vectorized with AVX-512. Wrapper to AVX2 version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVeN16v_atanf)
> +WRAPPER_IMPL_AVX512 _ZGVdN8v_atanf
> +END (_ZGVeN16v_atanf)
> diff --git a/sysdeps/x86_64/fpu/svml_s_atanf4_core.S b/sysdeps/x86_64/fpu/svml_s_atanf4_core.S
> new file mode 100644
> index 0000000000..6b8c4d9624
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_atanf4_core.S
> @@ -0,0 +1,29 @@
> +/* Function atanf vectorized with SSE2, wrapper version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVbN4v_atanf)
> +WRAPPER_IMPL_SSE2 atanf
> +END (_ZGVbN4v_atanf)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVbN4v_atanf)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_s_atanf8_core.S b/sysdeps/x86_64/fpu/svml_s_atanf8_core.S
> new file mode 100644
> index 0000000000..315681f6c0
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_atanf8_core.S
> @@ -0,0 +1,29 @@
> +/* Function atanf vectorized with AVX2, wrapper version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVdN8v_atanf)
> +WRAPPER_IMPL_AVX _ZGVbN4v_atanf
> +END (_ZGVdN8v_atanf)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVdN8v_atanf)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_s_atanf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_atanf8_core_avx.S
> new file mode 100644
> index 0000000000..b9cd502186
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_atanf8_core_avx.S
> @@ -0,0 +1,25 @@
> +/* Function atanf vectorized in AVX ISA as wrapper to SSE4 ISA version.
> + Copyright (C) 2021 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> + .text
> +ENTRY (_ZGVcN8v_atanf)
> +WRAPPER_IMPL_AVX _ZGVbN4v_atanf
> +END (_ZGVcN8v_atanf)
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-atan-avx.c b/sysdeps/x86_64/fpu/test-double-libmvec-atan-avx.c
> new file mode 100644
> index 0000000000..0f7176a20b
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-atan-avx.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-atan.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-atan-avx2.c b/sysdeps/x86_64/fpu/test-double-libmvec-atan-avx2.c
> new file mode 100644
> index 0000000000..0f7176a20b
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-atan-avx2.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-atan.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-atan-avx512f.c b/sysdeps/x86_64/fpu/test-double-libmvec-atan-avx512f.c
> new file mode 100644
> index 0000000000..0f7176a20b
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-atan-avx512f.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-atan.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-atan.c b/sysdeps/x86_64/fpu/test-double-libmvec-atan.c
> new file mode 100644
> index 0000000000..982687b169
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-atan.c
> @@ -0,0 +1,3 @@
> +#define LIBMVEC_TYPE double
> +#define LIBMVEC_FUNC atan
> +#include "test-vector-abi-arg1.h"
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> index 0abc7d2021..467c913990 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> @@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
> VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
> VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
> +VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVbN2v_atan)
>
> #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> index dda093b914..b72a7de84e 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> @@ -31,6 +31,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
> VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
> VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
> +VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVdN4v_atan)
>
> #ifndef __ILP32__
> # define VEC_INT_TYPE __m256i
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> index f3230463bb..d2434df21e 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> @@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
> VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
> VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
> +VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVcN4v_atan)
>
> #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> index cf9f52faf0..f7aaf8159e 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> @@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
> VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
> VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
> +VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVeN8v_atan)
>
> #ifndef __ILP32__
> # define VEC_INT_TYPE __m512i
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx.c b/sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx.c
> new file mode 100644
> index 0000000000..9251c65f8a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-atanf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx2.c b/sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx2.c
> new file mode 100644
> index 0000000000..9251c65f8a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx2.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-atanf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx512f.c b/sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx512f.c
> new file mode 100644
> index 0000000000..9251c65f8a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-atanf-avx512f.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-atanf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-atanf.c b/sysdeps/x86_64/fpu/test-float-libmvec-atanf.c
> new file mode 100644
> index 0000000000..2a8ab87e86
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-atanf.c
> @@ -0,0 +1,3 @@
> +#define LIBMVEC_TYPE float
> +#define LIBMVEC_FUNC atanf
> +#include "test-vector-abi-arg1.h"
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> index abbd3ed870..af769c56fa 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> @@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
> VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
> VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
> +VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVeN16v_atanf)
>
> #define VEC_INT_TYPE __m512i
>
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> index 8a24027952..76e61d2f1e 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> @@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
> VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
> VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
> +VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVbN4v_atanf)
>
> #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> index aff0442606..5e27eaaf29 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> @@ -31,6 +31,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
> VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
> VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
> +VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVdN8v_atanf)
>
> /* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
> #undef VECTOR_WRAPPER_fFF
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> index 913584d111..28daf79aa9 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> @@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
> VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
> VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
> VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
> +VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVcN8v_atanf)
>
> #define VEC_INT_TYPE __m128i
>
> --
> 2.31.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
H.J.
@@ -109,4 +109,15 @@
#define __DECL_SIMD_acosf32x
#define __DECL_SIMD_acosf64x
#define __DECL_SIMD_acosf128x
+
+#define __DECL_SIMD_atan
+#define __DECL_SIMD_atanf
+#define __DECL_SIMD_atanl
+#define __DECL_SIMD_atanf16
+#define __DECL_SIMD_atanf32
+#define __DECL_SIMD_atanf64
+#define __DECL_SIMD_atanf128
+#define __DECL_SIMD_atanf32x
+#define __DECL_SIMD_atanf64x
+#define __DECL_SIMD_atanf128x
#endif
@@ -54,7 +54,7 @@ __MATHCALL_VEC (acos,, (_Mdouble_ __x));
/* Arc sine of X. */
__MATHCALL (asin,, (_Mdouble_ __x));
/* Arc tangent of X. */
-__MATHCALL (atan,, (_Mdouble_ __x));
+__MATHCALL_VEC (atan,, (_Mdouble_ __x));
/* Arc tangent of Y/X. */
__MATHCALL (atan2,, (_Mdouble_ __y, _Mdouble_ __x));
@@ -47,10 +47,18 @@ GLIBC_2.22 _ZGVeN8v_sin F
GLIBC_2.22 _ZGVeN8vv_pow F
GLIBC_2.22 _ZGVeN8vvv_sincos F
GLIBC_2.35 _ZGVbN2v_acos F
+GLIBC_2.35 _ZGVbN2v_atan F
GLIBC_2.35 _ZGVbN4v_acosf F
+GLIBC_2.35 _ZGVbN4v_atanf F
GLIBC_2.35 _ZGVcN4v_acos F
+GLIBC_2.35 _ZGVcN4v_atan F
GLIBC_2.35 _ZGVcN8v_acosf F
+GLIBC_2.35 _ZGVcN8v_atanf F
GLIBC_2.35 _ZGVdN4v_acos F
+GLIBC_2.35 _ZGVdN4v_atan F
GLIBC_2.35 _ZGVdN8v_acosf F
+GLIBC_2.35 _ZGVdN8v_atanf F
GLIBC_2.35 _ZGVeN16v_acosf F
+GLIBC_2.35 _ZGVeN16v_atanf F
GLIBC_2.35 _ZGVeN8v_acos F
+GLIBC_2.35 _ZGVeN8v_atan F
@@ -62,6 +62,10 @@
# define __DECL_SIMD_acos __DECL_SIMD_x86_64
# undef __DECL_SIMD_acosf
# define __DECL_SIMD_acosf __DECL_SIMD_x86_64
+# undef __DECL_SIMD_atan
+# define __DECL_SIMD_atan __DECL_SIMD_x86_64
+# undef __DECL_SIMD_atanf
+# define __DECL_SIMD_atanf __DECL_SIMD_x86_64
# endif
#endif
@@ -30,6 +30,8 @@
!GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
+!GCC$ builtin (atan) attributes simd (notinbranch) if('x86_64')
+!GCC$ builtin (atanf) attributes simd (notinbranch) if('x86_64')
!GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@@ -45,3 +47,5 @@
!GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
!GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
+!GCC$ builtin (atan) attributes simd (notinbranch) if('x32')
+!GCC$ builtin (atanf) attributes simd (notinbranch) if('x32')
@@ -23,6 +23,7 @@ postclean-generated += libmvec.mk
# Define for both math and mathvec directories.
libmvec-funcs = \
acos \
+ atan \
cos \
exp \
log \
@@ -15,6 +15,8 @@ libmvec {
}
GLIBC_2.35 {
_ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
+ _ZGVbN2v_atan; _ZGVcN4v_atan; _ZGVdN4v_atan; _ZGVeN8v_atan;
_ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
+ _ZGVbN4v_atanf; _ZGVcN8v_atanf; _ZGVdN8v_atanf; _ZGVeN16v_atanf;
}
}
@@ -164,6 +164,26 @@ float: 2
float128: 2
ldouble: 1
+Function: "atan_vlen16":
+float: 1
+
+Function: "atan_vlen2":
+double: 1
+
+Function: "atan_vlen4":
+double: 1
+float: 1
+
+Function: "atan_vlen4_avx2":
+double: 1
+
+Function: "atan_vlen8":
+double: 1
+float: 1
+
+Function: "atan_vlen8_avx2":
+float: 1
+
Function: "atanh":
double: 2
float: 2
new file mode 100644
@@ -0,0 +1,20 @@
+/* SSE2 version of vectorized atan, vector length is 2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVbN2v_atan _ZGVbN2v_atan_sse2
+#include "../svml_d_atan2_core.S"
new file mode 100644
@@ -0,0 +1,27 @@
+/* Multiple versions of vectorized atan, vector length is 2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVbN2v_atan /* Symbol this file dispatches via ifunc.  */
+#include "ifunc-mathvec-sse4_1.h" /* NOTE(review): presumably supplies REDIRECT_NAME and IFUNC_SELECTOR, which are not defined here -- confirm.  */
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED /* Shared builds only: hidden __GI_ alias of the symbol.  */
+__hidden_ver1 (_ZGVbN2v_atan, __GI__ZGVbN2v_atan, __redirect__ZGVbN2v_atan)
+ __attribute__ ((visibility ("hidden")));
+#endif /* SHARED */
new file mode 100644
@@ -0,0 +1,245 @@
+/* Function atan vectorized with SSE4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
+ * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
+ * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
+ * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
+ * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
+ * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
+ *
+ */
+
+/* Offsets for data table __svml_datan_data_internal_avx512
+ */
+#define AbsMask 0
+#define Shifter 16
+#define MaxThreshold 32
+#define MOne 48
+#define One 64
+#define LargeX 80
+#define Zero 96
+#define Tbl_H 112
+#define Tbl_L 368
+#define dIndexMed 624
+#define Pi2 640
+#define Pi2_low 656
+#define coeff 672
+
+#include <sysdep.h>
+
+ .text
+ .section .text.sse4,"ax",@progbits
+ENTRY(_ZGVbN2v_atan_sse4) /* atan on two packed doubles: argument and result in %xmm0.  */
+ lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rcx /* %rcx = Tbl_H + 128 bytes, i.e. entry 16 of Tbl_H */
+ movups __svml_datan_data_internal_avx512(%rip), %xmm4 /* AbsMask (offset 0) */
+ movups Shifter+__svml_datan_data_internal_avx512(%rip), %xmm3
+ andps %xmm0, %xmm4 /* %xmm4 = |x| */
+ movaps %xmm3, %xmm12
+ movaps %xmm4, %xmm5
+ addpd %xmm4, %xmm12 /* %xmm12 = |x| + Shifter; table index ends up in the low bits */
+ movaps %xmm12, %xmm7
+
+/*
+ * table lookup sequence
+ * VPERMUTE not available, so each lane's table entry is loaded
+ * separately through a general-purpose register (see movsd/movhpd below)
+ */
+ movaps %xmm12, %xmm10
+ subpd %xmm3, %xmm7 /* %xmm7 = (|x| + Shifter) - Shifter, the rounded |x| */
+ subpd %xmm7, %xmm5 /* %xmm5 = |x| - rounded |x| */
+ mulpd %xmm4, %xmm7 /* %xmm7 = |x| * rounded |x| */
+ movups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %xmm2
+ psllq $3, %xmm10 /* shift index bits left by 3; masked with .FLT_11 below */
+
+/* saturate X range */
+ movups LargeX+__svml_datan_data_internal_avx512(%rip), %xmm8
+ pxor %xmm4, %xmm0 /* %xmm0 = x ^ |x|: only the sign bit of x is left */
+ cmplepd %xmm4, %xmm2 /* %xmm2 = lane mask where MaxThreshold <= |x| */
+ addpd One+__svml_datan_data_internal_avx512(%rip), %xmm7 /* %xmm7 = One + |x| * rounded |x| */
+ minpd %xmm4, %xmm8 /* %xmm8 = min (LargeX, |x|) */
+ movups MOne+__svml_datan_data_internal_avx512(%rip), %xmm6
+ movaps %xmm2, %xmm1
+ movaps %xmm2, %xmm9
+ andnps %xmm5, %xmm1 /* numerator for lanes below MaxThreshold */
+ andps %xmm2, %xmm6 /* MOne for lanes at or above MaxThreshold */
+ andnps %xmm7, %xmm9 /* denominator for lanes below MaxThreshold */
+ andps %xmm2, %xmm8 /* saturated |x| for lanes at or above MaxThreshold */
+ orps %xmm6, %xmm1 /* %xmm1 = selected numerator */
+ orps %xmm8, %xmm9 /* %xmm9 = selected denominator */
+
+/* R+Rl = DiffX/Y */
+ divpd %xmm9, %xmm1 /* %xmm1 = R = numerator / denominator */
+ pand .FLT_11(%rip), %xmm10 /* keep 0x78: byte offset of a double within a 16-entry half table */
+
+/* set table value to Pi/2 for large X */
+ movups Pi2+__svml_datan_data_internal_avx512(%rip), %xmm4
+ movd %xmm10, %eax /* %eax = byte offset for lane 0 */
+ andps %xmm2, %xmm4 /* Pi2 only in lanes at or above MaxThreshold */
+ pshufd $2, %xmm10, %xmm11 /* bring lane 1's low dword to position 0 */
+ movaps %xmm2, %xmm10 /* %xmm10 = copy of the large-|x| mask */
+
+/* polynomial evaluation */
+ movaps %xmm1, %xmm2
+ mulpd %xmm1, %xmm2 /* %xmm2 = R^2 */
+ movd %xmm11, %edx /* %edx = byte offset for lane 1 */
+ movups coeff+__svml_datan_data_internal_avx512(%rip), %xmm5
+ movaps %xmm2, %xmm7
+ movups coeff+32+__svml_datan_data_internal_avx512(%rip), %xmm6
+ movaps %xmm2, %xmm9
+ mulpd %xmm2, %xmm5
+ mulpd %xmm2, %xmm7 /* %xmm7 = R^4 */
+ addpd coeff+16+__svml_datan_data_internal_avx512(%rip), %xmm5
+ mulpd %xmm2, %xmm6
+ mulpd %xmm7, %xmm5
+ addpd coeff+48+__svml_datan_data_internal_avx512(%rip), %xmm6
+ mulpd %xmm1, %xmm9 /* %xmm9 = R^3 */
+ addpd %xmm5, %xmm6
+ movups coeff+64+__svml_datan_data_internal_avx512(%rip), %xmm8
+ mulpd %xmm2, %xmm8
+ mulpd %xmm6, %xmm7
+ addpd coeff+80+__svml_datan_data_internal_avx512(%rip), %xmm8
+ addpd %xmm7, %xmm8 /* %xmm8 = polynomial in R^2 */
+ mulpd %xmm8, %xmm9 /* %xmm9 = R^3 * polynomial */
+ movups dIndexMed+__svml_datan_data_internal_avx512(%rip), %xmm14
+ cmplepd %xmm12, %xmm14 /* %xmm14 = lane mask where dIndexMed <= |x| + Shifter */
+ addpd %xmm9, %xmm1 /* %xmm1 = R + R^3 * polynomial */
+ movslq %eax, %rax
+ movaps %xmm14, %xmm3
+ movslq %edx, %rdx
+ movsd -128(%rax,%rcx), %xmm13 /* lane 0 from Tbl_H entries 0..15 */
+ movsd (%rcx,%rax), %xmm15 /* lane 0 from Tbl_H entries 16..31 */
+ movhpd -128(%rdx,%rcx), %xmm13 /* lane 1 from Tbl_H entries 0..15 */
+ movhpd (%rcx,%rdx), %xmm15 /* lane 1 from Tbl_H entries 16..31 */
+ andnps %xmm13, %xmm3
+ andps %xmm14, %xmm15
+ orps %xmm15, %xmm3 /* %xmm3 = table value picked by the dIndexMed mask */
+ andnps %xmm3, %xmm10 /* drop the table value in large-|x| lanes */
+ orps %xmm4, %xmm10 /* ... and use Pi2 there instead */
+ addpd %xmm1, %xmm10 /* table value (or Pi2) + polynomial result */
+ pxor %xmm10, %xmm0 /* reapply the sign of x */
+ ret
+
+END(_ZGVbN2v_atan_sse4)
+
+ .section .rodata, "a"
+ .align 16
+
+#ifdef __svml_datan_data_internal_avx512_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(16)) VUINT32 AbsMask[2][2];
+ __declspec(align(16)) VUINT32 Shifter[2][2];
+ __declspec(align(16)) VUINT32 MaxThreshold[2][2];
+ __declspec(align(16)) VUINT32 MOne[2][2];
+ __declspec(align(16)) VUINT32 One[2][2];
+ __declspec(align(16)) VUINT32 LargeX[2][2];
+ __declspec(align(16)) VUINT32 Zero[2][2];
+ __declspec(align(16)) VUINT32 Tbl_H[32][2];
+ __declspec(align(16)) VUINT32 Tbl_L[32][2];
+ __declspec(align(16)) VUINT32 dIndexMed[2][2];
+ __declspec(align(16)) VUINT32 Pi2[2][2];
+ __declspec(align(16)) VUINT32 Pi2_low[2][2];
+ __declspec(align(16)) VUINT32 coeff[6][2][2];
+ } __svml_datan_data_internal_avx512;
+#endif
+__svml_datan_data_internal_avx512:
+ /*== AbsMask ==*/
+ .quad 0x7fffffffffffffff, 0x7fffffffffffffff
+ /*== Shifter ==*/
+ .align 16
+ .quad 0x4318000000000000, 0x4318000000000000
+ /*== MaxThreshold ==*/
+ .align 16
+ .quad 0x401f800000000000, 0x401f800000000000
+ /*== MOne ==*/
+ .align 16
+ .quad 0xbff0000000000000, 0xbff0000000000000
+ /*== One ==*/
+ .align 16
+ .quad 0x3ff0000000000000, 0x3ff0000000000000
+ /*== LargeX ==*/
+ .align 16
+ .quad 0x47f0000000000000, 0x47f0000000000000
+ /*== Zero ==*/
+ .align 16
+ .quad 0x0000000000000000, 0x0000000000000000
+ /*== Tbl_H ==*/
+ .align 16
+ .quad 0x0000000000000000, 0x3fcf5b75f92c80dd
+ .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
+ .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
+ .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
+ .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
+ .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
+ .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
+ .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
+ .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
+ .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
+ .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
+ .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
+ .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
+ .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
+ .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
+ .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
+ /*== Tbl_L ==*/
+ .align 16
+ .quad 0x0000000000000000, 0x3c68ab6e3cf7afbd
+ .quad 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
+ .quad 0x3c81a62633145c07, 0x3c80dae13ad18a6b
+ .quad 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
+ .quad 0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
+ .quad 0x3c96254cb03bb199, 0xbc812c77e8a80f5c
+ .quad 0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
+ .quad 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
+ .quad 0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
+ .quad 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
+ .quad 0xbc831151a43b51ca, 0xbc8487d50bceb1a5
+ .quad 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
+ .quad 0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
+ .quad 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
+ .quad 0xbc929c86447928e7, 0xbc8957a7170df016
+ .quad 0xbc7cbe1896221608, 0xbc9fda5797b32a0b
+ /*== dIndexMed ==*/
+ .align 16
+ .quad 0x4318000000000010, 0x4318000000000010
+ /*== Pi2 ==*/
+ .align 16
+ .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
+ /*== Pi2_low ==*/
+ .align 16
+ .quad 0x3c91a62633145c07, 0x3c91a62633145c07
+ /*== coeff6 ==*/
+ .align 16
+ .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
+ .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc
+ .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
+ .quad 0xbfc249248eef04da, 0xbfc249248eef04da
+ .quad 0x3fc999999998741e, 0x3fc999999998741e
+ .quad 0xbfd555555555554d, 0xbfd555555555554d
+ .align 16
+ .type __svml_datan_data_internal_avx512,@object
+ .size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
+ .align 16
+
+.FLT_11:
+ .long 0x00000078,0x00000000,0x00000078,0x00000000
+ .type .FLT_11,@object
+ .size .FLT_11,16
new file mode 100644
@@ -0,0 +1,20 @@
+/* SSE version of vectorized atan, vector length is 4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVdN4v_atan _ZGVdN4v_atan_sse_wrapper
+#include "../svml_d_atan4_core.S"
new file mode 100644
@@ -0,0 +1,27 @@
+/* Multiple versions of vectorized atan, vector length is 4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVdN4v_atan /* Symbol this file dispatches via ifunc.  */
+#include "ifunc-mathvec-avx2.h" /* NOTE(review): presumably supplies REDIRECT_NAME and IFUNC_SELECTOR, which are not defined here -- confirm.  */
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED /* Shared builds only: hidden __GI_ alias of the symbol.  */
+__hidden_ver1 (_ZGVdN4v_atan, __GI__ZGVdN4v_atan, __redirect__ZGVdN4v_atan)
+ __attribute__ ((visibility ("hidden")));
+#endif /* SHARED */
new file mode 100644
@@ -0,0 +1,225 @@
+/* Function atan vectorized with AVX2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
+ * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
+ * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
+ * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
+ * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
+ * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
+ *
+ */
+
+/* Offsets for data table __svml_datan_data_internal_avx512
+ */
+#define AbsMask 0
+#define Shifter 32
+#define MaxThreshold 64
+#define MOne 96
+#define One 128
+#define LargeX 160
+#define Zero 192
+#define Tbl_H 224
+#define Tbl_L 480
+#define dIndexMed 736
+#define Pi2 768
+#define Pi2_low 800
+#define coeff 832
+
+#include <sysdep.h>
+
+ .text
+ .section .text.avx2,"ax",@progbits
+ENTRY(_ZGVdN4v_atan_avx2) /* atan on four packed doubles: argument and result in %ymm0.  */
+ lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rdi /* %rdi = Tbl_H + 128 bytes, i.e. entry 16 of Tbl_H */
+ vmovupd Shifter+__svml_datan_data_internal_avx512(%rip), %ymm4
+ vmovupd One+__svml_datan_data_internal_avx512(%rip), %ymm9
+
+/* saturate X range */
+ vmovupd LargeX+__svml_datan_data_internal_avx512(%rip), %ymm6
+ vandpd __svml_datan_data_internal_avx512(%rip), %ymm0, %ymm7 /* %ymm7 = |x| (AbsMask is at offset 0) */
+ vaddpd %ymm4, %ymm7, %ymm2 /* %ymm2 = |x| + Shifter; table index ends up in the low bits */
+ vcmpge_oqpd MaxThreshold+__svml_datan_data_internal_avx512(%rip), %ymm7, %ymm3 /* %ymm3 = lane mask where |x| >= MaxThreshold */
+ vminpd %ymm7, %ymm6, %ymm10 /* %ymm10 = min (LargeX, |x|) */
+ vsubpd %ymm4, %ymm2, %ymm5 /* %ymm5 = (|x| + Shifter) - Shifter, the rounded |x| */
+
+/*
+ * table lookup sequence
+ * VPERMUTE not available, so each lane's table entry is loaded
+ * separately through a general-purpose register (see vmovsd/vmovhpd below)
+ */
+ vpsllq $3, %ymm2, %ymm13 /* shift index bits left by 3; masked with .FLT_11 below */
+ vsubpd %ymm5, %ymm7, %ymm8 /* %ymm8 = |x| - rounded |x| */
+ vcmpge_oqpd dIndexMed+__svml_datan_data_internal_avx512(%rip), %ymm2, %ymm2 /* %ymm2 = lane mask where |x| + Shifter >= dIndexMed */
+ vfmadd231pd %ymm7, %ymm5, %ymm9 /* %ymm9 = One + rounded |x| * |x| */
+ vpand .FLT_11(%rip), %ymm13, %ymm14 /* keep 0x78: byte offset of a double within a 16-entry half table */
+ vblendvpd %ymm3, MOne+__svml_datan_data_internal_avx512(%rip), %ymm8, %ymm11 /* numerator: MOne in large-|x| lanes, else %ymm8 */
+ vblendvpd %ymm3, %ymm10, %ymm9, %ymm12 /* denominator: saturated |x| in large-|x| lanes, else %ymm9 */
+ vxorpd %ymm0, %ymm7, %ymm1 /* %ymm1 = x ^ |x|: only the sign bit of x is left */
+
+/* R+Rl = DiffX/Y */
+ vdivpd %ymm12, %ymm11, %ymm0 /* %ymm0 = R = numerator / denominator */
+ vextractf128 $1, %ymm14, %xmm4 /* %xmm4 = byte offsets for lanes 2 and 3 */
+ vmovd %xmm14, %eax /* lane 0 offset */
+ vmovd %xmm4, %ecx /* lane 2 offset */
+ movslq %eax, %rax
+ vpextrd $2, %xmm14, %edx /* lane 1 offset */
+ movslq %ecx, %rcx
+ vpextrd $2, %xmm4, %esi /* lane 3 offset */
+ movslq %edx, %rdx
+ movslq %esi, %rsi
+ vmovsd -128(%rax,%rdi), %xmm15 /* lane 0 from Tbl_H entries 0..15 */
+ vmovsd (%rdi,%rax), %xmm7 /* lane 0 from Tbl_H entries 16..31 */
+ vmovsd -128(%rcx,%rdi), %xmm5 /* lane 2 from Tbl_H entries 0..15 */
+ vmovsd (%rdi,%rcx), %xmm9 /* lane 2 from Tbl_H entries 16..31 */
+ vmovhpd -128(%rdx,%rdi), %xmm15, %xmm15 /* lane 1 from Tbl_H entries 0..15 */
+ vmovhpd (%rdi,%rdx), %xmm7, %xmm8 /* lane 1 from Tbl_H entries 16..31 */
+ vmovhpd -128(%rsi,%rdi), %xmm5, %xmm6 /* lane 3 from Tbl_H entries 0..15 */
+ vmovhpd (%rdi,%rsi), %xmm9, %xmm10 /* lane 3 from Tbl_H entries 16..31 */
+
+/* polynomial evaluation */
+ vmulpd %ymm0, %ymm0, %ymm5 /* %ymm5 = R^2 */
+ vmulpd %ymm5, %ymm5, %ymm4 /* %ymm4 = R^4 */
+ vinsertf128 $1, %xmm6, %ymm15, %ymm11 /* %ymm11 = all four lanes from entries 0..15 */
+ vinsertf128 $1, %xmm10, %ymm8, %ymm12 /* %ymm12 = all four lanes from entries 16..31 */
+ vblendvpd %ymm2, %ymm12, %ymm11, %ymm13 /* %ymm13 = table value picked by the dIndexMed mask */
+ vmovupd coeff+__svml_datan_data_internal_avx512(%rip), %ymm8
+ vmovupd coeff+64+__svml_datan_data_internal_avx512(%rip), %ymm2
+ vmulpd %ymm5, %ymm0, %ymm6 /* %ymm6 = R^3 */
+ vfmadd213pd coeff+32+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm8
+ vfmadd213pd coeff+96+__svml_datan_data_internal_avx512(%rip), %ymm5, %ymm2
+
+/* set table value to Pi/2 for large X */
+ vblendvpd %ymm3, Pi2+__svml_datan_data_internal_avx512(%rip), %ymm13, %ymm7 /* %ymm7 = Pi2 in large-|x| lanes, else the table value */
+ vmovupd coeff+128+__svml_datan_data_internal_avx512(%rip), %ymm3
+ vfmadd213pd %ymm2, %ymm4, %ymm8
+ vfmadd213pd coeff+160+__svml_datan_data_internal_avx512(%rip), %ymm3, %ymm5
+ vfmadd213pd %ymm5, %ymm4, %ymm8 /* %ymm8 = polynomial in R^2 */
+ vfmadd213pd %ymm0, %ymm6, %ymm8 /* %ymm8 = R + R^3 * polynomial */
+ vaddpd %ymm8, %ymm7, %ymm0 /* add the table value (or Pi2) */
+ vxorpd %ymm1, %ymm0, %ymm0 /* reapply the sign of x */
+ ret
+
+END(_ZGVdN4v_atan_avx2)
+
+ .section .rodata, "a"
+ .align 32
+
+.FLT_11:
+ .long 0x00000078,0x00000000,0x00000078,0x00000000,0x00000078,0x00000000,0x00000078,0x00000000
+ .type .FLT_11,@object
+ .size .FLT_11,32
+ .align 32
+
+#ifdef __svml_datan_data_internal_avx512_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(32)) VUINT32 AbsMask[4][2];
+ __declspec(align(32)) VUINT32 Shifter[4][2];
+ __declspec(align(32)) VUINT32 MaxThreshold[4][2];
+ __declspec(align(32)) VUINT32 MOne[4][2];
+ __declspec(align(32)) VUINT32 One[4][2];
+ __declspec(align(32)) VUINT32 LargeX[4][2];
+ __declspec(align(32)) VUINT32 Zero[4][2];
+ __declspec(align(32)) VUINT32 Tbl_H[32][2];
+ __declspec(align(32)) VUINT32 Tbl_L[32][2];
+ __declspec(align(32)) VUINT32 dIndexMed[4][2];
+ __declspec(align(32)) VUINT32 Pi2[4][2];
+ __declspec(align(32)) VUINT32 Pi2_low[4][2];
+ __declspec(align(32)) VUINT32 coeff[6][4][2];
+ } __svml_datan_data_internal_avx512;
+#endif
+__svml_datan_data_internal_avx512:
+ /*== AbsMask ==*/
+ .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
+ /*== Shifter ==*/
+ .align 32
+ .quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
+ /*== MaxThreshold ==*/
+ .align 32
+ .quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
+ /*== MOne ==*/
+ .align 32
+ .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
+ /*== One ==*/
+ .align 32
+ .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
+ /*== LargeX ==*/
+ .align 32
+ .quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
+ /*== Zero ==*/
+ .align 32
+ .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ /*== Tbl_H ==*/
+ .align 32
+ .quad 0x0000000000000000, 0x3fcf5b75f92c80dd
+ .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
+ .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
+ .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
+ .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
+ .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
+ .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
+ .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
+ .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
+ .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
+ .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
+ .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
+ .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
+ .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
+ .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
+ .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
+ /*== Tbl_L ==*/
+ .align 32
+ .quad 0x0000000000000000, 0x3c68ab6e3cf7afbd
+ .quad 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
+ .quad 0x3c81a62633145c07, 0x3c80dae13ad18a6b
+ .quad 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
+ .quad 0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
+ .quad 0x3c96254cb03bb199, 0xbc812c77e8a80f5c
+ .quad 0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
+ .quad 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
+ .quad 0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
+ .quad 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
+ .quad 0xbc831151a43b51ca, 0xbc8487d50bceb1a5
+ .quad 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
+ .quad 0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
+ .quad 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
+ .quad 0xbc929c86447928e7, 0xbc8957a7170df016
+ .quad 0xbc7cbe1896221608, 0xbc9fda5797b32a0b
+ /*== dIndexMed ==*/
+ .align 32
+ .quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
+ /*== Pi2 ==*/
+ .align 32
+ .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
+ /*== Pi2_low ==*/
+ .align 32
+ .quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
+ /*== coeff6 ==*/
+ .align 32
+ .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
+ .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
+ .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
+ .quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
+ .quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
+ .quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
+ .align 32
+ .type __svml_datan_data_internal_avx512,@object
+ .size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
new file mode 100644
@@ -0,0 +1,20 @@
+/* AVX2 version of vectorized atan, vector length is 8.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVeN8v_atan _ZGVeN8v_atan_avx2_wrapper
+#include "../svml_d_atan8_core.S"
new file mode 100644
@@ -0,0 +1,27 @@
+/* Multiple versions of vectorized atan, vector length is 8.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVeN8v_atan /* Symbol this file dispatches via ifunc.  */
+#include "ifunc-mathvec-avx512-skx.h" /* NOTE(review): presumably supplies REDIRECT_NAME and IFUNC_SELECTOR, which are not defined here -- confirm.  */
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED /* Shared builds only: hidden __GI_ alias of the symbol.  */
+__hidden_ver1 (_ZGVeN8v_atan, __GI__ZGVeN8v_atan, __redirect__ZGVeN8v_atan)
+ __attribute__ ((visibility ("hidden")));
+#endif /* SHARED */
new file mode 100644
@@ -0,0 +1,213 @@
+/* Function atan vectorized with AVX-512.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
+ * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
+ * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
+ * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
+ * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
+ * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
+ *
+ */
+
+/* Offsets for data table __svml_datan_data_internal_avx512
+ */
+#define AbsMask 0
+#define Shifter 64
+#define MaxThreshold 128
+#define MOne 192
+#define One 256
+#define LargeX 320
+#define Zero 384
+#define Tbl_H 448
+#define dIndexMed 704
+#define Pi2 768
+#define coeff_1 832
+#define coeff_2 896
+#define coeff_3 960
+#define coeff_4 1024
+#define coeff_5 1088
+#define coeff_6 1152
+
+#include <sysdep.h>
+
+ .text
+ .section .text.evex512,"ax",@progbits
+ENTRY(_ZGVeN8v_atan_skx) /* atan on eight packed doubles: argument and result in %zmm0.  */
+ vmovups Shifter+__svml_datan_data_internal_avx512(%rip), %zmm4
+ vmovups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %zmm3
+ vmovups One+__svml_datan_data_internal_avx512(%rip), %zmm9
+
+/* saturate X range */
+ vmovups LargeX+__svml_datan_data_internal_avx512(%rip), %zmm7
+ vandpd __svml_datan_data_internal_avx512(%rip), %zmm0, %zmm8 /* %zmm8 = |x| (AbsMask is at offset 0) */
+
+/* R+Rl = DiffX/Y */
+ vbroadcastsd .FLT_10(%rip), %zmm15 /* %zmm15 = 1.0 in every lane, used by the reciprocal refinement */
+ vaddpd {rn-sae}, %zmm4, %zmm8, %zmm2 /* %zmm2 = |x| + Shifter; table index ends up in the low bits */
+ vxorpd %zmm0, %zmm8, %zmm1 /* %zmm1 = x ^ |x|: only the sign bit of x is left */
+ vcmppd $29, {sae}, %zmm3, %zmm8, %k2 /* %k2 = lanes where |x| >= MaxThreshold */
+
+/* round to 2 bits after binary point */
+ vreducepd $40, {sae}, %zmm8, %zmm6 /* %zmm6 = DiffX = |x| minus |x| rounded to 2 fraction bits */
+ vsubpd {rn-sae}, %zmm4, %zmm2, %zmm5 /* %zmm5 = (|x| + Shifter) - Shifter, the rounded |x| */
+
+/*
+ * if |X|>=MaxThreshold, set DiffX=-1
+ * VMSUB(D, DiffX, LargeMask, Zero, One);
+ */
+ vblendmpd MOne+__svml_datan_data_internal_avx512(%rip), %zmm6, %zmm10{%k2}
+ vfmadd231pd {rn-sae}, %zmm8, %zmm5, %zmm9 /* %zmm9 = Y = One + rounded |x| * |x| */
+ vmovups dIndexMed+__svml_datan_data_internal_avx512(%rip), %zmm5
+
+/* table lookup sequence */
+ vmovups Tbl_H+__svml_datan_data_internal_avx512(%rip), %zmm6 /* Tbl_H entries 0..7 */
+ vgetmantpd $0, {sae}, %zmm10, %zmm14 /* mantissa of the numerator DiffX */
+ vgetexppd {sae}, %zmm10, %zmm11 /* exponent of the numerator DiffX */
+ vmovups coeff_5+__svml_datan_data_internal_avx512(%rip), %zmm10
+
+/*
+ * if |X|>=MaxThreshold, set Y=X
+ * VMADD(D, Y, LargeMask, X, Zero);
+ */
+ vminpd {sae}, %zmm8, %zmm7, %zmm9{%k2} /* in %k2 lanes, Y = min (LargeX, |x|) */
+ vcmppd $29, {sae}, %zmm5, %zmm2, %k1 /* %k1 = lanes where |x| + Shifter >= dIndexMed */
+ vmovups Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %zmm7 /* Tbl_H entries 16..23 */
+ vmovups coeff_1+__svml_datan_data_internal_avx512(%rip), %zmm8
+ vgetmantpd $0, {sae}, %zmm9, %zmm3 /* mantissa of the denominator Y */
+ vgetexppd {sae}, %zmm9, %zmm12 /* exponent of the denominator Y */
+ vmovups coeff_3+__svml_datan_data_internal_avx512(%rip), %zmm9
+ vpermt2pd Tbl_H+64+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm6 /* pick from Tbl_H entries 0..15 using the index bits of %zmm2 */
+ vsubpd {rn-sae}, %zmm12, %zmm11, %zmm4 /* %zmm4 = exponent(DiffX) - exponent(Y) */
+ vpermt2pd Tbl_H+192+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm7 /* pick from Tbl_H entries 16..31 using the index bits of %zmm2 */
+ vrcp14pd %zmm3, %zmm13 /* reciprocal estimate of the denominator mantissa */
+ vmovups coeff_4+__svml_datan_data_internal_avx512(%rip), %zmm12
+ vmovups coeff_6+__svml_datan_data_internal_avx512(%rip), %zmm11
+ vblendmpd %zmm7, %zmm6, %zmm2{%k1} /* %zmm2 = table value: upper half in %k1 lanes, else lower half */
+ vmulpd {rn-sae}, %zmm13, %zmm14, %zmm0 /* first quotient estimate of the mantissas */
+ vfnmadd231pd {rn-sae}, %zmm3, %zmm13, %zmm15 /* %zmm15 = 1.0 - estimate * denominator mantissa */
+ vfnmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm3 /* %zmm3 = numerator mantissa - quotient * denominator mantissa */
+ vfmadd213pd {rn-sae}, %zmm15, %zmm15, %zmm15
+ vfmadd213pd {rn-sae}, %zmm13, %zmm13, %zmm15 /* %zmm15 = refined reciprocal */
+ vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm3 /* %zmm3 = corrected mantissa quotient */
+ vscalefpd {rn-sae}, %zmm4, %zmm3, %zmm0 /* %zmm0 = R: quotient scaled by the exponent difference */
+
+/* set table value to Pi/2 for large X */
+ vblendmpd Pi2+__svml_datan_data_internal_avx512(%rip), %zmm2, %zmm3{%k2}
+ vmovups coeff_2+__svml_datan_data_internal_avx512(%rip), %zmm2
+
+/* polynomial evaluation */
+ vmulpd {rn-sae}, %zmm0, %zmm0, %zmm14 /* %zmm14 = R^2 */
+ vmulpd {rn-sae}, %zmm14, %zmm14, %zmm13 /* %zmm13 = R^4 */
+ vmulpd {rn-sae}, %zmm0, %zmm14, %zmm15 /* %zmm15 = R^3 */
+ vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm2
+ vfmadd231pd {rn-sae}, %zmm14, %zmm9, %zmm12
+ vfmadd213pd {rn-sae}, %zmm11, %zmm10, %zmm14
+ vfmadd213pd {rn-sae}, %zmm12, %zmm13, %zmm2
+ vfmadd213pd {rn-sae}, %zmm14, %zmm13, %zmm2 /* %zmm2 = polynomial in R^2 */
+ vfmadd213pd {rn-sae}, %zmm0, %zmm15, %zmm2 /* %zmm2 = R + R^3 * polynomial */
+ vaddpd {rn-sae}, %zmm3, %zmm2, %zmm0 /* add the table value (or Pi2) */
+ vxorpd %zmm1, %zmm0, %zmm0 /* reapply the sign of x */
+ ret
+
+END(_ZGVeN8v_atan_skx)
+
+ .section .rodata, "a"
+ .align 64
+
+#ifdef __svml_datan_data_internal_avx512_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(64)) VUINT32 AbsMask[8][2];
+ __declspec(align(64)) VUINT32 Shifter[8][2];
+ __declspec(align(64)) VUINT32 MaxThreshold[8][2];
+ __declspec(align(64)) VUINT32 MOne[8][2];
+ __declspec(align(64)) VUINT32 One[8][2];
+ __declspec(align(64)) VUINT32 LargeX[8][2];
+ __declspec(align(64)) VUINT32 Zero[8][2];
+ __declspec(align(64)) VUINT32 Tbl_H[32][2];
+ __declspec(align(64)) VUINT32 dIndexMed[8][2];
+ __declspec(align(64)) VUINT32 Pi2[8][2];
+ __declspec(align(64)) VUINT32 coeff[6][8][2];
+ } __svml_datan_data_internal_avx512;
+#endif
+__svml_datan_data_internal_avx512:
+ /*== AbsMask ==*/
+ .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
+ /*== Shifter ==*/
+ .align 64
+ .quad 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000, 0x4318000000000000
+ /*== MaxThreshold ==*/
+ .align 64
+ .quad 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000, 0x401f800000000000
+ /*== MOne ==*/
+ .align 64
+ .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
+ /*== One ==*/
+ .align 64
+ .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
+ /*== LargeX ==*/
+ .align 64
+ .quad 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000, 0x47f0000000000000
+ /*== Zero ==*/
+ .align 64
+ .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ /*== Tbl_H ==*/
+ .align 64
+ .quad 0x0000000000000000, 0x3fcf5b75f92c80dd
+ .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
+ .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
+ .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
+ .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
+ .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
+ .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
+ .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
+ .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
+ .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
+ .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
+ .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
+ .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
+ .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
+ .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
+ .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
+ /*== dIndexMed ==*/
+ .align 64
+ .quad 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010, 0x4318000000000010
+ /*== Pi2 ==*/
+ .align 64
+ .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
+ /*== coeff6 ==*/
+ .align 64
+ .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
+ .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc, 0xbfb74257c46790cc
+ .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
+ .quad 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da, 0xbfc249248eef04da
+ .quad 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e, 0x3fc999999998741e
+ .quad 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d, 0xbfd555555555554d
+ .align 64
+ .type __svml_datan_data_internal_avx512,@object
+ .size __svml_datan_data_internal_avx512,.-__svml_datan_data_internal_avx512
+ .align 8
+
+.FLT_10:
+ .long 0x00000000,0x3ff00000
+ .type .FLT_10,@object
+ .size .FLT_10,8
new file mode 100644
@@ -0,0 +1,20 @@
+/* AVX2 version of vectorized atanf.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVeN16v_atanf _ZGVeN16v_atanf_avx2_wrapper
+#include "../svml_s_atanf16_core.S"
new file mode 100644
@@ -0,0 +1,28 @@
+/* Multiple versions of vectorized atanf, vector length is 16.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVeN16v_atanf
+#include "ifunc-mathvec-avx512-skx.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVeN16v_atanf, __GI__ZGVeN16v_atanf,
+ __redirect__ZGVeN16v_atanf)
+ __attribute__ ((visibility ("hidden")));
+#endif
new file mode 100644
@@ -0,0 +1,174 @@
+/* Function atanf vectorized with AVX-512.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
+ * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
+ * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
+ * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
+ * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
+ * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
+ *
+ */
+
+/* Offsets for data table __svml_satan_data_internal_avx512
+ */
+#define AbsMask 0
+#define Shifter 64
+#define MaxThreshold 128
+#define MOne 192
+#define One 256
+#define LargeX 320
+#define Zero 384
+#define Tbl_H 448
+#define Pi2 576
+#define coeff_1 640
+#define coeff_2 704
+#define coeff_3 768
+
+#include <sysdep.h>
+
+ .text
+ .section .text.evex512,"ax",@progbits
+ENTRY(_ZGVeN16v_atanf_skx)
+ vandps __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
+ vmovups MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
+ vmovups One+__svml_satan_data_internal_avx512(%rip), %zmm8
+
+/* zmm7 = |x|.  zmm5 = DiffX = |x| minus |x| rounded to 2 bits after the binary point */
+ vreduceps $40, {sae}, %zmm7, %zmm5
+
+/* saturate X range: k1 marks lanes with |x| >= MaxThreshold, LargeX caps |x| on those lanes below */
+ vmovups LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
+ vmovups Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
+ vcmpps $29, {sae}, %zmm3, %zmm7, %k1
+
+/* table lookup sequence: zmm1 = |x| + Shifter is the vpermt2ps index, zmm0 = sign bit of x, zmm8 = Y = 1 + |x|*(|x| - DiffX) */
+ vmovups Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
+ vsubps {rn-sae}, %zmm5, %zmm7, %zmm4
+ vaddps {rn-sae}, %zmm2, %zmm7, %zmm1
+ vxorps %zmm0, %zmm7, %zmm0
+ vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
+ vmovups coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
+
+/* if |X| >= MaxThreshold, set DiffX = -1 */
+ vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
+ vmovups coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
+
+/* if |X| >= MaxThreshold, set Y = min(|X|, LargeX) */
+ vminps {sae}, %zmm7, %zmm6, %zmm8{%k1}
+
+/* R = DiffX/Y: divide the mantissas (vrcp14ps estimate plus one correction step) and rescale by the exponent difference */
+ vgetmantps $0, {sae}, %zmm9, %zmm12
+ vgetexpps {sae}, %zmm9, %zmm10
+ vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
+ vgetmantps $0, {sae}, %zmm8, %zmm15
+ vgetexpps {sae}, %zmm8, %zmm11
+ vmovups coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
+
+/* set table value to Pi/2 for large X */
+ vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
+ vrcp14ps %zmm15, %zmm13
+ vsubps {rn-sae}, %zmm11, %zmm10, %zmm2
+ vmulps {rn-sae}, %zmm13, %zmm12, %zmm14
+ vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
+ vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
+ vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
+
+/* polynomial evaluation: R + R^3*(coeff_3 + R^2*(coeff_2 + R^2*coeff_1)), then add the table value and restore the sign */
+ vmulps {rn-sae}, %zmm7, %zmm7, %zmm8
+ vmulps {rn-sae}, %zmm7, %zmm8, %zmm6
+ vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
+ vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
+ vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
+ vaddps {rn-sae}, %zmm9, %zmm8, %zmm10
+ vxorps %zmm0, %zmm10, %zmm0
+ ret
+
+END(_ZGVeN16v_atanf_skx)
+
+ .section .rodata, "a"
+ .align 64
+
+#ifdef __svml_satan_data_internal_avx512_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(64)) VUINT32 AbsMask[16][1];
+ __declspec(align(64)) VUINT32 Shifter[16][1];
+ __declspec(align(64)) VUINT32 MaxThreshold[16][1];
+ __declspec(align(64)) VUINT32 MOne[16][1];
+ __declspec(align(64)) VUINT32 One[16][1];
+ __declspec(align(64)) VUINT32 LargeX[16][1];
+ __declspec(align(64)) VUINT32 Zero[16][1];
+ __declspec(align(64)) VUINT32 Tbl_H[32][1];
+ __declspec(align(64)) VUINT32 Pi2[16][1];
+ __declspec(align(64)) VUINT32 coeff[3][16][1];
+ } __svml_satan_data_internal_avx512;
+#endif
+__svml_satan_data_internal_avx512:
+ /*== AbsMask ==*/
+ .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+ /*== Shifter ==*/
+ .align 64
+ .long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
+ /*== MaxThreshold ==*/
+ .align 64
+ .long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
+ /*== MOne ==*/
+ .align 64
+ .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+ /*== One ==*/
+ .align 64
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+ /*== LargeX ==*/
+ .align 64
+ .long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
+ /*== Zero ==*/
+ .align 64
+ .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
+ /*== Tbl_H ==*/
+ .align 64
+ .long 0x00000000, 0x3e7adbb0
+ .long 0x3eed6338, 0x3f24bc7d
+ .long 0x3f490fdb, 0x3f6563e3
+ .long 0x3f7b985f, 0x3f869c79
+ .long 0x3f8db70d, 0x3f93877b
+ .long 0x3f985b6c, 0x3f9c6b53
+ .long 0x3f9fe0bb, 0x3fa2daa4
+ .long 0x3fa57088, 0x3fa7b46f
+ .long 0x3fa9b465, 0x3fab7b7a
+ .long 0x3fad1283, 0x3fae809e
+ .long 0x3fafcb99, 0x3fb0f836
+ .long 0x3fb20a6a, 0x3fb30581
+ .long 0x3fb3ec43, 0x3fb4c10a
+ .long 0x3fb585d7, 0x3fb63c64
+ .long 0x3fb6e62c, 0x3fb78478
+ .long 0x3fb81868, 0x3fb8a2f5
+ /*== Pi2 ==*/
+ .align 64
+ .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+ /*== coeff_1, coeff_2, coeff_3 ==*/
+ .align 64
+ .long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
+ .long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
+ .long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
+ .align 64
+ .type __svml_satan_data_internal_avx512,@object
+ .size __svml_satan_data_internal_avx512,.-__svml_satan_data_internal_avx512
new file mode 100644
@@ -0,0 +1,20 @@
+/* SSE2 version of vectorized atanf, vector length is 4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVbN4v_atanf _ZGVbN4v_atanf_sse2
+#include "../svml_s_atanf4_core.S"
new file mode 100644
@@ -0,0 +1,28 @@
+/* Multiple versions of vectorized atanf, vector length is 4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVbN4v_atanf
+#include "ifunc-mathvec-sse4_1.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVbN4v_atanf, __GI__ZGVbN4v_atanf,
+ __redirect__ZGVbN4v_atanf)
+ __attribute__ ((visibility ("hidden")));
+#endif
new file mode 100644
@@ -0,0 +1,164 @@
+/* Function atanf vectorized with SSE4.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
+ * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
+ * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
+ * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
+ * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
+ * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
+ *
+ */
+
+/* Offsets for data table __svml_satan_data_internal
+ */
+#define _sSIGN_MASK 0
+#define _sABS_MASK 16
+#define _sONE 32
+#define _sPIO2 48
+#define _sPC8 64
+#define _sPC7 80
+#define _sPC6 96
+#define _sPC5 112
+#define _sPC4 128
+#define _sPC3 144
+#define _sPC2 160
+#define _sPC1 176
+#define _sPC0 192
+
+#include <sysdep.h>
+
+ .text
+ .section .text.sse4,"ax",@progbits
+ENTRY(_ZGVbN4v_atanf_sse4)
+/*
+ * Argument reduction uses minps/maxps:
+ * xmm2 = |x|, xmm1 = min(|x|, 1), xmm3 = max(|x|, 1),
+ * then xmm1 = xmm1 / xmm3.  xmm9 = mask of lanes with
+ * |x| <= 1; it is copied to xmm10 for the Pi/2 term
+ * before xmm9 is reused for the reduced argument r.
+ */
+ movups _sABS_MASK+__svml_satan_data_internal(%rip), %xmm2
+
+/*
+ * 1) If x>1, then r=-1/x, PIO2=Pi/2
+ * 2) If -1<=x<=1, then r=x, PIO2=0
+ * 3) If x<-1, then r=-1/x, PIO2=-Pi/2
+ */
+ movups _sONE+__svml_satan_data_internal(%rip), %xmm1
+ andps %xmm0, %xmm2
+ movaps %xmm2, %xmm9
+ movaps %xmm1, %xmm3
+ cmpleps %xmm1, %xmm9
+ maxps %xmm2, %xmm3
+ minps %xmm2, %xmm1
+ divps %xmm3, %xmm1
+ movups __svml_satan_data_internal(%rip), %xmm4
+ movaps %xmm9, %xmm10
+ andps %xmm4, %xmm0
+ andnps %xmm4, %xmm9
+ pxor %xmm0, %xmm9
+ pxor %xmm1, %xmm9
+
+/* Polynomial in r^2 (xmm8): even-index coefficients accumulate in xmm6, odd-index ones in xmm5 (finished in xmm7), both stepping by r^4 (xmm7). */
+ movaps %xmm9, %xmm8
+ mulps %xmm9, %xmm8
+ movaps %xmm8, %xmm7
+ mulps %xmm8, %xmm7
+ movups _sPC8+__svml_satan_data_internal(%rip), %xmm6
+ mulps %xmm7, %xmm6
+ movups _sPC7+__svml_satan_data_internal(%rip), %xmm5
+ mulps %xmm7, %xmm5
+ addps _sPC6+__svml_satan_data_internal(%rip), %xmm6
+ mulps %xmm7, %xmm6
+ addps _sPC5+__svml_satan_data_internal(%rip), %xmm5
+ mulps %xmm7, %xmm5
+ addps _sPC4+__svml_satan_data_internal(%rip), %xmm6
+ mulps %xmm7, %xmm6
+ addps _sPC3+__svml_satan_data_internal(%rip), %xmm5
+ mulps %xmm5, %xmm7
+ addps _sPC2+__svml_satan_data_internal(%rip), %xmm6
+ mulps %xmm8, %xmm6
+ addps _sPC1+__svml_satan_data_internal(%rip), %xmm7
+ andnps _sPIO2+__svml_satan_data_internal(%rip), %xmm10
+ addps %xmm6, %xmm7
+ mulps %xmm7, %xmm8
+ pxor %xmm0, %xmm10
+ addps _sPC0+__svml_satan_data_internal(%rip), %xmm8
+
+/* Reconstruction: result = r * poly + xmm10, where xmm10 is Pi/2 with the sign of x when |x| > 1 and a signed zero otherwise. */
+ mulps %xmm8, %xmm9
+ addps %xmm9, %xmm10
+ movaps %xmm10, %xmm0
+ ret
+
+END(_ZGVbN4v_atanf_sse4)
+
+ .section .rodata, "a"
+ .align 16
+
+#ifdef __svml_satan_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(16)) VUINT32 _sSIGN_MASK[4][1];
+ __declspec(align(16)) VUINT32 _sABS_MASK[4][1];
+ __declspec(align(16)) VUINT32 _sONE[4][1];
+ __declspec(align(16)) VUINT32 _sPIO2[4][1];
+ __declspec(align(16)) VUINT32 _sPC8[4][1];
+ __declspec(align(16)) VUINT32 _sPC7[4][1];
+ __declspec(align(16)) VUINT32 _sPC6[4][1];
+ __declspec(align(16)) VUINT32 _sPC5[4][1];
+ __declspec(align(16)) VUINT32 _sPC4[4][1];
+ __declspec(align(16)) VUINT32 _sPC3[4][1];
+ __declspec(align(16)) VUINT32 _sPC2[4][1];
+ __declspec(align(16)) VUINT32 _sPC1[4][1];
+ __declspec(align(16)) VUINT32 _sPC0[4][1];
+} __svml_satan_data_internal;
+#endif
+__svml_satan_data_internal:
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 //_sSIGN_MASK
+ .align 16
+ .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF //_sABS_MASK
+ .align 16
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sONE
+ .align 16
+ .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB //_sPIO2
+ .align 16
+ .long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 //_sPC8
+ .align 16
+ .long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 //_sPC7
+ .align 16
+ .long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 //_sPC6
+ .align 16
+ .long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 //_sPC5
+ .align 16
+ .long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 //_sPC4
+ .align 16
+ .long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 //_sPC3
+ .align 16
+ .long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F //_sPC2
+ .align 16
+ .long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 //_sPC1
+ .align 16
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sPC0
+ .align 16
+ .type __svml_satan_data_internal,@object
+ .size __svml_satan_data_internal,.-__svml_satan_data_internal
new file mode 100644
@@ -0,0 +1,20 @@
+/* SSE version of vectorized atanf, vector length is 8.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define _ZGVdN8v_atanf _ZGVdN8v_atanf_sse_wrapper
+#include "../svml_s_atanf8_core.S"
new file mode 100644
@@ -0,0 +1,28 @@
+/* Multiple versions of vectorized atanf, vector length is 8.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define SYMBOL_NAME _ZGVdN8v_atanf
+#include "ifunc-mathvec-avx2.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVdN8v_atanf, __GI__ZGVdN8v_atanf,
+ __redirect__ZGVdN8v_atanf)
+ __attribute__ ((visibility ("hidden")));
+#endif
new file mode 100644
@@ -0,0 +1,148 @@
+/* Function atanf vectorized with AVX2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
+ * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
+ * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
+ * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
+ * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
+ * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
+ *
+ */
+
+/* Offsets for data table __svml_satan_data_internal
+ */
+#define _sSIGN_MASK 0
+#define _sABS_MASK 32
+#define _sONE 64
+#define _sPIO2 96
+#define _sPC8 128
+#define _sPC7 160
+#define _sPC6 192
+#define _sPC5 224
+#define _sPC4 256
+#define _sPC3 288
+#define _sPC2 320
+#define _sPC1 352
+#define _sPC0 384
+
+#include <sysdep.h>
+
+ .text
+ .section .text.avx2,"ax",@progbits
+ENTRY(_ZGVdN8v_atanf_avx2)
+/*
+ * 1) If x>1, then r=-1/x, PIO2=Pi/2
+ * 2) If -1<=x<=1, then r=x, PIO2=0
+ * 3) If x<-1, then r=-1/x, PIO2=-Pi/2
+ */
+ vmovups _sONE+__svml_satan_data_internal(%rip), %ymm2
+ vmovups __svml_satan_data_internal(%rip), %ymm7
+ vmovups _sPC7+__svml_satan_data_internal(%rip), %ymm13
+
+/*
+ * Argument reduction uses vminps/vmaxps:
+ * ymm3 = |x|, ymm4 = min(|x|, 1), ymm5 = max(|x|, 1),
+ * ymm11 = ymm4 / ymm5, ymm6 = mask of lanes with |x| <= 1.
+ * ymm9 = sign bit of x; lanes with |x| > 1 get the sign
+ * flipped, giving the reduced argument r in ymm15.
+ */
+ vandps _sABS_MASK+__svml_satan_data_internal(%rip), %ymm0, %ymm3
+ vmaxps %ymm3, %ymm2, %ymm5
+ vminps %ymm3, %ymm2, %ymm4
+ vcmple_oqps %ymm2, %ymm3, %ymm6
+ vdivps %ymm5, %ymm4, %ymm11
+ vandps %ymm7, %ymm0, %ymm9
+ vandnps %ymm7, %ymm6, %ymm8
+ vxorps %ymm9, %ymm8, %ymm10
+ vxorps %ymm11, %ymm10, %ymm15
+
+/* Polynomial in r^2 (ymm14): even-index coefficients accumulate in ymm0, odd-index ones in ymm13, both stepping by r^4 (ymm12). */
+ vmulps %ymm15, %ymm15, %ymm14
+ vmovups _sPC8+__svml_satan_data_internal(%rip), %ymm0
+ vmulps %ymm14, %ymm14, %ymm12
+ vfmadd213ps _sPC6+__svml_satan_data_internal(%rip), %ymm12, %ymm0
+ vfmadd213ps _sPC5+__svml_satan_data_internal(%rip), %ymm12, %ymm13
+ vfmadd213ps _sPC4+__svml_satan_data_internal(%rip), %ymm12, %ymm0
+ vfmadd213ps _sPC3+__svml_satan_data_internal(%rip), %ymm12, %ymm13
+ vfmadd213ps _sPC2+__svml_satan_data_internal(%rip), %ymm12, %ymm0
+ vfmadd213ps _sPC1+__svml_satan_data_internal(%rip), %ymm12, %ymm13
+ vfmadd213ps %ymm13, %ymm14, %ymm0
+ vfmadd213ps _sPC0+__svml_satan_data_internal(%rip), %ymm14, %ymm0
+ vandnps _sPIO2+__svml_satan_data_internal(%rip), %ymm6, %ymm1
+ vxorps %ymm9, %ymm1, %ymm1
+
+/* Reconstruction: result = r * poly + ymm1, where ymm1 is Pi/2 with the sign of x when |x| > 1 and a signed zero otherwise. */
+ vfmadd213ps %ymm1, %ymm15, %ymm0
+ ret
+
+END(_ZGVdN8v_atanf_avx2)
+
+ .section .rodata, "a"
+ .align 32
+
+#ifdef __svml_satan_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+ __declspec(align(32)) VUINT32 _sSIGN_MASK[8][1];
+ __declspec(align(32)) VUINT32 _sABS_MASK[8][1];
+ __declspec(align(32)) VUINT32 _sONE[8][1];
+ __declspec(align(32)) VUINT32 _sPIO2[8][1];
+ __declspec(align(32)) VUINT32 _sPC8[8][1];
+ __declspec(align(32)) VUINT32 _sPC7[8][1];
+ __declspec(align(32)) VUINT32 _sPC6[8][1];
+ __declspec(align(32)) VUINT32 _sPC5[8][1];
+ __declspec(align(32)) VUINT32 _sPC4[8][1];
+ __declspec(align(32)) VUINT32 _sPC3[8][1];
+ __declspec(align(32)) VUINT32 _sPC2[8][1];
+ __declspec(align(32)) VUINT32 _sPC1[8][1];
+ __declspec(align(32)) VUINT32 _sPC0[8][1];
+} __svml_satan_data_internal;
+#endif
+__svml_satan_data_internal:
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 //_sSIGN_MASK
+ .align 32
+ .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF //_sABS_MASK
+ .align 32
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sONE
+ .align 32
+ .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB //_sPIO2
+ .align 32
+ .long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 //_sPC8
+ .align 32
+ .long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 //_sPC7
+ .align 32
+ .long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 //_sPC6
+ .align 32
+ .long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 //_sPC5
+ .align 32
+ .long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 //_sPC4
+ .align 32
+ .long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 //_sPC3
+ .align 32
+ .long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F //_sPC2
+ .align 32
+ .long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 //_sPC1
+ .align 32
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 //_sPC0
+ .align 32
+ .type __svml_satan_data_internal,@object
+ .size __svml_satan_data_internal,.-__svml_satan_data_internal
new file mode 100644
@@ -0,0 +1,29 @@
+/* Function atan vectorized with SSE2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVbN2v_atan)
+WRAPPER_IMPL_SSE2 atan
+END (_ZGVbN2v_atan)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVbN2v_atan)
+#endif
new file mode 100644
@@ -0,0 +1,29 @@
+/* Function atan vectorized with AVX2, wrapper version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVdN4v_atan)
+WRAPPER_IMPL_AVX _ZGVbN2v_atan
+END (_ZGVdN4v_atan)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVdN4v_atan)
+#endif
new file mode 100644
@@ -0,0 +1,25 @@
+/* Function atan vectorized in AVX ISA as wrapper to SSE4 ISA version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVcN4v_atan)
+WRAPPER_IMPL_AVX _ZGVbN2v_atan
+END (_ZGVcN4v_atan)
new file mode 100644
@@ -0,0 +1,25 @@
+/* Function atan vectorized with AVX-512, wrapper to AVX2.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN8v_atan)
+WRAPPER_IMPL_AVX512 _ZGVdN4v_atan
+END (_ZGVeN8v_atan)
new file mode 100644
@@ -0,0 +1,25 @@
+/* Function atanf vectorized with AVX-512. Wrapper to AVX2 version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN16v_atanf)
+WRAPPER_IMPL_AVX512 _ZGVdN8v_atanf
+END (_ZGVeN16v_atanf)
new file mode 100644
@@ -0,0 +1,29 @@
+/* Function atanf vectorized with SSE2, wrapper version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVbN4v_atanf)
+WRAPPER_IMPL_SSE2 atanf
+END (_ZGVbN4v_atanf)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVbN4v_atanf)
+#endif
new file mode 100644
@@ -0,0 +1,29 @@
+/* Function atanf vectorized with AVX2, wrapper version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVdN8v_atanf) /* 8-lane float atanf entry point, AVX2 wrapper variant.  */
+WRAPPER_IMPL_AVX _ZGVbN4v_atanf /* Body built on the 4-lane _ZGVbN4v_atanf; presumably one call per half -- confirm in svml_s_wrapper_impl.h.  */
+END (_ZGVdN8v_atanf)
+
+#ifndef USE_MULTIARCH /* Hidden definition is emitted here only when USE_MULTIARCH is not defined.  */
+ libmvec_hidden_def (_ZGVdN8v_atanf)
+#endif
new file mode 100644
@@ -0,0 +1,25 @@
+/* Function atanf vectorized in AVX ISA as wrapper to SSE4 ISA version.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVcN8v_atanf) /* 8-lane float atanf entry point for the AVX ISA.  */
+WRAPPER_IMPL_AVX _ZGVbN4v_atanf /* Body built on the 4-lane _ZGVbN4v_atanf; presumably one call per half -- confirm in svml_s_wrapper_impl.h.  */
+END (_ZGVcN8v_atanf)
new file mode 100644
@@ -0,0 +1 @@
+#include "test-double-libmvec-atan.c"
new file mode 100644
@@ -0,0 +1 @@
+#include "test-double-libmvec-atan.c"
new file mode 100644
@@ -0,0 +1 @@
+#include "test-double-libmvec-atan.c"
new file mode 100644
@@ -0,0 +1,3 @@
+#define LIBMVEC_TYPE double /* Element type; presumably consumed by the header included below -- confirm.  */
+#define LIBMVEC_FUNC atan /* Function name under test; presumably consumed by the header included below -- confirm.  */
+#include "test-vector-abi-arg1.h"
@@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
+VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVbN2v_atan)
#define VEC_INT_TYPE __m128i
@@ -31,6 +31,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
+VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVdN4v_atan)
#ifndef __ILP32__
# define VEC_INT_TYPE __m256i
@@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
+VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVcN4v_atan)
#define VEC_INT_TYPE __m128i
@@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
+VECTOR_WRAPPER (WRAPPER_NAME (atan), _ZGVeN8v_atan)
#ifndef __ILP32__
# define VEC_INT_TYPE __m512i
new file mode 100644
@@ -0,0 +1 @@
+#include "test-float-libmvec-atanf.c"
new file mode 100644
@@ -0,0 +1 @@
+#include "test-float-libmvec-atanf.c"
new file mode 100644
@@ -0,0 +1 @@
+#include "test-float-libmvec-atanf.c"
new file mode 100644
@@ -0,0 +1,3 @@
+#define LIBMVEC_TYPE float /* Element type; presumably consumed by the header included below -- confirm.  */
+#define LIBMVEC_FUNC atanf /* Function name under test; presumably consumed by the header included below -- confirm.  */
+#include "test-vector-abi-arg1.h"
@@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
+VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVeN16v_atanf)
#define VEC_INT_TYPE __m512i
@@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
+VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVbN4v_atanf)
#define VEC_INT_TYPE __m128i
@@ -31,6 +31,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
+VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVdN8v_atanf)
/* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */
#undef VECTOR_WRAPPER_fFF
@@ -28,6 +28,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
+VECTOR_WRAPPER (WRAPPER_NAME (atanf), _ZGVcN8v_atanf)
#define VEC_INT_TYPE __m128i