diff mbox series

[v3,1/1] x86-64: Add vector acos/acosf implementation to libmvec

Message ID 20211215185413.4137536-2-skpgkp2@gmail.com
State Superseded
Headers show
Series Add vector math function acos/acosf to libmvec | expand

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Sunil K Pandey Dec. 15, 2021, 6:54 p.m. UTC
Implement vectorized acos/acosf containing SSE, AVX, AVX2 and AVX512
versions for libmvec as per vector ABI.  It also contains accuracy and
ABI tests for vector acos/acosf with regenerated ulps.
---
 bits/libm-simd-decl-stubs.h                   |  11 +
 math/bits/mathcalls.h                         |   2 +-
 .../unix/sysv/linux/x86_64/libmvec.abilist    |   8 +
 sysdeps/x86/fpu/bits/math-vector.h            |   4 +
 .../x86/fpu/finclude/math-vector-fortran.h    |   4 +
 sysdeps/x86_64/fpu/Makeconfig                 |   1 +
 sysdeps/x86_64/fpu/Versions                   |   4 +
 sysdeps/x86_64/fpu/libm-test-ulps             |  20 +
 .../fpu/multiarch/ifunc-mathvec-avx512-skx.h  |  39 ++
 .../fpu/multiarch/svml_d_acos2_core-sse2.S    |  20 +
 .../x86_64/fpu/multiarch/svml_d_acos2_core.c  |  27 ++
 .../fpu/multiarch/svml_d_acos2_core_sse4.S    | 399 ++++++++++++++++++
 .../fpu/multiarch/svml_d_acos4_core-sse.S     |  20 +
 .../x86_64/fpu/multiarch/svml_d_acos4_core.c  |  27 ++
 .../fpu/multiarch/svml_d_acos4_core_avx2.S    | 368 ++++++++++++++++
 .../fpu/multiarch/svml_d_acos8_core-avx2.S    |  20 +
 .../x86_64/fpu/multiarch/svml_d_acos8_core.c  |  27 ++
 .../fpu/multiarch/svml_d_acos8_core_avx512.S  | 386 +++++++++++++++++
 .../fpu/multiarch/svml_s_acosf16_core-avx2.S  |  20 +
 .../fpu/multiarch/svml_s_acosf16_core.c       |  28 ++
 .../multiarch/svml_s_acosf16_core_avx512.S    | 332 +++++++++++++++
 .../fpu/multiarch/svml_s_acosf4_core-sse2.S   |  20 +
 .../x86_64/fpu/multiarch/svml_s_acosf4_core.c |  28 ++
 .../fpu/multiarch/svml_s_acosf4_core_sse4.S   | 351 +++++++++++++++
 .../fpu/multiarch/svml_s_acosf8_core-sse.S    |  20 +
 .../x86_64/fpu/multiarch/svml_s_acosf8_core.c |  28 ++
 .../fpu/multiarch/svml_s_acosf8_core_avx2.S   | 332 +++++++++++++++
 sysdeps/x86_64/fpu/svml_d_acos2_core.S        |  29 ++
 sysdeps/x86_64/fpu/svml_d_acos4_core.S        |  29 ++
 sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S    |  25 ++
 sysdeps/x86_64/fpu/svml_d_acos8_core.S        |  25 ++
 sysdeps/x86_64/fpu/svml_s_acosf16_core.S      |  25 ++
 sysdeps/x86_64/fpu/svml_s_acosf4_core.S       |  29 ++
 sysdeps/x86_64/fpu/svml_s_acosf8_core.S       |  29 ++
 sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S   |  25 ++
 .../x86_64/fpu/test-double-libmvec-acos-avx.c |   1 +
 .../fpu/test-double-libmvec-acos-avx2.c       |   1 +
 .../fpu/test-double-libmvec-acos-avx512f.c    |   1 +
 sysdeps/x86_64/fpu/test-double-libmvec-acos.c |   3 +
 .../x86_64/fpu/test-double-vlen2-wrappers.c   |   1 +
 .../fpu/test-double-vlen4-avx2-wrappers.c     |   1 +
 .../x86_64/fpu/test-double-vlen4-wrappers.c   |   1 +
 .../x86_64/fpu/test-double-vlen8-wrappers.c   |   1 +
 .../x86_64/fpu/test-float-libmvec-acosf-avx.c |   1 +
 .../fpu/test-float-libmvec-acosf-avx2.c       |   1 +
 .../fpu/test-float-libmvec-acosf-avx512f.c    |   1 +
 sysdeps/x86_64/fpu/test-float-libmvec-acosf.c |   3 +
 .../x86_64/fpu/test-float-vlen16-wrappers.c   |   1 +
 .../x86_64/fpu/test-float-vlen4-wrappers.c    |   1 +
 .../fpu/test-float-vlen8-avx2-wrappers.c      |   1 +
 .../x86_64/fpu/test-float-vlen8-wrappers.c    |   1 +
 51 files changed, 2781 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
 create mode 100644 sysdeps/x86_64/fpu/svml_d_acos2_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
 create mode 100644 sysdeps/x86_64/fpu/svml_d_acos8_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf16_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf4_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
 create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
 create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
 create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
 create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos.c
 create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
 create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
 create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
 create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf.c

Comments

Noah Goldstein Dec. 15, 2021, 7:43 p.m. UTC | #1
On Wed, Dec 15, 2021 at 12:55 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Implement vectorized acos/acosf containing SSE, AVX, AVX2 and AVX512
> versions for libmvec as per vector ABI.  It also contains accuracy and
> ABI tests for vector acos/acosf with regenerated ulps.
> ---
>  bits/libm-simd-decl-stubs.h                   |  11 +
>  math/bits/mathcalls.h                         |   2 +-
>  .../unix/sysv/linux/x86_64/libmvec.abilist    |   8 +
>  sysdeps/x86/fpu/bits/math-vector.h            |   4 +
>  .../x86/fpu/finclude/math-vector-fortran.h    |   4 +
>  sysdeps/x86_64/fpu/Makeconfig                 |   1 +
>  sysdeps/x86_64/fpu/Versions                   |   4 +
>  sysdeps/x86_64/fpu/libm-test-ulps             |  20 +
>  .../fpu/multiarch/ifunc-mathvec-avx512-skx.h  |  39 ++
>  .../fpu/multiarch/svml_d_acos2_core-sse2.S    |  20 +
>  .../x86_64/fpu/multiarch/svml_d_acos2_core.c  |  27 ++
>  .../fpu/multiarch/svml_d_acos2_core_sse4.S    | 399 ++++++++++++++++++
>  .../fpu/multiarch/svml_d_acos4_core-sse.S     |  20 +
>  .../x86_64/fpu/multiarch/svml_d_acos4_core.c  |  27 ++
>  .../fpu/multiarch/svml_d_acos4_core_avx2.S    | 368 ++++++++++++++++
>  .../fpu/multiarch/svml_d_acos8_core-avx2.S    |  20 +
>  .../x86_64/fpu/multiarch/svml_d_acos8_core.c  |  27 ++
>  .../fpu/multiarch/svml_d_acos8_core_avx512.S  | 386 +++++++++++++++++
>  .../fpu/multiarch/svml_s_acosf16_core-avx2.S  |  20 +
>  .../fpu/multiarch/svml_s_acosf16_core.c       |  28 ++
>  .../multiarch/svml_s_acosf16_core_avx512.S    | 332 +++++++++++++++
>  .../fpu/multiarch/svml_s_acosf4_core-sse2.S   |  20 +
>  .../x86_64/fpu/multiarch/svml_s_acosf4_core.c |  28 ++
>  .../fpu/multiarch/svml_s_acosf4_core_sse4.S   | 351 +++++++++++++++
>  .../fpu/multiarch/svml_s_acosf8_core-sse.S    |  20 +
>  .../x86_64/fpu/multiarch/svml_s_acosf8_core.c |  28 ++
>  .../fpu/multiarch/svml_s_acosf8_core_avx2.S   | 332 +++++++++++++++
>  sysdeps/x86_64/fpu/svml_d_acos2_core.S        |  29 ++
>  sysdeps/x86_64/fpu/svml_d_acos4_core.S        |  29 ++
>  sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S    |  25 ++
>  sysdeps/x86_64/fpu/svml_d_acos8_core.S        |  25 ++
>  sysdeps/x86_64/fpu/svml_s_acosf16_core.S      |  25 ++
>  sysdeps/x86_64/fpu/svml_s_acosf4_core.S       |  29 ++
>  sysdeps/x86_64/fpu/svml_s_acosf8_core.S       |  29 ++
>  sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S   |  25 ++
>  .../x86_64/fpu/test-double-libmvec-acos-avx.c |   1 +
>  .../fpu/test-double-libmvec-acos-avx2.c       |   1 +
>  .../fpu/test-double-libmvec-acos-avx512f.c    |   1 +
>  sysdeps/x86_64/fpu/test-double-libmvec-acos.c |   3 +
>  .../x86_64/fpu/test-double-vlen2-wrappers.c   |   1 +
>  .../fpu/test-double-vlen4-avx2-wrappers.c     |   1 +
>  .../x86_64/fpu/test-double-vlen4-wrappers.c   |   1 +
>  .../x86_64/fpu/test-double-vlen8-wrappers.c   |   1 +
>  .../x86_64/fpu/test-float-libmvec-acosf-avx.c |   1 +
>  .../fpu/test-float-libmvec-acosf-avx2.c       |   1 +
>  .../fpu/test-float-libmvec-acosf-avx512f.c    |   1 +
>  sysdeps/x86_64/fpu/test-float-libmvec-acosf.c |   3 +
>  .../x86_64/fpu/test-float-vlen16-wrappers.c   |   1 +
>  .../x86_64/fpu/test-float-vlen4-wrappers.c    |   1 +
>  .../fpu/test-float-vlen8-avx2-wrappers.c      |   1 +
>  .../x86_64/fpu/test-float-vlen8-wrappers.c    |   1 +
>  51 files changed, 2781 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_d_acos2_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_d_acos8_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf16_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf4_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
>  create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
>  create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
>  create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
>  create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos.c
>  create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
>  create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
>  create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
>  create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
>
> diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h
> index b80ff332a0..2ccdd1fc53 100644
> --- a/bits/libm-simd-decl-stubs.h
> +++ b/bits/libm-simd-decl-stubs.h
> @@ -98,4 +98,15 @@
>  #define __DECL_SIMD_powf32x
>  #define __DECL_SIMD_powf64x
>  #define __DECL_SIMD_powf128x
> +
> +#define __DECL_SIMD_acos
> +#define __DECL_SIMD_acosf
> +#define __DECL_SIMD_acosl
> +#define __DECL_SIMD_acosf16
> +#define __DECL_SIMD_acosf32
> +#define __DECL_SIMD_acosf64
> +#define __DECL_SIMD_acosf128
> +#define __DECL_SIMD_acosf32x
> +#define __DECL_SIMD_acosf64x
> +#define __DECL_SIMD_acosf128x
>  #endif
> diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
> index da4cf4e10c..2cc6654208 100644
> --- a/math/bits/mathcalls.h
> +++ b/math/bits/mathcalls.h
> @@ -50,7 +50,7 @@
>  /* Trigonometric functions.  */
>
>  /* Arc cosine of X.  */
> -__MATHCALL (acos,, (_Mdouble_ __x));
> +__MATHCALL_VEC (acos,, (_Mdouble_ __x));
>  /* Arc sine of X.  */
>  __MATHCALL (asin,, (_Mdouble_ __x));
>  /* Arc tangent of X.  */
> diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> index 363d4ace1e..b37b55777e 100644
> --- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> @@ -46,3 +46,11 @@ GLIBC_2.22 _ZGVeN8v_log F
>  GLIBC_2.22 _ZGVeN8v_sin F
>  GLIBC_2.22 _ZGVeN8vv_pow F
>  GLIBC_2.22 _ZGVeN8vvv_sincos F
> +GLIBC_2.35 _ZGVbN2v_acos F
> +GLIBC_2.35 _ZGVbN4v_acosf F
> +GLIBC_2.35 _ZGVcN4v_acos F
> +GLIBC_2.35 _ZGVcN8v_acosf F
> +GLIBC_2.35 _ZGVdN4v_acos F
> +GLIBC_2.35 _ZGVdN8v_acosf F
> +GLIBC_2.35 _ZGVeN16v_acosf F
> +GLIBC_2.35 _ZGVeN8v_acos F
> diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h
> index dc0bfb3705..dabb74cbb9 100644
> --- a/sysdeps/x86/fpu/bits/math-vector.h
> +++ b/sysdeps/x86/fpu/bits/math-vector.h
> @@ -58,6 +58,10 @@
>  #  define __DECL_SIMD_pow __DECL_SIMD_x86_64
>  #  undef __DECL_SIMD_powf
>  #  define __DECL_SIMD_powf __DECL_SIMD_x86_64
> +#  undef __DECL_SIMD_acos
> +#  define __DECL_SIMD_acos __DECL_SIMD_x86_64
> +#  undef __DECL_SIMD_acosf
> +#  define __DECL_SIMD_acosf __DECL_SIMD_x86_64
>
>  # endif
>  #endif
> diff --git a/sysdeps/x86/fpu/finclude/math-vector-fortran.h b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> index 311bb4e391..4bcbd1fbce 100644
> --- a/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> +++ b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> @@ -28,6 +28,8 @@
>  !GCC$ builtin (expf) attributes simd (notinbranch) if('x86_64')
>  !GCC$ builtin (pow) attributes simd (notinbranch) if('x86_64')
>  !GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
> +!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
> +!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
>
>  !GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
>  !GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
> @@ -41,3 +43,5 @@
>  !GCC$ builtin (expf) attributes simd (notinbranch) if('x32')
>  !GCC$ builtin (pow) attributes simd (notinbranch) if('x32')
>  !GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
> +!GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
> +!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
> diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig
> index b0e3bf7887..7acf1f306c 100644
> --- a/sysdeps/x86_64/fpu/Makeconfig
> +++ b/sysdeps/x86_64/fpu/Makeconfig
> @@ -22,6 +22,7 @@ postclean-generated += libmvec.mk
>
>  # Define for both math and mathvec directories.
>  libmvec-funcs = \
> +  acos \
>    cos \
>    exp \
>    log \
> diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions
> index 08132045d6..3fd1379b17 100644
> --- a/sysdeps/x86_64/fpu/Versions
> +++ b/sysdeps/x86_64/fpu/Versions
> @@ -12,5 +12,9 @@ libmvec {
>      _ZGVbN4v_expf; _ZGVcN8v_expf; _ZGVdN8v_expf; _ZGVeN16v_expf;
>      _ZGVbN4vv_powf; _ZGVcN8vv_powf; _ZGVdN8vv_powf; _ZGVeN16vv_powf;
>      _ZGVbN4vvv_sincosf; _ZGVcN8vvv_sincosf; _ZGVdN8vvv_sincosf; _ZGVeN16vvv_sincosf;
> +  }
> +  GLIBC_2.35 {
> +    _ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
> +    _ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
>    }
>  }
> diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
> index 312575f933..85a568ed29 100644
> --- a/sysdeps/x86_64/fpu/libm-test-ulps
> +++ b/sysdeps/x86_64/fpu/libm-test-ulps
> @@ -25,6 +25,26 @@ float: 1
>  float128: 1
>  ldouble: 2
>
> +Function: "acos_vlen16":
> +float: 1
> +
> +Function: "acos_vlen2":
> +double: 1
> +
> +Function: "acos_vlen4":
> +double: 1
> +float: 2
> +
> +Function: "acos_vlen4_avx2":
> +double: 1
> +
> +Function: "acos_vlen8":
> +double: 1
> +float: 2
> +
> +Function: "acos_vlen8_avx2":
> +float: 1
> +
>  Function: "acosh":
>  double: 2
>  float: 2
> diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
> new file mode 100644
> index 0000000000..3aed563dde
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
> @@ -0,0 +1,39 @@
> +/* Common definition for libmathvec ifunc selections optimized with
> +   AVX512.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <init-arch.h>
> +
> +#undef PASTER2
> +#define PASTER2(x,y)   x##_##y
> +
> +extern void REDIRECT_NAME (void);
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_wrapper) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (skx) attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features* cpu_features = __get_cpu_features ();
> +
> +  if (!CPU_FEATURES_ARCH_P (cpu_features, MathVec_Prefer_No_AVX512)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ))
> +    return OPTIMIZE (skx);
> +
> +  return OPTIMIZE (avx2_wrapper);
> +}
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
> new file mode 100644
> index 0000000000..25fb8d0cac
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
> @@ -0,0 +1,20 @@
> +/* SSE2 version of vectorized acos, vector length is 2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVbN2v_acos _ZGVbN2v_acos_sse2
> +#include "../svml_d_acos2_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
> new file mode 100644
> index 0000000000..5ba5d6fac2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized acos, vector length is 2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVbN2v_acos
> +#include "ifunc-mathvec-sse4_1.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVbN2v_acos, __GI__ZGVbN2v_acos, __redirect__ZGVbN2v_acos)
> +  __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
> new file mode 100644
> index 0000000000..0c898e70ab
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
> @@ -0,0 +1,399 @@
> +/* Function acos vectorized with SSE4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + */
> +
> +/* Offsets for data table __svml_dacos_data_internal
> + */
> +#define _SgnBit                        0
> +#define _OneHalf                       64
> +#define _SmallNorm                     128
> +#define _dRsqrtMsk                     192
> +#define _MOne                          256
> +#define _HalfMask                      320
> +#define _Two                           384
> +#define _sqrt_coeff                    448
> +#define _poly_coeff                    512
> +#define _PiL                           704
> +#define _PiH                           768
> +#define _Pi2L                          832
> +#define _Pi2H                          896
> +#define _Zero                          960
> +#define _SgnMask                       1024
> +#define _NanMask                       1088
> +#define _ep_coeff                      1152
> +#define _dInfs                         1280
> +#define _dOnes                         1344
> +#define _dZeros                        1408
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.sse4,"ax",@progbits
> +ENTRY(_ZGVbN2v_acos_sse4)
> +        pushq     %rbp
> +        cfi_def_cfa_offset(16)
> +        movq      %rsp, %rbp
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +        andq      $-64, %rsp
> +        subq      $256, %rsp
> +        movups    __svml_dacos_data_internal(%rip), %xmm3
> +        movups    _OneHalf+__svml_dacos_data_internal(%rip), %xmm6
> +
> +/* x = -|arg| */
> +        movaps    %xmm3, %xmm2
> +        orps      %xmm0, %xmm2
> +
> +/* Y = 0.5 + 0.5*x, i.e. 0.5 - 0.5*|arg| */
> +        movaps    %xmm6, %xmm4
> +        mulpd     %xmm2, %xmm4
> +        addpd     %xmm4, %xmm6
> +
> +/* S ~ 2*sqrt(Y) */
> +        cvtpd2ps  %xmm6, %xmm7
> +
> +/* NaN processed in special branch (so wind test passed) */
> +        movups    _MOne+__svml_dacos_data_internal(%rip), %xmm1
> +
> +/* x^2 */
> +        movaps    %xmm2, %xmm5
> +        cmpnlepd  %xmm2, %xmm1
> +        mulpd     %xmm2, %xmm5
> +        movmskpd  %xmm1, %edx
> +        movlhps   %xmm7, %xmm7
> +        andps     %xmm0, %xmm3
> +        movups    %xmm8, 144(%rsp)
> +        rsqrtps   %xmm7, %xmm1
> +        minpd     %xmm6, %xmm5
> +        cvtps2pd  %xmm1, %xmm8
> +        movaps    %xmm6, %xmm1
> +        movaps    %xmm5, %xmm4
> +        cmpltpd   _SmallNorm+__svml_dacos_data_internal(%rip), %xmm1
> +        cmpnltpd  %xmm6, %xmm4
> +        addpd     %xmm6, %xmm6
> +        andnps    %xmm8, %xmm1
> +        movups    %xmm14, 176(%rsp)
> +        movaps    %xmm1, %xmm14
> +        mulpd     %xmm1, %xmm14
> +        mulpd     %xmm6, %xmm1
> +        mulpd     %xmm14, %xmm6
> +        subpd     _Two+__svml_dacos_data_internal(%rip), %xmm6
> +        movups    %xmm15, 160(%rsp)
> +        movaps    %xmm6, %xmm8
> +        movups    _sqrt_coeff+__svml_dacos_data_internal(%rip), %xmm15
> +        mulpd     %xmm6, %xmm15
> +        mulpd     %xmm1, %xmm8
> +        addpd     _sqrt_coeff+16+__svml_dacos_data_internal(%rip), %xmm15
> +        mulpd     %xmm6, %xmm15
> +        addpd     _sqrt_coeff+32+__svml_dacos_data_internal(%rip), %xmm15
> +        mulpd     %xmm15, %xmm6
> +        addpd     _sqrt_coeff+48+__svml_dacos_data_internal(%rip), %xmm6
> +        mulpd     %xmm6, %xmm8
> +
> +/* polynomial */
> +        movups    _poly_coeff+__svml_dacos_data_internal(%rip), %xmm6
> +        movaps    %xmm5, %xmm15
> +        mulpd     %xmm5, %xmm6
> +        mulpd     %xmm5, %xmm15
> +        addpd     _poly_coeff+16+__svml_dacos_data_internal(%rip), %xmm6
> +        subpd     %xmm8, %xmm1
> +        mulpd     %xmm15, %xmm6
> +        movups    _poly_coeff+32+__svml_dacos_data_internal(%rip), %xmm14
> +        andps     %xmm4, %xmm1
> +        mulpd     %xmm5, %xmm14
> +        movups    _poly_coeff+64+__svml_dacos_data_internal(%rip), %xmm7
> +        mulpd     %xmm5, %xmm7
> +        addpd     _poly_coeff+48+__svml_dacos_data_internal(%rip), %xmm14
> +        addpd     _poly_coeff+80+__svml_dacos_data_internal(%rip), %xmm7
> +        addpd     %xmm6, %xmm14
> +        mulpd     %xmm15, %xmm7
> +        movups    _poly_coeff+96+__svml_dacos_data_internal(%rip), %xmm8
> +        movaps    %xmm15, %xmm6
> +        mulpd     %xmm5, %xmm8
> +        mulpd     %xmm15, %xmm6
> +        addpd     _poly_coeff+112+__svml_dacos_data_internal(%rip), %xmm8
> +        mulpd     %xmm6, %xmm14
> +        addpd     %xmm7, %xmm8
> +        movups    _poly_coeff+128+__svml_dacos_data_internal(%rip), %xmm7
> +        mulpd     %xmm5, %xmm7
> +        addpd     %xmm14, %xmm8
> +        addpd     _poly_coeff+144+__svml_dacos_data_internal(%rip), %xmm7
> +        mulpd     %xmm15, %xmm8
> +        movups    _poly_coeff+160+__svml_dacos_data_internal(%rip), %xmm6
> +
> +/* X<X^2 iff X<0 */
> +        movaps    %xmm0, %xmm14
> +        addpd     %xmm8, %xmm7
> +        cmpltpd   %xmm5, %xmm14
> +        mulpd     %xmm5, %xmm6
> +        mulpd     %xmm7, %xmm15
> +        addpd     _poly_coeff+176+__svml_dacos_data_internal(%rip), %xmm6
> +        addpd     %xmm15, %xmm6
> +        mulpd     %xmm5, %xmm6
> +        movaps    %xmm4, %xmm7
> +        movaps    %xmm4, %xmm5
> +        andnps    %xmm2, %xmm7
> +        orps      %xmm1, %xmm7
> +        pxor      %xmm3, %xmm7
> +        mulpd     %xmm7, %xmm6
> +        movups    _PiH+__svml_dacos_data_internal(%rip), %xmm8
> +        andps     %xmm4, %xmm8
> +        andnps    _Pi2H+__svml_dacos_data_internal(%rip), %xmm5
> +        andps     %xmm14, %xmm8
> +        addpd     %xmm5, %xmm8
> +        addpd     %xmm6, %xmm7
> +        addpd     %xmm7, %xmm8
> +        testl     %edx, %edx
> +
> +/* Go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> +        movups    176(%rsp), %xmm14
> +        cfi_restore(31)
> +        movaps    %xmm8, %xmm0
> +        movups    144(%rsp), %xmm8
> +        cfi_restore(25)
> +        movups    160(%rsp), %xmm15
> +        cfi_restore(32)
> +        movq      %rbp, %rsp
> +        popq      %rbp
> +        cfi_def_cfa(7, 8)
> +        cfi_restore(6)
> +        ret
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        movups    %xmm0, 128(%rsp)
> +        movups    %xmm8, 192(%rsp)
> +        xorl      %eax, %eax
> +        movups    %xmm9, 64(%rsp)
> +        movups    %xmm10, 48(%rsp)
> +        movups    %xmm11, 32(%rsp)
> +        movups    %xmm12, 16(%rsp)
> +        movups    %xmm13, (%rsp)
> +        movq      %rsi, 88(%rsp)
> +        movq      %rdi, 80(%rsp)
> +        movq      %r12, 112(%rsp)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -144; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x70, 0xff, 0xff, 0xff, 0x22
> +        movl      %eax, %r12d
> +        movq      %r13, 104(%rsp)
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -152; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x68, 0xff, 0xff, 0xff, 0x22
> +        movl      %edx, %r13d
> +        movq      %r14, 96(%rsp)
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -160; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x60, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d
> +
> +/* Call scalar math function */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $2, %r12d
> +
> +/* Check bits in range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        movups    64(%rsp), %xmm9
> +        cfi_restore(26)
> +        movups    48(%rsp), %xmm10
> +        cfi_restore(27)
> +        movups    32(%rsp), %xmm11
> +        cfi_restore(28)
> +        movups    16(%rsp), %xmm12
> +        cfi_restore(29)
> +        movups    (%rsp), %xmm13
> +        cfi_restore(30)
> +        movq      88(%rsp), %rsi
> +        cfi_restore(4)
> +        movq      80(%rsp), %rdi
> +        cfi_restore(5)
> +        movq      112(%rsp), %r12
> +        cfi_restore(12)
> +        movq      104(%rsp), %r13
> +        cfi_restore(13)
> +        movq      96(%rsp), %r14
> +        cfi_restore(14)
> +        movups    192(%rsp), %xmm8
> +
> +/* Go to exit */
> +        jmp       L(EXIT)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -144; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x70, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -152; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x68, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -160; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x60, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movsd     128(%rsp,%r14,8), %xmm0
> +        call      acos@PLT
> +        movsd     %xmm0, 192(%rsp,%r14,8)
> +
> +/* Process special inputs in loop */
> +        jmp       L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVbN2v_acos_sse4)
> +
> +        .section .rodata, "a"
> +        .align 64
> +
> +#ifdef __svml_dacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +    __declspec(align(64)) VUINT32 SgnBit[2][2];
> +    __declspec(align(64)) VUINT32 OneHalf[2][2];
> +    __declspec(align(64)) VUINT32 SmallNorm[2][2];
> +    __declspec(align(64)) VUINT32 dRsqrtMsk[2][2];
> +    __declspec(align(64)) VUINT32 MOne[2][2];
> +    __declspec(align(64)) VUINT32 HalfMask[2][2];
> +    __declspec(align(64)) VUINT32 Two[2][2];
> +    __declspec(align(64)) VUINT32 sqrt_coeff[4][2][2];
> +    __declspec(align(64)) VUINT32 poly_coeff[12][2][2];
> +    __declspec(align(64)) VUINT32 PiL[2][2];
> +    __declspec(align(64)) VUINT32 PiH[2][2];
> +    __declspec(align(64)) VUINT32 Pi2L[2][2];
> +    __declspec(align(64)) VUINT32 Pi2H[2][2];
> +    __declspec(align(64)) VUINT32 Zero[2][2];
> +    __declspec(align(64)) VUINT32 SgnMask[2][2];
> +    __declspec(align(64)) VUINT32 NanMask[2][2];
> +    __declspec(align(64)) VUINT32 ep_coeff[6][2][2];
> +    /* scalar part follows */
> +    __declspec(align(64)) VUINT32 dInfs[2][2];
> +    __declspec(align(64)) VUINT32 dOnes[2][2];
> +    __declspec(align(64)) VUINT32 dZeros[2][2];
> +} __svml_dacos_data_internal_t;
> +#endif
> +__svml_dacos_data_internal:
> +        /*== SgnBit ==*/
> +        .quad 0x8000000000000000, 0x8000000000000000
> +        /*== OneHalf ==*/
> +        .align 64
> +        .quad 0x3fe0000000000000, 0x3fe0000000000000
> +        /*== SmallNorm ==*/
> +        .align 64
> +        .quad 0x3000000000000000, 0x3000000000000000
> +        /*== dRsqrtMsk ==*/
> +        .align 64
> +        .quad 0xffffff0000000000, 0xffffff0000000000
> +        /*== MOne ==*/
> +        .align 64
> +        .quad 0xbff0000000000000, 0xbff0000000000000
> +        /*== HalfMask ==*/
> +        .align 64
> +        .quad 0xfffffffffc000000, 0xfffffffffc000000
> +        /*== Two ==*/
> +        .align 64
> +        .quad 0x4000000000000000, 0x4000000000000000
> +        /*== sqrt_coeff[4] ==*/
> +        .align 64
> +        .quad 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
> +        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
> +        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
> +        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
> +        /*== poly_coeff[12] ==*/
> +        .align 64
> +        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
> +        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
> +        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
> +        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
> +        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
> +        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
> +        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
> +        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
> +        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
> +        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
> +        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
> +        .quad 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
> +        /*== PiL ==*/
> +        .align 64
> +        .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07
> +        /*== PiH ==*/
> +        .align 64
> +        .quad 0x400921fb54442d18, 0x400921fb54442d18
> +        /*== Pi2L ==*/
> +        .align 64
> +        .quad 0x3c91a62633145c07, 0x3c91a62633145c07
> +        /*== Pi2H ==*/
> +        .align 64
> +        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
> +        /*== Zero ==*/
> +        .align 64
> +        .quad 0x0000000000000000, 0x0000000000000000
> +        /*== sgn mask ==*/
> +        .align 64
> +        .quad 0x8000000000000000, 0x8000000000000000
> +        /*== NaN mask ==*/
> +        .align 64
> +        .quad 0xfffc000000000000, 0xfffc000000000000
> +        /*== ep_coeff[6] ==*/
> +        .align 64
> +        .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
> +        .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
> +        .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
> +        .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
> +        .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
> +        .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
> +        /* scalar part follows */
> +        /*== dInfs = DP infinity, +/- ==*/
> +        .align 64
> +        .quad 0x7ff0000000000000, 0xfff0000000000000
> +        /*== dOnes = DP one, +/- ==*/
> +        .align 64
> +        .quad 0x3ff0000000000000, 0xbff0000000000000
> +        /*== dZeros = DP zero +/- ==*/
> +        .align 64
> +        .quad 0x0000000000000000, 0x8000000000000000
> +        .align 64
> +        .type  __svml_dacos_data_internal,@object
> +        .size  __svml_dacos_data_internal,1472
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
> new file mode 100644
> index 0000000000..750f71c81c
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
> @@ -0,0 +1,20 @@
> +/* SSE version of vectorized acos, vector length is 4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVdN4v_acos _ZGVdN4v_acos_sse_wrapper
> +#include "../svml_d_acos4_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
> new file mode 100644
> index 0000000000..6453e7ebe2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized acos, vector length is 4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVdN4v_acos
> +#include "ifunc-mathvec-avx2.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVdN4v_acos, __GI__ZGVdN4v_acos, __redirect__ZGVdN4v_acos)
> +  __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
> new file mode 100644
> index 0000000000..684d501a3d
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
> @@ -0,0 +1,368 @@
> +/* Function acos vectorized with AVX2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + */
> +
> +/* Offsets for data table __svml_dacos_data_internal
> + */
> +#define _SgnBit                        0
> +#define _OneHalf                       64
> +#define _SmallNorm                     128
> +#define _dRsqrtMsk                     192
> +#define _MOne                          256
> +#define _HalfMask                      320
> +#define _Two                           384
> +#define _sqrt_coeff                    448
> +#define _poly_coeff                    576
> +#define _PiL                           960
> +#define _PiH                           1024
> +#define _Pi2L                          1088
> +#define _Pi2H                          1152
> +#define _Zero                          1216
> +#define _SgnMask                       1280
> +#define _NanMask                       1344
> +#define _ep_coeff                      1408
> +#define _dInfs                         1600
> +#define _dOnes                         1664
> +#define _dZeros                        1728
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.avx2,"ax",@progbits
> +ENTRY(_ZGVdN4v_acos_avx2)       /* in: 4 doubles in %ymm0; out: acos of each lane in %ymm0 */
> +        pushq     %rbp
> +        cfi_def_cfa_offset(16)
> +        movq      %rsp, %rbp
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +        andq      $-64, %rsp    /* 64-byte align the frame */
> +        subq      $384, %rsp    /* room for saved registers and spilled vectors */
> +        vmovupd   __svml_dacos_data_internal(%rip), %ymm6      /* table offset 0 = _SgnBit */
> +        vmovupd   _OneHalf+__svml_dacos_data_internal(%rip), %ymm7
> +        vmovups   %ymm8, 96(%rsp)
> +        vmovups   %ymm10, 192(%rsp)
> +        vmovups   %ymm9, 160(%rsp)
> +        vmovups   %ymm11, 224(%rsp)
> +        vmovups   %ymm12, 256(%rsp)
> +        vmovups   %ymm13, 288(%rsp)
> +        vmovups   %ymm15, 352(%rsp)
> +        vmovups   %ymm14, 320(%rsp)
> +        vmovapd   %ymm0, %ymm5  /* keep the original argument */
> +
> +/* x = -|arg| (OR the sign bit into the argument) */
> +        vorpd     %ymm5, %ymm6, %ymm4
> +
> +/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg| */
> +        vfmadd231pd %ymm4, %ymm7, %ymm7
> +
> +/* x^2 */
> +        vmulpd    %ymm4, %ymm4, %ymm1
> +
> +/* Range mask: lanes with !(x >= -1), i.e. |arg| > 1 or NaN, are processed in the special branch */
> +        vcmpnge_uqpd _MOne+__svml_dacos_data_internal(%rip), %ymm4, %ymm0
> +
> +/* S ~ 2*sqrt(Y) */
> +        vcmplt_oqpd _SmallNorm+__svml_dacos_data_internal(%rip), %ymm7, %ymm9
> +        vaddpd    %ymm7, %ymm7, %ymm13
> +        vminpd    %ymm7, %ymm1, %ymm2
> +        vandpd    %ymm5, %ymm6, %ymm3   /* sign bit of the argument */
> +        vcvtpd2ps %ymm7, %xmm6
> +        vcmpnlt_uqpd %ymm7, %ymm2, %ymm1
> +        vmovupd   _poly_coeff+192+__svml_dacos_data_internal(%rip), %ymm7
> +        vrsqrtps  %xmm6, %xmm8
> +        vmovupd   _poly_coeff+64+__svml_dacos_data_internal(%rip), %ymm6
> +        vfmadd213pd _poly_coeff+224+__svml_dacos_data_internal(%rip), %ymm2, %ymm7
> +        vcvtps2pd %xmm8, %ymm10
> +        vfmadd213pd _poly_coeff+96+__svml_dacos_data_internal(%rip), %ymm2, %ymm6
> +        vandnpd   %ymm10, %ymm9, %ymm11
> +        vmulpd    %ymm11, %ymm11, %ymm12
> +        vmulpd    %ymm13, %ymm11, %ymm15
> +        vmovupd   _poly_coeff+128+__svml_dacos_data_internal(%rip), %ymm9
> +        vmulpd    %ymm2, %ymm2, %ymm11
> +        vmovupd   _poly_coeff+256+__svml_dacos_data_internal(%rip), %ymm10
> +        vfmsub213pd _Two+__svml_dacos_data_internal(%rip), %ymm12, %ymm13
> +        vmovupd   _poly_coeff+320+__svml_dacos_data_internal(%rip), %ymm12
> +        vfmadd213pd _poly_coeff+160+__svml_dacos_data_internal(%rip), %ymm2, %ymm9
> +        vmulpd    %ymm11, %ymm11, %ymm8
> +        vfmadd213pd _poly_coeff+288+__svml_dacos_data_internal(%rip), %ymm2, %ymm10
> +        vmulpd    %ymm13, %ymm15, %ymm14
> +        vfmadd213pd _poly_coeff+352+__svml_dacos_data_internal(%rip), %ymm2, %ymm12
> +        vfmadd213pd %ymm7, %ymm11, %ymm9
> +        vmovmskpd %ymm0, %edx   /* range mask as one bit per lane */
> +        vmovupd   _sqrt_coeff+__svml_dacos_data_internal(%rip), %ymm0
> +        vfmadd213pd _sqrt_coeff+32+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
> +        vfmadd213pd _sqrt_coeff+64+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
> +        vfmadd213pd _sqrt_coeff+96+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
> +
> +/* polynomial */
> +        vmovupd   _poly_coeff+__svml_dacos_data_internal(%rip), %ymm13
> +        vfnmadd213pd %ymm15, %ymm14, %ymm0
> +        vfmadd213pd _poly_coeff+32+__svml_dacos_data_internal(%rip), %ymm2, %ymm13
> +        vblendvpd %ymm1, %ymm0, %ymm4, %ymm4
> +        vfmadd213pd %ymm6, %ymm11, %ymm13
> +
> +/* X<X^2 iff X<0 */
> +        vcmplt_oqpd %ymm2, %ymm5, %ymm6
> +        vfmadd213pd %ymm9, %ymm8, %ymm13
> +        vfmadd213pd %ymm10, %ymm11, %ymm13
> +        vfmadd213pd %ymm12, %ymm11, %ymm13
> +        vmulpd    %ymm13, %ymm2, %ymm14
> +        vxorpd    %ymm3, %ymm4, %ymm3
> +        vandpd    _PiH+__svml_dacos_data_internal(%rip), %ymm1, %ymm2
> +        vfmadd213pd %ymm3, %ymm3, %ymm14
> +        vandpd    %ymm6, %ymm2, %ymm2
> +        vandnpd   _Pi2H+__svml_dacos_data_internal(%rip), %ymm1, %ymm7
> +        vaddpd    %ymm7, %ymm2, %ymm8
> +        vaddpd    %ymm14, %ymm8, %ymm0
> +        testl     %edx, %edx
> +
> +/* Any range-mask bit set: go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Common exit: reload %ymm8-%ymm15 from their frame slots, drop the
> + * aligned frame through %rbp and return; the result is in %ymm0.
> + */
> +
> +L(EXIT):
> +        vmovups   96(%rsp), %ymm8
> +        cfi_restore(91)
> +        vmovups   160(%rsp), %ymm9
> +        cfi_restore(92)
> +        vmovups   192(%rsp), %ymm10
> +        cfi_restore(93)
> +        vmovups   224(%rsp), %ymm11
> +        cfi_restore(94)
> +        vmovups   256(%rsp), %ymm12
> +        cfi_restore(95)
> +        vmovups   288(%rsp), %ymm13
> +        cfi_restore(96)
> +        vmovups   320(%rsp), %ymm14
> +        cfi_restore(97)
> +        vmovups   352(%rsp), %ymm15
> +        cfi_restore(98)
> +        movq      %rbp, %rsp
> +        popq      %rbp
> +        cfi_def_cfa(7, 8)
> +        cfi_restore(6)
> +        ret
> +        cfi_def_cfa(6, 16)      /* unwind state for the code that follows the ret */
> +        cfi_offset(6, -16)
> +
> +/* Slow path, entered only via the jne above (range mask in %edx is
> + * nonzero): spill the input and the fast-path result for per-lane fix-up.
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        vmovupd   %ymm5, 64(%rsp)       /* original argument vector */
> +        vmovupd   %ymm0, 128(%rsp)      /* fast-path result vector */
> +
> +/* No "je L(EXIT)" here: ZF is still clear from the jne that led to this
> +   label (vmovupd leaves EFLAGS untouched), so such a jump can never fire.  */
> +        xorl      %eax, %eax
> +        vzeroupper
> +        movq      %rsi, 8(%rsp)
> +        movq      %rdi, (%rsp)
> +        movq      %r12, 32(%rsp)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
> +        movl      %eax, %r12d   /* lane index starts at 0 */
> +        movq      %r13, 24(%rsp)
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
> +        movl      %edx, %r13d   /* range mask, one bit per lane */
> +        movq      %r14, 16(%rsp)
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
> +
> +/* Test bit %r12d (the current lane index) of the range mask
> + * held in %r13d; btl copies the selected bit into CF.
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d
> +
> +/* CF set: this lane was flagged, call scalar math function for it */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs processing loop: advance the lane index in %r12d
> + * and keep going while it is below 4 (one iteration per double lane).
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $4, %r12d
> +
> +/* Lanes left: check the next bit in the range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        movq      8(%rsp), %rsi
> +        cfi_restore(4)
> +        movq      (%rsp), %rdi
> +        cfi_restore(5)
> +        movq      32(%rsp), %r12
> +        cfi_restore(12)
> +        movq      24(%rsp), %r13
> +        cfi_restore(13)
> +        movq      16(%rsp), %r14
> +        cfi_restore(14)
> +        vmovupd   128(%rsp), %ymm0
> +
> +/* All lanes done, %ymm0 holds the patched result vector: go to exit */
> +        jmp       L(EXIT)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call to process one special input: lane %r12d
> + * of the argument spilled at 64(%rsp) goes in, the result lands at 128(%rsp).
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movsd     64(%rsp,%r14,8), %xmm0
> +        call      acos@PLT
> +        movsd     %xmm0, 128(%rsp,%r14,8)
> +
> +/* Back to the loop to process the remaining special inputs */
> +        jmp       L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVdN4v_acos_avx2)
> +
> +        .section .rodata, "a"
> +        .align 64
> +
> +#ifdef __svml_dacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +    __declspec(align(64)) VUINT32 SgnBit[4][2];
> +    __declspec(align(64)) VUINT32 OneHalf[4][2];
> +    __declspec(align(64)) VUINT32 SmallNorm[4][2];
> +    __declspec(align(64)) VUINT32 dRsqrtMsk[4][2];
> +    __declspec(align(64)) VUINT32 MOne[4][2];
> +    __declspec(align(64)) VUINT32 HalfMask[4][2];
> +    __declspec(align(64)) VUINT32 Two[4][2];
> +    __declspec(align(64)) VUINT32 sqrt_coeff[4][4][2];
> +    __declspec(align(64)) VUINT32 poly_coeff[12][4][2];
> +    __declspec(align(64)) VUINT32 PiL[4][2];
> +    __declspec(align(64)) VUINT32 PiH[4][2];
> +    __declspec(align(64)) VUINT32 Pi2L[4][2];
> +    __declspec(align(64)) VUINT32 Pi2H[4][2];
> +    __declspec(align(64)) VUINT32 Zero[4][2];
> +    __declspec(align(64)) VUINT32 SgnMask[4][2];
> +    __declspec(align(64)) VUINT32 NanMask[4][2];
> +    __declspec(align(64)) VUINT32 ep_coeff[6][4][2];
> +    /* scalar part follows */
> +    __declspec(align(64)) VUINT32 dInfs[2][2];
> +    __declspec(align(64)) VUINT32 dOnes[2][2];
> +    __declspec(align(64)) VUINT32 dZeros[2][2];
> +} __svml_dacos_data_internal_t;
> +#endif
> +__svml_dacos_data_internal:
> +        /*== SgnBit ==*/
> +        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
> +        /*== OneHalf ==*/
> +        .align 64
> +        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
> +        /*== SmallNorm ==*/
> +        .align 64
> +        .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
> +        /*== dRsqrtMsk ==*/
> +        .align 64
> +        .quad 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000
> +        /*== MOne ==*/
> +        .align 64
> +        .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
> +        /*== HalfMask ==*/
> +        .align 64
> +        .quad 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000
> +        /*== Two ==*/
> +        .align 64
> +        .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
> +        /*== sqrt_coeff[4] ==*/
> +        .align 64
> +        .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
> +        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
> +        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
> +        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
> +        /*== poly_coeff[12] ==*/
> +        .align 64
> +        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
> +        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
> +        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
> +        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
> +        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
> +        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
> +        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
> +        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
> +        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
> +        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
> +        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
> +        .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
> +        /*== PiL ==*/
> +        .align 64
> +        .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07
> +        /*== PiH ==*/
> +        .align 64
> +        .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
> +        /*== Pi2L ==*/
> +        .align 64
> +        .quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
> +        /*== Pi2H ==*/
> +        .align 64
> +        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
> +        /*== Zero ==*/
> +        .align 64
> +        .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
> +        /*== sgn mask ==*/
> +        .align 64
> +        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
> +        /*== NaN mask ==*/
> +        .align 64
> +        .quad 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000
> +        /*== ep_coeff[6] ==*/
> +        .align 64
> +        .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
> +        .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
> +        .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
> +        .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
> +        .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
> +        .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
> +        /* scalar part follows */
> +        /*== dInfs = DP infinity, +/- ==*/
> +        .align 64
> +        .quad 0x7ff0000000000000, 0xfff0000000000000
> +        /*== dOnes = DP one, +/- ==*/
> +        .align 64
> +        .quad 0x3ff0000000000000, 0xbff0000000000000
> +        /*== dZeros = DP zero +/- ==*/
> +        .align 64
> +        .quad 0x0000000000000000, 0x8000000000000000
> +        .align 64
> +        .type  __svml_dacos_data_internal,@object
> +        .size  __svml_dacos_data_internal,1792
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
> new file mode 100644
> index 0000000000..4d64fd1c00
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
> @@ -0,0 +1,20 @@
> +/* AVX2 version of vectorized acos, vector length is 8.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVeN8v_acos _ZGVeN8v_acos_avx2_wrapper
> +#include "../svml_d_acos8_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
> new file mode 100644
> index 0000000000..1e7d1865fb
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized acos, vector length is 8.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVeN8v_acos
> +#include "ifunc-mathvec-avx512-skx.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVeN8v_acos, __GI__ZGVeN8v_acos, __redirect__ZGVeN8v_acos)
> +  __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
> new file mode 100644
> index 0000000000..52832893ec
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
> @@ -0,0 +1,386 @@
> +/* Function acos vectorized with AVX-512.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + */
> +
> +/* Offsets for data table __svml_dacos_data_internal
> + */
> +#define _SgnBit                        0
> +#define _OneHalf                       64
> +#define _SmallNorm                     128
> +#define _dRsqrtMsk                     192
> +#define _MOne                          256
> +#define _HalfMask                      320
> +#define _Two                           384
> +#define _sqrt_coeff_1                  448
> +#define _sqrt_coeff_2                  512
> +#define _sqrt_coeff_3                  576
> +#define _sqrt_coeff_4                  640
> +#define _poly_coeff_1                  704
> +#define _poly_coeff_2                  768
> +#define _poly_coeff_3                  832
> +#define _poly_coeff_4                  896
> +#define _poly_coeff_5                  960
> +#define _poly_coeff_6                  1024
> +#define _poly_coeff_7                  1088
> +#define _poly_coeff_8                  1152
> +#define _poly_coeff_9                  1216
> +#define _poly_coeff_10                 1280
> +#define _poly_coeff_11                 1344
> +#define _poly_coeff_12                 1408
> +#define _PiL                           1472
> +#define _PiH                           1536
> +#define _Pi2L                          1600
> +#define _Pi2H                          1664
> +#define _Zero                          1728
> +#define _SgnMask                       1792
> +#define _NanMask                       1856
> +#define _ep_coeff_1                    1920
> +#define _ep_coeff_2                    1984
> +#define _ep_coeff_3                    2048
> +#define _ep_coeff_4                    2112
> +#define _ep_coeff_5                    2176
> +#define _ep_coeff_6                    2240
> +#define _dInfs                         2304
> +#define _dOnes                         2368
> +#define _dZeros                        2432
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.evex512,"ax",@progbits
> +ENTRY(_ZGVeN8v_acos_skx)
> +        pushq     %rbp
> +        cfi_def_cfa_offset(16)
> +        movq      %rsp, %rbp
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +        andq      $-64, %rsp
> +        subq      $256, %rsp
> +        vmovups   __svml_dacos_data_internal(%rip), %zmm7
> +        vmovups   _OneHalf+__svml_dacos_data_internal(%rip), %zmm8
> +
> +/* S ~ 2*sqrt(Y) */
> +        vmovups   _SmallNorm+__svml_dacos_data_internal(%rip), %zmm11
> +        vmovups   _Two+__svml_dacos_data_internal(%rip), %zmm14
> +        vmovups   _sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15
> +        vmovups   _sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2
> +        vmovups   _sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1
> +        vmovups   _MOne+__svml_dacos_data_internal(%rip), %zmm10
> +        vmovaps   %zmm0, %zmm6
> +
> +/* x = -|arg| */
> +        vorpd     %zmm6, %zmm7, %zmm5
> +        vandpd    %zmm6, %zmm7, %zmm4
> +
> +/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg| */
> +        vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8
> +
> +/* x^2 */
> +        vmulpd    {rn-sae}, %zmm5, %zmm5, %zmm9
> +        vrsqrt14pd %zmm8, %zmm12
> +        vcmppd    $17, {sae}, %zmm11, %zmm8, %k2
> +        vcmppd    $17, {sae}, %zmm10, %zmm5, %k0
> +        vmovups   _poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10
> +        vmovups   _poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11
> +        vminpd    {sae}, %zmm8, %zmm9, %zmm3
> +        vmovups   _poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9
> +        vxorpd    %zmm12, %zmm12, %zmm12{%k2}
> +        vaddpd    {rn-sae}, %zmm8, %zmm8, %zmm0
> +        vcmppd    $21, {sae}, %zmm8, %zmm3, %k1
> +
> +/* X<X^2 iff X<0 */
> +        vcmppd    $17, {sae}, %zmm3, %zmm6, %k3
> +        vmulpd    {rn-sae}, %zmm12, %zmm12, %zmm13
> +        vmulpd    {rn-sae}, %zmm12, %zmm0, %zmm7
> +        vmovups   _poly_coeff_4+__svml_dacos_data_internal(%rip), %zmm12
> +
> +/* polynomial */
> +        vmovups   _poly_coeff_1+__svml_dacos_data_internal(%rip), %zmm8
> +        vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0
> +        vmovups   _sqrt_coeff_4+__svml_dacos_data_internal(%rip), %zmm13
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12
> +        vmovups   _poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm9
> +        vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2
> +        vmovups   _poly_coeff_9+__svml_dacos_data_internal(%rip), %zmm15
> +        vmulpd    {rn-sae}, %zmm0, %zmm7, %zmm14
> +        vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2
> +        vmovups   _poly_coeff_2+__svml_dacos_data_internal(%rip), %zmm1
> +        kmovw     %k1, %eax
> +        kmovw     %k3, %ecx
> +        kmovw     %k0, %edx
> +        vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1
> +        vmovups   _poly_coeff_10+__svml_dacos_data_internal(%rip), %zmm8
> +        vmulpd    {rn-sae}, %zmm3, %zmm3, %zmm0
> +        vfnmadd213pd {rn-sae}, %zmm7, %zmm14, %zmm2
> +        vmovups   _poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm7
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8
> +        vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1
> +        vblendmpd %zmm2, %zmm5, %zmm2{%k1}
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7
> +        vmovups   _poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm10
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
> +        andl      %eax, %ecx
> +        vmovups   _poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
> +        kmovw     %ecx, %k2
> +        vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11
> +        vmulpd    {rn-sae}, %zmm0, %zmm0, %zmm10
> +        vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1
> +        vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1
> +        vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
> +        vmovups   _Pi2H+__svml_dacos_data_internal(%rip), %zmm0
> +        vmulpd    {rn-sae}, %zmm3, %zmm1, %zmm1
> +        vxorpd    %zmm4, %zmm2, %zmm3
> +        vxorpd    %zmm0, %zmm0, %zmm0{%k1}
> +        vfmadd213pd {rn-sae}, %zmm3, %zmm3, %zmm1
> +        vorpd     _PiH+__svml_dacos_data_internal(%rip), %zmm0, %zmm0{%k2}
> +        vaddpd    {rn-sae}, %zmm1, %zmm0, %zmm0
> +        testl     %edx, %edx
> +
> +/* Go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> +        movq      %rbp, %rsp
> +        popq      %rbp
> +        cfi_def_cfa(7, 8)
> +        cfi_restore(6)
> +        ret
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        vmovups   %zmm6, 128(%rsp)
> +        vmovups   %zmm0, 192(%rsp)
> +
> +/* Go to exit */
> +        je        L(EXIT)
> +        xorl      %eax, %eax
> +        vzeroupper
> +        kmovw     %k4, 24(%rsp)
> +        kmovw     %k5, 16(%rsp)
> +        kmovw     %k6, 8(%rsp)
> +        kmovw     %k7, (%rsp)
> +        movq      %rsi, 40(%rsp)
> +        movq      %rdi, 32(%rsp)
> +        movq      %r12, 64(%rsp)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +        movl      %eax, %r12d
> +        movq      %r13, 56(%rsp)
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
> +        movl      %edx, %r13d
> +        movq      %r14, 48(%rsp)
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d
> +
> +/* Call scalar math function */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $8, %r12d
> +
> +/* Check bits in range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        kmovw     24(%rsp), %k4
> +        cfi_restore(122)
> +        kmovw     16(%rsp), %k5
> +        cfi_restore(123)
> +        kmovw     8(%rsp), %k6
> +        cfi_restore(124)
> +        kmovw     (%rsp), %k7
> +        cfi_restore(125)
> +        vmovups   192(%rsp), %zmm0
> +        movq      40(%rsp), %rsi
> +        cfi_restore(4)
> +        movq      32(%rsp), %rdi
> +        cfi_restore(5)
> +        movq      64(%rsp), %r12
> +        cfi_restore(12)
> +        movq      56(%rsp), %r13
> +        cfi_restore(13)
> +        movq      48(%rsp), %r14
> +        cfi_restore(14)
> +
> +/* Go to exit */
> +        jmp       L(EXIT)

Is there a control-flow path from L(SPECIAL_VALUES_BRANCH) that doesn't go to exit?
If not, I think a lot of the save/restores are pretty unnecessary.

> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movsd     128(%rsp,%r14,8), %xmm0
> +        call      acos@PLT
> +        movsd     %xmm0, 192(%rsp,%r14,8)
> +
> +/* Process special inputs in loop */
> +        jmp       L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVeN8v_acos_skx)
> +
> +        .section .rodata, "a"
> +        .align 64
> +
> +#ifdef __svml_dacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +    __declspec(align(64)) VUINT32 SgnBit[8][2];
> +    __declspec(align(64)) VUINT32 OneHalf[8][2];
> +    __declspec(align(64)) VUINT32 SmallNorm[8][2];
> +    __declspec(align(64)) VUINT32 dRsqrtMsk[8][2];
> +    __declspec(align(64)) VUINT32 MOne[8][2];
> +    __declspec(align(64)) VUINT32 HalfMask[8][2];
> +    __declspec(align(64)) VUINT32 Two[8][2];
> +    __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2];
> +    __declspec(align(64)) VUINT32 poly_coeff[12][8][2];
> +    __declspec(align(64)) VUINT32 PiL[8][2];
> +    __declspec(align(64)) VUINT32 PiH[8][2];
> +    __declspec(align(64)) VUINT32 Pi2L[8][2];
> +    __declspec(align(64)) VUINT32 Pi2H[8][2];
> +    __declspec(align(64)) VUINT32 Zero[8][2];
> +    __declspec(align(64)) VUINT32 SgnMask[8][2];
> +    __declspec(align(64)) VUINT32 NanMask[8][2];
> +    __declspec(align(64)) VUINT32 ep_coeff[6][8][2];
> +    /* scalar part follows */
> +    __declspec(align(64)) VUINT32 dInfs[2][2];
> +    __declspec(align(64)) VUINT32 dOnes[2][2];
> +    __declspec(align(64)) VUINT32 dZeros[2][2];
> +} __svml_dacos_data_internal_t;
> +#endif
> +__svml_dacos_data_internal:
> +        /*== SgnBit ==*/
> +        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
> +        /*== OneHalf ==*/
> +        .align 64
> +        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
> +        /*== SmallNorm ==*/
> +        .align 64
> +        .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
> +        /*== dRsqrtMsk ==*/
> +        .align 64
> +        .quad 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000
> +        /*== MOne ==*/
> +        .align 64
> +        .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
> +        /*== HalfMask ==*/
> +        .align 64
> +        .quad 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000
> +        /*== Two ==*/
> +        .align 64
> +        .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
> +        /*== sqrt_coeff[4] ==*/
> +        .align 64
> +        .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
> +        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
> +        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
> +        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
> +        /*== poly_coeff[12] ==*/
> +        .align 64
> +        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
> +        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
> +        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
> +        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
> +        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
> +        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
> +        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
> +        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
> +        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
> +        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
> +        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
> +        .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
> +        /*== PiL ==*/
> +        .align 64
> +        .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07
> +        /*== PiH ==*/
> +        .align 64
> +        .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
> +        /*== Pi2L ==*/
> +        .align 64
> +        .quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
> +        /*== Pi2H ==*/
> +        .align 64
> +        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
> +        /*== Zero ==*/
> +        .align 64
> +        .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
> +        /*== sgn mask ==*/
> +        .align 64
> +        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
> +        /*== NaN mask ==*/
> +        .align 64
> +        .quad 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000
> +        /*== ep_coeff[6] ==*/
> +        .align 64
> +        .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
> +        .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
> +        .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
> +        .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
> +        .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
> +        .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
> +        /* scalar part follows */
> +        /*== dInfs = DP infinity, +/- ==*/
> +        .align 64
> +        .quad 0x7ff0000000000000, 0xfff0000000000000
> +        /*== dOnes = DP one, +/- ==*/
> +        .align 64
> +        .quad 0x3ff0000000000000, 0xbff0000000000000
> +        /*== dZeros = DP zero +/- ==*/
> +        .align 64
> +        .quad 0x0000000000000000, 0x8000000000000000
> +        .align 64
> +        .type  __svml_dacos_data_internal,@object
> +        .size  __svml_dacos_data_internal,2496
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
> new file mode 100644
> index 0000000000..1ff0cfc8d5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
> @@ -0,0 +1,20 @@
> +/* AVX2 version of vectorized acosf.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVeN16v_acosf _ZGVeN16v_acosf_avx2_wrapper
> +#include "../svml_s_acosf16_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
> new file mode 100644
> index 0000000000..fcf05782c5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized acosf, vector length is 16.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVeN16v_acosf
> +#include "ifunc-mathvec-avx512-skx.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVeN16v_acosf, __GI__ZGVeN16v_acosf,
> +              __redirect__ZGVeN16v_acosf)
> +  __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
> new file mode 100644
> index 0000000000..d30b04a607
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
> @@ -0,0 +1,332 @@
> +/* Function acosf vectorized with AVX-512.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   https://www.gnu.org/licenses/.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + *
> + */
> +
> +/* Offsets for data table __svml_sacos_data_internal
> + */
> +#define _SgnBit                        0
> +#define _OneHalf                       64
> +#define _sRsqrtMsk                     128
> +#define _SmallNorm                     192
> +#define _MOne                          256
> +#define _HalfMask                      320
> +#define _SQMask                        384
> +#define _Two                           448
> +#define _sqrt_coeff_1                  512
> +#define _sqrt_coeff_2                  576
> +#define _poly_coeff_1                  640
> +#define _poly_coeff_2                  704
> +#define _poly_coeff_3                  768
> +#define _poly_coeff_4                  832
> +#define _poly_coeff_5                  896
> +#define _Pi2H                          960
> +#define _Pi2L                          1024
> +#define _PiH                           1088
> +#define _PiL                           1152
> +#define _Zero                          1216
> +#define _SgnMask                       1280
> +#define _NanMask                       1344
> +#define _ep_coeff_1                    1408
> +#define _ep_coeff_2                    1472
> +#define _ep_coeff_3                    1536
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.evex512,"ax",@progbits  /* 512-bit EVEX code section */
> +ENTRY(_ZGVeN16v_acosf_skx)
> +        pushq     %rbp
> +        cfi_def_cfa_offset(16)
> +        movq      %rsp, %rbp
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +        andq      $-64, %rsp
> +        subq      $256, %rsp
> +        vmovups   __svml_sacos_data_internal(%rip), %zmm5
> +        vmovups   _OneHalf+__svml_sacos_data_internal(%rip), %zmm6
> +
> +/* SQ ~ 2*sqrt(Y) */
> +        vmovups   _SmallNorm+__svml_sacos_data_internal(%rip), %zmm9
> +        vmovups   _MOne+__svml_sacos_data_internal(%rip), %zmm8
> +        vmovups   _Two+__svml_sacos_data_internal(%rip), %zmm12
> +        vmovups   _sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13
> +        vmovaps   %zmm0, %zmm4
> +
> +/* x = -|arg| */
> +        vorps     %zmm4, %zmm5, %zmm3
> +        vandps    %zmm4, %zmm5, %zmm2
> +        vmovups   _sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0
> +
> +/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg| */
> +        vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6
> +
> +/* x^2 */
> +        vmulps    {rn-sae}, %zmm3, %zmm3, %zmm7
> +        vrsqrt14ps %zmm6, %zmm10
> +        vcmpps    $17, {sae}, %zmm9, %zmm6, %k2
> +        vcmpps    $22, {sae}, %zmm3, %zmm8, %k0
> +        vmovups   _poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9
> +        vminps    {sae}, %zmm6, %zmm7, %zmm1
> +        vmovups   _poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7
> +        vxorps    %zmm10, %zmm10, %zmm10{%k2}
> +        vaddps    {rn-sae}, %zmm6, %zmm6, %zmm14
> +        vmulps    {rn-sae}, %zmm1, %zmm1, %zmm8
> +        vmulps    {rn-sae}, %zmm10, %zmm10, %zmm11
> +        vmulps    {rn-sae}, %zmm10, %zmm14, %zmm5
> +        vcmpps    $21, {sae}, %zmm6, %zmm1, %k1
> +
> +/* X<X^2 iff X<0 */
> +        vcmpps    $17, {sae}, %zmm1, %zmm4, %k3
> +
> +/* polynomial */
> +        vmovups   _poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6
> +        vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14
> +        vmovups   _poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11
> +        vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9
> +        vmovups   _poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10
> +        vmovups   _Pi2H+__svml_sacos_data_internal(%rip), %zmm12
> +        vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0
> +        vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11
> +        vmulps    {rn-sae}, %zmm14, %zmm5, %zmm15
> +        vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11
> +        vxorps    %zmm12, %zmm12, %zmm12{%k1}
> +        vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0
> +        vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11
> +        kmovw     %k1, %eax
> +        kmovw     %k3, %ecx
> +        kmovw     %k0, %edx
> +        vmulps    {rn-sae}, %zmm1, %zmm11, %zmm13
> +        vblendmps %zmm0, %zmm3, %zmm0{%k1}
> +        vxorps    %zmm2, %zmm0, %zmm1
> +        andl      %eax, %ecx
> +        kmovw     %ecx, %k2
> +        vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13
> +        vorps     _PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k2}
> +        vaddps    {rn-sae}, %zmm13, %zmm12, %zmm0
> +        testl     %edx, %edx
> +
> +/* Go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> +        movq      %rbp, %rsp
> +        popq      %rbp
> +        cfi_def_cfa(7, 8)
> +        cfi_restore(6)
> +        ret
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        vmovups   %zmm4, 128(%rsp)
> +        vmovups   %zmm0, 192(%rsp)
> +
> +/* Go to exit */
> +        je        L(EXIT)
> +        xorl      %eax, %eax
> +        vzeroupper
> +        kmovw     %k4, 24(%rsp)
> +        kmovw     %k5, 16(%rsp)
> +        kmovw     %k6, 8(%rsp)
> +        kmovw     %k7, (%rsp)
> +        movq      %rsi, 40(%rsp)
> +        movq      %rdi, 32(%rsp)
> +        movq      %r12, 64(%rsp)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +        movl      %eax, %r12d
> +        movq      %r13, 56(%rsp)
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
> +        movl      %edx, %r13d
> +        movq      %r14, 48(%rsp)
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d
> +
> +/* Call scalar math function */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $16, %r12d
> +
> +/* Check bits in range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        kmovw     24(%rsp), %k4
> +        cfi_restore(122)
> +        kmovw     16(%rsp), %k5
> +        cfi_restore(123)
> +        kmovw     8(%rsp), %k6
> +        cfi_restore(124)
> +        kmovw     (%rsp), %k7
> +        cfi_restore(125)
> +        vmovups   192(%rsp), %zmm0
> +        movq      40(%rsp), %rsi
> +        cfi_restore(4)
> +        movq      32(%rsp), %rdi
> +        cfi_restore(5)
> +        movq      64(%rsp), %r12
> +        cfi_restore(12)
> +        movq      56(%rsp), %r13
> +        cfi_restore(13)
> +        movq      48(%rsp), %r14
> +        cfi_restore(14)
> +
> +/* Go to exit */
> +        jmp       L(EXIT)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movss     128(%rsp,%r14,4), %xmm0
> +        call      acosf@PLT
> +        movss     %xmm0, 192(%rsp,%r14,4)
> +
> +/* Process special inputs in loop */
> +        jmp       L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVeN16v_acosf_skx)
> +
> +        .section .rodata, "a"
> +        .align 64
> +
> +#ifdef __svml_sacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +    __declspec(align(64)) VUINT32 SgnBit[16][1];
> +    __declspec(align(64)) VUINT32 OneHalf[16][1];
> +    __declspec(align(64)) VUINT32 sRsqrtMsk[16][1];
> +    __declspec(align(64)) VUINT32 SmallNorm[16][1];
> +    __declspec(align(64)) VUINT32 MOne[16][1];
> +    __declspec(align(64)) VUINT32 HalfMask[16][1];
> +    __declspec(align(64)) VUINT32 SQMask[16][1];
> +    __declspec(align(64)) VUINT32 Two[16][1];
> +    __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1];
> +    __declspec(align(64)) VUINT32 poly_coeff[5][16][1];
> +    __declspec(align(64)) VUINT32 Pi2H[16][1];
> +    __declspec(align(64)) VUINT32 Pi2L[16][1];
> +    __declspec(align(64)) VUINT32 PiH[16][1];
> +    __declspec(align(64)) VUINT32 PiL[16][1];
> +    __declspec(align(64)) VUINT32 Zero[16][1];
> +    __declspec(align(64)) VUINT32 SgnMask[16][1];
> +    __declspec(align(64)) VUINT32 NanMask[16][1];
> +    __declspec(align(64)) VUINT32 ep_coeff[3][16][1];
> +} __svml_sacos_data_internal_t;
> +#endif
> +__svml_sacos_data_internal:
> +        /*== SgnBit ==*/
> +        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> +        /*== OneHalf ==*/
> +        .align 64
> +        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> +        /*== sRsqrtMsk ==*/
> +        .align 64
> +        .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
> +        /*== SmallNorm ==*/
> +        .align 64
> +        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
> +        /*== MOne ==*/
> +        .align 64
> +        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
> +        /*== HalfMask ==*/
> +        .align 64
> +        .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
> +        /*== SQMask ==*/
> +        .align 64
> +        .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
> +        /*== Two ==*/
> +        .align 64
> +        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
> +        /*== sqrt_coeff[2] ==*/
> +        .align 64
> +        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
> +        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
> +        /*== poly_coeff[5] ==*/
> +        .align 64
> +        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
> +        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
> +        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
> +        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
> +        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
> +        /*== Pi2H ==*/
> +        .align 64
> +        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
> +        /*== Pi2L ==*/
> +        .align 64
> +        .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
> +        /*== PiH ==*/
> +        .align 64
> +        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
> +        /*== PiL ==*/
> +        .align 64
> +        .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
> +        /*== zero ==*/
> +        .align 64
> +        .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
> +        /*== sgn mask ==*/
> +        .align 64
> +        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> +        /*== nan mask ==*/
> +        .align 64
> +        .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
> +        /*== ep_coeff[3] ==*/
> +        .align 64
> +        .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
> +        .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
> +        .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
> +        .align 64
> +        .type  __svml_sacos_data_internal,@object
> +        .size  __svml_sacos_data_internal,1600
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
> new file mode 100644
> index 0000000000..f94b3eb01a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
> @@ -0,0 +1,20 @@
> +/* SSE2 version of vectorized acosf, vector length is 4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVbN4v_acosf _ZGVbN4v_acosf_sse2  /* renames this identifier in the file included below */
> +#include "../svml_s_acosf4_core.S"  /* presumably the generic 4-lane implementation -- confirm */
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
> new file mode 100644
> index 0000000000..6f9a5c1082
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized acosf, vector length is 4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVbN4v_acosf  /* name that gets multiple versions */
> +#include "ifunc-mathvec-sse4_1.h"  /* presumably provides REDIRECT_NAME and IFUNC_SELECTOR -- confirm */
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVbN4v_acosf, __GI__ZGVbN4v_acosf,
> +              __redirect__ZGVbN4v_acosf)
> +  __attribute__ ((visibility ("hidden")));
> +#endif /* SHARED */
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
> new file mode 100644
> index 0000000000..acfdc348aa
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
> @@ -0,0 +1,351 @@
> +/* Function acosf vectorized with SSE4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + *
> + */
> +
> +/* Byte offsets of the fields of the data table __svml_sacos_data_internal;
> +   every field starts on a 64-byte boundary of the table.  */
> +#define _SgnBit                        0
> +#define _OneHalf                       64
> +#define _sRsqrtMsk                     128
> +#define _SmallNorm                     192
> +#define _MOne                          256
> +#define _HalfMask                      320
> +#define _SQMask                        384
> +#define _Two                           448
> +#define _sqrt_coeff                    512  /* 2 rows of 16 bytes */
> +#define _poly_coeff                    576  /* 5 rows of 16 bytes */
> +#define _Pi2H                          704
> +#define _Pi2L                          768
> +#define _PiH                           832
> +#define _PiL                           896
> +#define _Zero                          960
> +#define _SgnMask                       1024
> +#define _NanMask                       1088
> +#define _ep_coeff                      1152  /* 3 rows of 16 bytes */
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.sse4,"ax",@progbits
> +ENTRY(_ZGVbN4v_acosf_sse4)  /* 4-lane float acos; argument and result in %xmm0 */
> +        pushq     %rbp
> +        cfi_def_cfa_offset(16)
> +        movq      %rsp, %rbp
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +        andq      $-64, %rsp  /* align the frame to 64 bytes */
> +        subq      $320, %rsp  /* room for the spill slots used below */
> +        movaps    %xmm0, %xmm6  /* keep the argument in %xmm6 */
> +
> +/*
> + * 2*sqrt(X) ~ Sh - Sl  (to 24+ bits)
> + * SQ ~ 2*sqrt(X)
> + */
> +        movups    __svml_sacos_data_internal(%rip), %xmm5  /* table offset 0 (_SgnBit) */
> +        movups    _OneHalf+__svml_sacos_data_internal(%rip), %xmm0
> +
> +/* x = -|arg|: OR the sign-bit constant into the argument */
> +        movaps    %xmm5, %xmm7
> +        orps      %xmm6, %xmm7
> +
> +/* Y = 0.5 + 0.5*x, i.e. 0.5 - 0.5*|arg| (completed by the addps below) */
> +        movaps    %xmm0, %xmm2
> +        mulps     %xmm7, %xmm2
> +
> +/* x^2 */
> +        movaps    %xmm7, %xmm3
> +        mulps     %xmm7, %xmm3
> +        addps     %xmm2, %xmm0  /* %xmm0 = Y */
> +        movups    _MOne+__svml_sacos_data_internal(%rip), %xmm4
> +        andps     %xmm6, %xmm5  /* %xmm5 = sign bit of the argument */
> +        cmpnleps  %xmm7, %xmm4  /* lanes where !(-1 <= x): |arg| > 1 or NaN */
> +        minps     %xmm0, %xmm3  /* %xmm3 = min(x^2, Y) */
> +
> +/* SQ ~ 2*sqrt(Y) */
> +        rsqrtps   %xmm0, %xmm1
> +        movmskps  %xmm4, %edx  /* %edx = bitmask of the lanes flagged above */
> +        movaps    %xmm0, %xmm4
> +        movaps    %xmm3, %xmm2
> +        movups    %xmm8, 160(%rsp)  /* save %xmm8; reloaded at L(EXIT) */
> +        cmpltps   _SmallNorm+__svml_sacos_data_internal(%rip), %xmm4
> +        cmpnltps  %xmm0, %xmm2  /* %xmm2 = mask: !(min(x^2, Y) < Y) */
> +        addps     %xmm0, %xmm0
> +        andnps    %xmm1, %xmm4  /* zero the rsqrt estimate where Y < SmallNorm */
> +        movaps    %xmm4, %xmm8
> +        mulps     %xmm4, %xmm8
> +        mulps     %xmm0, %xmm4
> +        mulps     %xmm8, %xmm0
> +        movups    _sqrt_coeff+__svml_sacos_data_internal(%rip), %xmm1
> +
> +/* polynomial */
> +        movups    _poly_coeff+__svml_sacos_data_internal(%rip), %xmm8
> +        mulps     %xmm3, %xmm8
> +        subps     _Two+__svml_sacos_data_internal(%rip), %xmm0
> +        mulps     %xmm0, %xmm1
> +        addps     _poly_coeff+16+__svml_sacos_data_internal(%rip), %xmm8
> +        mulps     %xmm4, %xmm0
> +        addps     _sqrt_coeff+16+__svml_sacos_data_internal(%rip), %xmm1
> +        mulps     %xmm0, %xmm1
> +        movaps    %xmm3, %xmm0
> +        mulps     %xmm3, %xmm0
> +        subps     %xmm1, %xmm4
> +        mulps     %xmm0, %xmm8
> +        movups    _poly_coeff+32+__svml_sacos_data_internal(%rip), %xmm1
> +        andps     %xmm2, %xmm4
> +        mulps     %xmm3, %xmm1
> +        movups    _PiH+__svml_sacos_data_internal(%rip), %xmm0
> +        andps     %xmm2, %xmm0
> +        addps     _poly_coeff+48+__svml_sacos_data_internal(%rip), %xmm1
> +        addps     %xmm8, %xmm1
> +
> +/* X<X^2 iff X<0 */
> +        movaps    %xmm6, %xmm8
> +        cmpltps   %xmm3, %xmm8
> +        mulps     %xmm3, %xmm1
> +        andps     %xmm8, %xmm0
> +        movaps    %xmm2, %xmm8
> +        andnps    %xmm7, %xmm8
> +        addps     _poly_coeff+64+__svml_sacos_data_internal(%rip), %xmm1
> +        mulps     %xmm3, %xmm1
> +        orps      %xmm4, %xmm8
> +        pxor      %xmm5, %xmm8
> +        movaps    %xmm2, %xmm3
> +        mulps     %xmm8, %xmm1
> +        andnps    _Pi2H+__svml_sacos_data_internal(%rip), %xmm3
> +        addps     %xmm1, %xmm8
> +        addps     %xmm3, %xmm0
> +        addps     %xmm8, %xmm0  /* %xmm0 = vector result */
> +        testl     %edx, %edx  /* was any lane flagged as special? */
> +
> +/* Go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> +        movups    160(%rsp), %xmm8
> +        cfi_restore(25)
> +        movq      %rbp, %rsp
> +        popq      %rbp
> +        cfi_def_cfa(7, 8)
> +        cfi_restore(6)
> +        ret
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        movups    %xmm6, 192(%rsp)  /* spill the argument vector for the per-lane scalar calls */
> +        movups    %xmm0, 256(%rsp)  /* spill the result vector; flagged lanes are overwritten */
> +        xorl      %eax, %eax
> +        movups    %xmm9, 96(%rsp)
> +        movups    %xmm10, 80(%rsp)
> +        movups    %xmm11, 64(%rsp)
> +        movups    %xmm12, 48(%rsp)
> +        movups    %xmm13, 32(%rsp)
> +        movups    %xmm14, 16(%rsp)
> +        movups    %xmm15, (%rsp)
> +        movq      %rsi, 120(%rsp)
> +        movq      %rdi, 112(%rsp)
> +        movq      %r12, 144(%rsp)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> +        movl      %eax, %r12d  /* %r12d = lane index, starting at 0 */
> +        movq      %r13, 136(%rsp)
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> +        movl      %edx, %r13d  /* %r13d = special-lane bitmask */
> +        movq      %r14, 128(%rsp)
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d  /* CF = mask bit of the current lane */
> +
> +/* Call scalar math function */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $4, %r12d  /* 4 lanes per vector */
> +
> +/* Check bits in range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        movups    96(%rsp), %xmm9
> +        cfi_restore(26)
> +        movups    80(%rsp), %xmm10
> +        cfi_restore(27)
> +        movups    64(%rsp), %xmm11
> +        cfi_restore(28)
> +        movups    48(%rsp), %xmm12
> +        cfi_restore(29)
> +        movups    32(%rsp), %xmm13
> +        cfi_restore(30)
> +        movups    16(%rsp), %xmm14

> +        cfi_restore(31)
> +        movups    (%rsp), %xmm15
> +        cfi_restore(32)
> +        movq      120(%rsp), %rsi
> +        cfi_restore(4)
> +        movq      112(%rsp), %rdi
> +        cfi_restore(5)
> +        movq      144(%rsp), %r12
> +        cfi_restore(12)
> +        movq      136(%rsp), %r13
> +        cfi_restore(13)
> +        movq      128(%rsp), %r14
> +        cfi_restore(14)
> +        movups    256(%rsp), %xmm0  /* reload the result vector, patched per lane */
> +
> +/* Go to exit */
> +        jmp       L(EXIT)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movss     192(%rsp,%r14,4), %xmm0  /* lane %r14 of the spilled argument */
> +        call      acosf@PLT
> +        movss     %xmm0, 256(%rsp,%r14,4)  /* overwrite lane %r14 of the spilled result */
> +
> +/* Process special inputs in loop */
> +        jmp       L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVbN4v_acosf_sse4)
> +
> +        .section .rodata, "a"
> +        .align 64
> +
> +#ifdef __svml_sacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +    __declspec(align(64)) VUINT32 SgnBit[4][1];
> +    __declspec(align(64)) VUINT32 OneHalf[4][1];
> +    __declspec(align(64)) VUINT32 sRsqrtMsk[4][1];
> +    __declspec(align(64)) VUINT32 SmallNorm[4][1];
> +    __declspec(align(64)) VUINT32 MOne[4][1];
> +    __declspec(align(64)) VUINT32 HalfMask[4][1];
> +    __declspec(align(64)) VUINT32 SQMask[4][1];
> +    __declspec(align(64)) VUINT32 Two[4][1];
> +    __declspec(align(64)) VUINT32 sqrt_coeff[2][4][1];
> +    __declspec(align(64)) VUINT32 poly_coeff[5][4][1];
> +    __declspec(align(64)) VUINT32 Pi2H[4][1];
> +    __declspec(align(64)) VUINT32 Pi2L[4][1];
> +    __declspec(align(64)) VUINT32 PiH[4][1];
> +    __declspec(align(64)) VUINT32 PiL[4][1];
> +    __declspec(align(64)) VUINT32 Zero[4][1];
> +    __declspec(align(64)) VUINT32 SgnMask[4][1];
> +    __declspec(align(64)) VUINT32 NanMask[4][1];
> +    __declspec(align(64)) VUINT32 ep_coeff[3][4][1];
> +} __svml_sacos_data_internal_t;
> +#endif
> +__svml_sacos_data_internal:

Can the xmm/ymm/zmm versions of a function share the same internal data table?


> +        /*== SgnBit ==*/
> +        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
> +        /*== OneHalf ==*/
> +        .align 64

I think this should be .align 32 here?

> +        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> +        /*== sRsqrtMsk ==*/
> +        .align 64
> +        .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
> +        /*== SmallNorm ==*/
> +        .align 64
> +        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
> +        /*== MOne ==*/
> +        .align 64
> +        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
> +        /*== HalfMask ==*/
> +        .align 64
> +        .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
> +        /*== SQMask ==*/
> +        .align 64
> +        .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
> +        /*== Two ==*/
> +        .align 64
> +        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000
> +        /*== sqrt_coeff[2] ==*/
> +        .align 64
> +        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
> +        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
> +        /*== poly_coeff[5] ==*/
> +        .align 64
> +        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
> +        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
> +        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
> +        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
> +        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
> +        /*== Pi2H ==*/
> +        .align 64
> +        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
> +        /*== Pi2L ==*/
> +        .align 64
> +        .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
> +        /*== PiH ==*/
> +        .align 64
> +        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
> +        /*== PiL ==*/
> +        .align 64
> +        .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
> +        /*== zero ==*/
> +        .align 64
> +        .long 0x00000000, 0x00000000, 0x00000000, 0x00000000
> +        /*== sgn mask ==*/
> +        .align 64
> +        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
> +        /*== nan mask ==*/
> +        .align 64
> +        .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
> +        /*== ep_coeff[3] ==*/
> +        .align 64
> +        .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
> +        .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
> +        .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
> +        .align 64
> +        .type  __svml_sacos_data_internal,@object
> +        .size  __svml_sacos_data_internal,1216
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
> new file mode 100644
> index 0000000000..583ef54fee
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
> @@ -0,0 +1,20 @@
> +/* SSE version of vectorized acosf, vector length is 8.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVdN8v_acosf _ZGVdN8v_acosf_sse_wrapper  /* renames this identifier in the file included below */
> +#include "../svml_s_acosf8_core.S"  /* presumably the generic 8-lane implementation -- confirm */
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
> new file mode 100644
> index 0000000000..dd360a9479
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized acosf, vector length is 8.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVdN8v_acosf  /* name that gets multiple versions */
> +#include "ifunc-mathvec-avx2.h"  /* presumably provides REDIRECT_NAME and IFUNC_SELECTOR -- confirm */
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVdN8v_acosf, __GI__ZGVdN8v_acosf,
> +              __redirect__ZGVdN8v_acosf)
> +  __attribute__ ((visibility ("hidden")));
> +#endif /* SHARED */
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
> new file mode 100644
> index 0000000000..6d800f9aa4
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
> @@ -0,0 +1,332 @@
> +/* Function acosf vectorized with AVX2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + *
> + */
> +
> +/* Byte offsets into the data table __svml_sacos_data_internal;
> +   every offset is a multiple of 64.  */
> +#define _SgnBit                        0
> +#define _OneHalf                       64
> +#define _sRsqrtMsk                     128
> +#define _SmallNorm                     192
> +#define _MOne                          256
> +#define _HalfMask                      320
> +#define _SQMask                        384
> +#define _Two                           448
> +#define _sqrt_coeff                    512  /* read at +0 and +32 */
> +#define _poly_coeff                    576  /* read at +0, +32, +64, +96 and +128 */
> +#define _Pi2H                          768
> +#define _Pi2L                          832
> +#define _PiH                           896
> +#define _PiL                           960
> +#define _Zero                          1024
> +#define _SgnMask                       1088
> +#define _NanMask                       1152
> +#define _ep_coeff                      1216
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.avx2,"ax",@progbits
> +ENTRY(_ZGVdN8v_acosf_avx2)
> +        pushq     %rbp
> +        cfi_def_cfa_offset(16)
> +        movq      %rsp, %rbp
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +        andq      $-64, %rsp
> +        subq      $384, %rsp
> +
> +/*
> + * 2*sqrt(X) ~ Sh - Sl  (to 24+ bits)
> + * SQ ~ 2*sqrt(X)
> + */
> +        vmovups   __svml_sacos_data_internal(%rip), %ymm6
> +        vmovups   _OneHalf+__svml_sacos_data_internal(%rip), %ymm7
> +        vmovups   %ymm8, 288(%rsp)
> +        vmovups   %ymm15, 352(%rsp)
> +        vmovups   %ymm9, 96(%rsp)
> +        vmovups   _poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm15
> +        vmovups   %ymm10, 160(%rsp)
> +        vmovups   %ymm11, 192(%rsp)
> +        vmovups   %ymm14, 320(%rsp)
> +        vmovups   %ymm13, 256(%rsp)
> +        vmovups   %ymm12, 224(%rsp)
> +        vmovaps   %ymm0, %ymm5
> +
> +/* x = -|arg| */
> +        vorps     %ymm5, %ymm6, %ymm4
> +
> +/* Y = 0.5 + 0.5*(-x) */
> +        vfmadd231ps %ymm4, %ymm7, %ymm7
> +
> +/* x^2 */
> +        vmulps    %ymm4, %ymm4, %ymm1
> +        vcmpnge_uqps _MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm0
> +
> +/* SQ ~ 2*sqrt(Y) */
> +        vaddps    %ymm7, %ymm7, %ymm11
> +        vminps    %ymm7, %ymm1, %ymm2
> +        vrsqrtps  %ymm7, %ymm8
> +        vfmadd213ps _poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm15
> +        vmulps    %ymm2, %ymm2, %ymm14
> +        vcmpnlt_uqps %ymm7, %ymm2, %ymm1
> +        vandps    %ymm5, %ymm6, %ymm3
> +        vcmplt_oqps _SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm6
> +        vandnps   %ymm8, %ymm6, %ymm9
> +
> +/* polynomial */
> +        vmovups   _poly_coeff+__svml_sacos_data_internal(%rip), %ymm6
> +        vmulps    %ymm9, %ymm9, %ymm10
> +        vmulps    %ymm11, %ymm9, %ymm13
> +        vfmadd213ps _poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm6
> +        vfmsub213ps _Two+__svml_sacos_data_internal(%rip), %ymm10, %ymm11
> +        vfmadd213ps %ymm15, %ymm14, %ymm6
> +        vmulps    %ymm11, %ymm13, %ymm12
> +        vfmadd213ps _poly_coeff+128+__svml_sacos_data_internal(%rip), %ymm2, %ymm6
> +        vmovmskps %ymm0, %edx
> +        vmovups   _sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0
> +        vmulps    %ymm6, %ymm2, %ymm9
> +
> +/* X<X^2 iff X<0 */
> +        vcmplt_oqps %ymm2, %ymm5, %ymm6
> +        vfmadd213ps _sqrt_coeff+32+__svml_sacos_data_internal(%rip), %ymm11, %ymm0
> +        vfnmadd213ps %ymm13, %ymm12, %ymm0
> +        vblendvps %ymm1, %ymm0, %ymm4, %ymm4
> +        vxorps    %ymm3, %ymm4, %ymm3
> +        vandps    _PiH+__svml_sacos_data_internal(%rip), %ymm1, %ymm2
> +        vfmadd213ps %ymm3, %ymm3, %ymm9
> +        vandps    %ymm6, %ymm2, %ymm2
> +        vandnps   _Pi2H+__svml_sacos_data_internal(%rip), %ymm1, %ymm7
> +        vaddps    %ymm7, %ymm2, %ymm8
> +        vaddps    %ymm9, %ymm8, %ymm0
> +        testl     %edx, %edx
> +
> +/* Go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> +        vmovups   288(%rsp), %ymm8
> +        cfi_restore(91)
> +        vmovups   96(%rsp), %ymm9
> +        cfi_restore(92)
> +        vmovups   160(%rsp), %ymm10
> +        cfi_restore(93)
> +        vmovups   192(%rsp), %ymm11
> +        cfi_restore(94)
> +        vmovups   224(%rsp), %ymm12
> +        cfi_restore(95)
> +        vmovups   256(%rsp), %ymm13
> +        cfi_restore(96)
> +        vmovups   320(%rsp), %ymm14
> +        cfi_restore(97)
> +        vmovups   352(%rsp), %ymm15
> +        cfi_restore(98)
> +        movq      %rbp, %rsp
> +        popq      %rbp
> +        cfi_def_cfa(7, 8)
> +        cfi_restore(6)
> +        ret
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        vmovups   %ymm5, 64(%rsp)
> +        vmovups   %ymm0, 128(%rsp)
> +
> +/* Go to exit */
> +        je        L(EXIT)
> +        xorl      %eax, %eax
> +        vzeroupper
> +        movq      %rsi, 8(%rsp)
> +        movq      %rdi, (%rsp)
> +        movq      %r12, 32(%rsp)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
> +        movl      %eax, %r12d
> +        movq      %r13, 24(%rsp)
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
> +        movl      %edx, %r13d
> +        movq      %r14, 16(%rsp)
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d
> +
> +/* Call scalar math function */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $8, %r12d
> +
> +/* Check bits in range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        movq      8(%rsp), %rsi
> +        cfi_restore(4)
> +        movq      (%rsp), %rdi
> +        cfi_restore(5)
> +        movq      32(%rsp), %r12
> +        cfi_restore(12)
> +        movq      24(%rsp), %r13
> +        cfi_restore(13)
> +        movq      16(%rsp), %r14
> +        cfi_restore(14)
> +        vmovups   128(%rsp), %ymm0
> +
> +/* Go to exit */
> +        jmp       L(EXIT)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movss     64(%rsp,%r14,4), %xmm0
> +        call      acosf@PLT
> +        movss     %xmm0, 128(%rsp,%r14,4)
> +
> +/* Process special inputs in loop */
> +        jmp       L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVdN8v_acosf_avx2)
> +
> +        .section .rodata, "a"
> +        .align 64
> +
> +#ifdef __svml_sacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +    __declspec(align(64)) VUINT32 SgnBit[8][1];
> +    __declspec(align(64)) VUINT32 OneHalf[8][1];
> +    __declspec(align(64)) VUINT32 sRsqrtMsk[8][1];
> +    __declspec(align(64)) VUINT32 SmallNorm[8][1];
> +    __declspec(align(64)) VUINT32 MOne[8][1];
> +    __declspec(align(64)) VUINT32 HalfMask[8][1];
> +    __declspec(align(64)) VUINT32 SQMask[8][1];
> +    __declspec(align(64)) VUINT32 Two[8][1];
> +    __declspec(align(64)) VUINT32 sqrt_coeff[2][8][1];
> +    __declspec(align(64)) VUINT32 poly_coeff[5][8][1];
> +    __declspec(align(64)) VUINT32 Pi2H[8][1];
> +    __declspec(align(64)) VUINT32 Pi2L[8][1];
> +    __declspec(align(64)) VUINT32 PiH[8][1];
> +    __declspec(align(64)) VUINT32 PiL[8][1];
> +    __declspec(align(64)) VUINT32 Zero[8][1];
> +    __declspec(align(64)) VUINT32 SgnMask[8][1];
> +    __declspec(align(64)) VUINT32 NanMask[8][1];
> +    __declspec(align(64)) VUINT32 ep_coeff[3][8][1];
> +} __svml_sacos_data_internal_t;
> +#endif
> +__svml_sacos_data_internal:
> +        /*== SgnBit ==*/
> +        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000

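It might be worth generating some of these constants in registers instead of
loading them from .rodata.

E.g., 0x80000000 in every 32-bit lane can be built with

vpcmpeqb %ymm0, %ymm0, %ymm0
vpslld $31, %ymm0, %ymmDST

(`vpslld` rather than `vpsllq`, because the table elements are 32 bits wide;
a 64-bit shift would set the sign bit only in every other lane.)

The same all-ones `vpcmpeqb` result could be reused for many of these.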


> +        /*== OneHalf ==*/
> +        .align 64
> +        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> +        /*== sRsqrtMsk ==*/
> +        .align 64
> +        .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
> +        /*== SmallNorm ==*/
> +        .align 64
> +        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
> +        /*== MOne ==*/
> +        .align 64
> +        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
> +        /*== HalfMask ==*/
> +        .align 64
> +        .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
> +        /*== SQMask ==*/
> +        .align 64
> +        .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
> +        /*== Two ==*/
> +        .align 64
> +        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
> +        /*== sqrt_coeff[2] ==*/
> +        .align 64
> +        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
> +        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
> +        /*== poly_coeff[5] ==*/
> +        .align 64
> +        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
> +        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
> +        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
> +        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
> +        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
> +        /*== Pi2H ==*/
> +        .align 64
> +        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
> +        /*== Pi2L ==*/
> +        .align 64
> +        .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
> +        /*== PiH ==*/
> +        .align 64
> +        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
> +        /*== PiL ==*/
> +        .align 64
> +        .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
> +        /*== zero ==*/
> +        .align 64
> +        .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
> +        /*== sgn mask ==*/
> +        .align 64
> +        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> +        /*== nan mask ==*/
> +        .align 64
> +        .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
> +        /*== ep_coeff[3] ==*/
> +        .align 64
> +        .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
> +        .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
> +        .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
> +        .align 64
> +        .type  __svml_sacos_data_internal,@object
> +        .size  __svml_sacos_data_internal,1344
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos2_core.S b/sysdeps/x86_64/fpu/svml_d_acos2_core.S
> new file mode 100644
> index 0000000000..9656478b2d
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos2_core.S
> @@ -0,0 +1,29 @@
> +/* Function acos vectorized with SSE2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVbN2v_acos)
> +WRAPPER_IMPL_SSE2 acos
> +END (_ZGVbN2v_acos)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVbN2v_acos)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core.S b/sysdeps/x86_64/fpu/svml_d_acos4_core.S
> new file mode 100644
> index 0000000000..e99cb4ae78
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos4_core.S
> @@ -0,0 +1,29 @@
> +/* Function acos vectorized with AVX2, wrapper version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVdN4v_acos)
> +WRAPPER_IMPL_AVX _ZGVbN2v_acos
> +END (_ZGVdN4v_acos)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVdN4v_acos)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
> new file mode 100644
> index 0000000000..7cbcbc965c
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
> @@ -0,0 +1,25 @@
> +/* Function acos vectorized in AVX ISA as wrapper to SSE4 ISA version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVcN4v_acos)
> +WRAPPER_IMPL_AVX _ZGVbN2v_acos
> +END (_ZGVcN4v_acos)
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos8_core.S b/sysdeps/x86_64/fpu/svml_d_acos8_core.S
> new file mode 100644
> index 0000000000..e26b30d81a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos8_core.S
> @@ -0,0 +1,25 @@
> +/* Function acos vectorized with AVX-512, wrapper to AVX2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVeN8v_acos)
> +WRAPPER_IMPL_AVX512 _ZGVdN4v_acos
> +END (_ZGVeN8v_acos)
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf16_core.S b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S
> new file mode 100644
> index 0000000000..70e046d492
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S
> @@ -0,0 +1,25 @@
> +/* Function acosf vectorized with AVX-512. Wrapper to AVX2 version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVeN16v_acosf)
> +WRAPPER_IMPL_AVX512 _ZGVdN8v_acosf
> +END (_ZGVeN16v_acosf)
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf4_core.S b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S
> new file mode 100644
> index 0000000000..36354b32b5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S
> @@ -0,0 +1,29 @@
> +/* Function acosf vectorized with SSE2, wrapper version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVbN4v_acosf)
> +WRAPPER_IMPL_SSE2 acosf
> +END (_ZGVbN4v_acosf)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVbN4v_acosf)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S
> new file mode 100644
> index 0000000000..f08864a511
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S
> @@ -0,0 +1,29 @@
> +/* Function acosf vectorized with AVX2, wrapper version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVdN8v_acosf)
> +WRAPPER_IMPL_AVX _ZGVbN4v_acosf
> +END (_ZGVdN8v_acosf)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVdN8v_acosf)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
> new file mode 100644
> index 0000000000..f3ed4d8e78
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
> @@ -0,0 +1,25 @@
> +/* Function acosf vectorized in AVX ISA as wrapper to SSE4 ISA version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> +        .text
> +ENTRY (_ZGVcN8v_acosf)
> +WRAPPER_IMPL_AVX _ZGVbN4v_acosf
> +END (_ZGVcN8v_acosf)
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
> new file mode 100644
> index 0000000000..4f74b4260a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-acos.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
> new file mode 100644
> index 0000000000..4f74b4260a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-acos.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
> new file mode 100644
> index 0000000000..4f74b4260a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-acos.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c
> new file mode 100644
> index 0000000000..e38b8ce821
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c
> @@ -0,0 +1,3 @@
> +#define LIBMVEC_TYPE double
> +#define LIBMVEC_FUNC acos
> +#include "test-vector-abi-arg1.h"
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> index ed932fc98d..0abc7d2021 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
>  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
>  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
>
>  #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> index 3a6e37044f..dda093b914 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> @@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin)
>  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
>  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
>
>  #ifndef __ILP32__
>  # define VEC_INT_TYPE __m256i
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> index 99db4e7616..f3230463bb 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin)
>  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
>  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
>
>  #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> index 251d429ac0..cf9f52faf0 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin)
>  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
>  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
>
>  #ifndef __ILP32__
>  # define VEC_INT_TYPE __m512i
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
> new file mode 100644
> index 0000000000..1e6474dfa2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-acosf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
> new file mode 100644
> index 0000000000..1e6474dfa2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-acosf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
> new file mode 100644
> index 0000000000..1e6474dfa2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-acosf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
> new file mode 100644
> index 0000000000..fb47f974fd
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
> @@ -0,0 +1,3 @@
> +#define LIBMVEC_TYPE float
> +#define LIBMVEC_FUNC acosf
> +#include "test-vector-abi-arg1.h"
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> index c1d14cd79e..abbd3ed870 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf)
>  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
>  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
>
>  #define VEC_INT_TYPE __m512i
>
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> index d23c372060..8a24027952 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
>  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
>  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
>
>  #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> index 3152cffb0c..aff0442606 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> @@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf)
>  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
>  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
>
>  /* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf.  */
>  #undef VECTOR_WRAPPER_fFF
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> index a8492abfef..913584d111 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf)
>  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
>  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
>
>  #define VEC_INT_TYPE __m128i
>
> --
> 2.31.1
>
Florian Weimer Dec. 15, 2021, 7:57 p.m. UTC | #2
* Noah Goldstein via Libc-alpha:

> Is there a control flow from L(SPECIAL_VALUES_BRANCH) that doesn't go to exit?
> If not, I think a lot of the save/restores are pretty unnecessary.

It depends on the vector calling convention.  It seems to be different
from the regular psABI calling convention, even for GPRs.

Thanks,
Florian
Noah Goldstein Dec. 15, 2021, 8:32 p.m. UTC | #3
On Wed, Dec 15, 2021 at 1:57 PM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein via Libc-alpha:
>
> > Is there a control flow from L(SPECIAL_VALUES_BRANCH) that doesn't go to exit?
> > If not, I think a lot of the save/restores are pretty unnecessary.
>
> It depends on the vector calling convention.  It seems to be different
> from the regular psABI calling convention, even for GPRs.

Unless I'm missing something, this isn't a function (nor is it globally
visible). It's internal, and just about everything that is saved/restored
has already been clobbered by acos/acosf.

The save/restore is for restoring state internal to acos/acosf. I think
that the amount of state being preserved is unnecessarily large.

>
> Thanks,
> Florian
>
Sunil K Pandey Dec. 15, 2021, 10:26 p.m. UTC | #4
On Wed, Dec 15, 2021 at 12:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Dec 15, 2021 at 1:57 PM Florian Weimer <fweimer@redhat.com> wrote:
> >
> > * Noah Goldstein via Libc-alpha:
> >
> > > Is there a control flow from L(SPECIAL_VALUES_BRANCH) that doesn't go to exit?
> > > > If not, I think a lot of the save/restores are pretty unnecessary.
> >
> > It depends on the vector calling convention.  It seems to be different
> > from the regular psABI calling convention, even for GPRs.
>
> Unless I'm missing something, this isn't a function (nor is it globally
> visible). It's internal, and just about everything that is saved/restored
> has already been clobbered by acos/acosf.
>
> The save/restore is for restoring state internal to acos/acosf. I think
> that the amount of state being preserved is unnecessarily large.

Thank you so much. We are working on this and will update it in the next version.

>
> >
> > Thanks,
> > Florian
> >
diff mbox series

Patch

diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h
index b80ff332a0..2ccdd1fc53 100644
--- a/bits/libm-simd-decl-stubs.h
+++ b/bits/libm-simd-decl-stubs.h
@@ -98,4 +98,15 @@ 
 #define __DECL_SIMD_powf32x
 #define __DECL_SIMD_powf64x
 #define __DECL_SIMD_powf128x
+
+#define __DECL_SIMD_acos
+#define __DECL_SIMD_acosf
+#define __DECL_SIMD_acosl
+#define __DECL_SIMD_acosf16
+#define __DECL_SIMD_acosf32
+#define __DECL_SIMD_acosf64
+#define __DECL_SIMD_acosf128
+#define __DECL_SIMD_acosf32x
+#define __DECL_SIMD_acosf64x
+#define __DECL_SIMD_acosf128x
 #endif
diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
index da4cf4e10c..2cc6654208 100644
--- a/math/bits/mathcalls.h
+++ b/math/bits/mathcalls.h
@@ -50,7 +50,7 @@ 
 /* Trigonometric functions.  */
 
 /* Arc cosine of X.  */
-__MATHCALL (acos,, (_Mdouble_ __x));
+__MATHCALL_VEC (acos,, (_Mdouble_ __x));
 /* Arc sine of X.  */
 __MATHCALL (asin,, (_Mdouble_ __x));
 /* Arc tangent of X.  */
diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
index 363d4ace1e..b37b55777e 100644
--- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
@@ -46,3 +46,11 @@  GLIBC_2.22 _ZGVeN8v_log F
 GLIBC_2.22 _ZGVeN8v_sin F
 GLIBC_2.22 _ZGVeN8vv_pow F
 GLIBC_2.22 _ZGVeN8vvv_sincos F
+GLIBC_2.35 _ZGVbN2v_acos F
+GLIBC_2.35 _ZGVbN4v_acosf F
+GLIBC_2.35 _ZGVcN4v_acos F
+GLIBC_2.35 _ZGVcN8v_acosf F
+GLIBC_2.35 _ZGVdN4v_acos F
+GLIBC_2.35 _ZGVdN8v_acosf F
+GLIBC_2.35 _ZGVeN16v_acosf F
+GLIBC_2.35 _ZGVeN8v_acos F
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h
index dc0bfb3705..dabb74cbb9 100644
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86/fpu/bits/math-vector.h
@@ -58,6 +58,10 @@ 
 #  define __DECL_SIMD_pow __DECL_SIMD_x86_64
 #  undef __DECL_SIMD_powf
 #  define __DECL_SIMD_powf __DECL_SIMD_x86_64
+#  undef __DECL_SIMD_acos
+#  define __DECL_SIMD_acos __DECL_SIMD_x86_64
+#  undef __DECL_SIMD_acosf
+#  define __DECL_SIMD_acosf __DECL_SIMD_x86_64
 
 # endif
 #endif
diff --git a/sysdeps/x86/fpu/finclude/math-vector-fortran.h b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
index 311bb4e391..4bcbd1fbce 100644
--- a/sysdeps/x86/fpu/finclude/math-vector-fortran.h
+++ b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
@@ -28,6 +28,8 @@ 
 !GCC$ builtin (expf) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (pow) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
+!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
+!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
 
 !GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@@ -41,3 +43,5 @@ 
 !GCC$ builtin (expf) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (pow) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
+!GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
+!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig
index b0e3bf7887..7acf1f306c 100644
--- a/sysdeps/x86_64/fpu/Makeconfig
+++ b/sysdeps/x86_64/fpu/Makeconfig
@@ -22,6 +22,7 @@  postclean-generated += libmvec.mk
 
 # Define for both math and mathvec directories.
 libmvec-funcs = \
+  acos \
   cos \
   exp \
   log \
diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions
index 08132045d6..3fd1379b17 100644
--- a/sysdeps/x86_64/fpu/Versions
+++ b/sysdeps/x86_64/fpu/Versions
@@ -12,5 +12,9 @@  libmvec {
     _ZGVbN4v_expf; _ZGVcN8v_expf; _ZGVdN8v_expf; _ZGVeN16v_expf;
     _ZGVbN4vv_powf; _ZGVcN8vv_powf; _ZGVdN8vv_powf; _ZGVeN16vv_powf;
     _ZGVbN4vvv_sincosf; _ZGVcN8vvv_sincosf; _ZGVdN8vvv_sincosf; _ZGVeN16vvv_sincosf;
+  }
+  GLIBC_2.35 {
+    _ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
+    _ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
   }
 }
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index 312575f933..85a568ed29 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -25,6 +25,26 @@  float: 1
 float128: 1
 ldouble: 2
 
+Function: "acos_vlen16":
+float: 1
+
+Function: "acos_vlen2":
+double: 1
+
+Function: "acos_vlen4":
+double: 1
+float: 2
+
+Function: "acos_vlen4_avx2":
+double: 1
+
+Function: "acos_vlen8":
+double: 1
+float: 2
+
+Function: "acos_vlen8_avx2":
+float: 1
+
 Function: "acosh":
 double: 2
 float: 2
diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
new file mode 100644
index 0000000000..3aed563dde
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
@@ -0,0 +1,39 @@ 
+/* Common definition for libmathvec ifunc selections optimized with
+   AVX512.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+#undef PASTER2
+#define PASTER2(x,y)   x##_##y
+
+extern void REDIRECT_NAME (void);
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_wrapper) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (skx) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+  /* skx needs usable AVX512DQ and MathVec_Prefer_No_AVX512 clear.  */
+  if (!CPU_FEATURES_ARCH_P (cpu_features, MathVec_Prefer_No_AVX512)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ))
+    return OPTIMIZE (skx);
+  /* Otherwise dispatch to the AVX2 wrapper variant.  */
+  return OPTIMIZE (avx2_wrapper);
+}
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
new file mode 100644
index 0000000000..25fb8d0cac
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
@@ -0,0 +1,20 @@ 
+/* SSE2 version of vectorized acos, vector length is 2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVbN2v_acos _ZGVbN2v_acos_sse2
+#include "../svml_d_acos2_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
new file mode 100644
index 0000000000..5ba5d6fac2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
@@ -0,0 +1,27 @@ 
+/* Multiple versions of vectorized acos, vector length is 2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVbN2v_acos
+#include "ifunc-mathvec-sse4_1.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVbN2v_acos, __GI__ZGVbN2v_acos, __redirect__ZGVbN2v_acos)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
new file mode 100644
index 0000000000..0c898e70ab
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
@@ -0,0 +1,399 @@ 
+/* Function acos vectorized with SSE4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
+
+/* Offsets for data table __svml_dacos_data_internal
+ */
+#define _SgnBit                       	0
+#define _OneHalf                      	64
+#define _SmallNorm                    	128
+#define _dRsqrtMsk                    	192
+#define _MOne                         	256
+#define _HalfMask                     	320
+#define _Two                          	384
+#define _sqrt_coeff                   	448
+#define _poly_coeff                   	512
+#define _PiL                          	704
+#define _PiH                          	768
+#define _Pi2L                         	832
+#define _Pi2H                         	896
+#define _Zero                         	960
+#define _SgnMask                      	1024
+#define _NanMask                      	1088
+#define _ep_coeff                     	1152
+#define _dInfs                        	1280
+#define _dOnes                        	1344
+#define _dZeros                       	1408
+
+#include <sysdep.h>
+
+        .text
+	.section .text.sse4,"ax",@progbits
+ENTRY(_ZGVbN2v_acos_sse4)
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-64, %rsp
+        subq      $256, %rsp
+        movups    __svml_dacos_data_internal(%rip), %xmm3
+        movups    _OneHalf+__svml_dacos_data_internal(%rip), %xmm6
+
+/* x = -|arg| */
+        movaps    %xmm3, %xmm2
+        orps      %xmm0, %xmm2
+
+/* Y = 0.5 + 0.5*x, i.e. 0.5 - 0.5*|arg| */
+        movaps    %xmm6, %xmm4
+        mulpd     %xmm2, %xmm4
+        addpd     %xmm4, %xmm6
+
+/* S ~ 2*sqrt(Y) */
+        cvtpd2ps  %xmm6, %xmm7
+
+/* NaN processed in special branch (so wind test passed) */
+        movups    _MOne+__svml_dacos_data_internal(%rip), %xmm1
+
+/* x^2 */
+        movaps    %xmm2, %xmm5
+        cmpnlepd  %xmm2, %xmm1
+        mulpd     %xmm2, %xmm5
+        movmskpd  %xmm1, %edx
+        movlhps   %xmm7, %xmm7
+        andps     %xmm0, %xmm3
+        movups    %xmm8, 144(%rsp)
+        rsqrtps   %xmm7, %xmm1
+        minpd     %xmm6, %xmm5
+        cvtps2pd  %xmm1, %xmm8
+        movaps    %xmm6, %xmm1
+        movaps    %xmm5, %xmm4
+        cmpltpd   _SmallNorm+__svml_dacos_data_internal(%rip), %xmm1
+        cmpnltpd  %xmm6, %xmm4
+        addpd     %xmm6, %xmm6
+        andnps    %xmm8, %xmm1
+        movups    %xmm14, 176(%rsp)
+        movaps    %xmm1, %xmm14
+        mulpd     %xmm1, %xmm14
+        mulpd     %xmm6, %xmm1
+        mulpd     %xmm14, %xmm6
+        subpd     _Two+__svml_dacos_data_internal(%rip), %xmm6
+        movups    %xmm15, 160(%rsp)
+        movaps    %xmm6, %xmm8
+        movups    _sqrt_coeff+__svml_dacos_data_internal(%rip), %xmm15
+        mulpd     %xmm6, %xmm15
+        mulpd     %xmm1, %xmm8
+        addpd     _sqrt_coeff+16+__svml_dacos_data_internal(%rip), %xmm15
+        mulpd     %xmm6, %xmm15
+        addpd     _sqrt_coeff+32+__svml_dacos_data_internal(%rip), %xmm15
+        mulpd     %xmm15, %xmm6
+        addpd     _sqrt_coeff+48+__svml_dacos_data_internal(%rip), %xmm6
+        mulpd     %xmm6, %xmm8
+
+/* polynomial */
+        movups    _poly_coeff+__svml_dacos_data_internal(%rip), %xmm6
+        movaps    %xmm5, %xmm15
+        mulpd     %xmm5, %xmm6
+        mulpd     %xmm5, %xmm15
+        addpd     _poly_coeff+16+__svml_dacos_data_internal(%rip), %xmm6
+        subpd     %xmm8, %xmm1
+        mulpd     %xmm15, %xmm6
+        movups    _poly_coeff+32+__svml_dacos_data_internal(%rip), %xmm14
+        andps     %xmm4, %xmm1
+        mulpd     %xmm5, %xmm14
+        movups    _poly_coeff+64+__svml_dacos_data_internal(%rip), %xmm7
+        mulpd     %xmm5, %xmm7
+        addpd     _poly_coeff+48+__svml_dacos_data_internal(%rip), %xmm14
+        addpd     _poly_coeff+80+__svml_dacos_data_internal(%rip), %xmm7
+        addpd     %xmm6, %xmm14
+        mulpd     %xmm15, %xmm7
+        movups    _poly_coeff+96+__svml_dacos_data_internal(%rip), %xmm8
+        movaps    %xmm15, %xmm6
+        mulpd     %xmm5, %xmm8
+        mulpd     %xmm15, %xmm6
+        addpd     _poly_coeff+112+__svml_dacos_data_internal(%rip), %xmm8
+        mulpd     %xmm6, %xmm14
+        addpd     %xmm7, %xmm8
+        movups    _poly_coeff+128+__svml_dacos_data_internal(%rip), %xmm7
+        mulpd     %xmm5, %xmm7
+        addpd     %xmm14, %xmm8
+        addpd     _poly_coeff+144+__svml_dacos_data_internal(%rip), %xmm7
+        mulpd     %xmm15, %xmm8
+        movups    _poly_coeff+160+__svml_dacos_data_internal(%rip), %xmm6
+
+/* X<X^2 iff X<0 */
+        movaps    %xmm0, %xmm14
+        addpd     %xmm8, %xmm7
+        cmpltpd   %xmm5, %xmm14
+        mulpd     %xmm5, %xmm6
+        mulpd     %xmm7, %xmm15
+        addpd     _poly_coeff+176+__svml_dacos_data_internal(%rip), %xmm6
+        addpd     %xmm15, %xmm6
+        mulpd     %xmm5, %xmm6
+        movaps    %xmm4, %xmm7
+        movaps    %xmm4, %xmm5
+        andnps    %xmm2, %xmm7
+        orps      %xmm1, %xmm7
+        pxor      %xmm3, %xmm7
+        mulpd     %xmm7, %xmm6
+        movups    _PiH+__svml_dacos_data_internal(%rip), %xmm8
+        andps     %xmm4, %xmm8
+        andnps    _Pi2H+__svml_dacos_data_internal(%rip), %xmm5
+        andps     %xmm14, %xmm8
+        addpd     %xmm5, %xmm8
+        addpd     %xmm6, %xmm7
+        addpd     %xmm7, %xmm8
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        movups    176(%rsp), %xmm14
+        cfi_restore(31)
+        movaps    %xmm8, %xmm0
+        movups    144(%rsp), %xmm8
+        cfi_restore(25)
+        movups    160(%rsp), %xmm15
+        cfi_restore(32)
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        movups    %xmm0, 128(%rsp)
+        movups    %xmm8, 192(%rsp)
+        xorl      %eax, %eax
+        movups    %xmm9, 64(%rsp)
+        movups    %xmm10, 48(%rsp)
+        movups    %xmm11, 32(%rsp)
+        movups    %xmm12, 16(%rsp)
+        movups    %xmm13, (%rsp)
+        movq      %rsi, 88(%rsp)
+        movq      %rdi, 80(%rsp)
+        movq      %r12, 112(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -144; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x70, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d
+        movq      %r13, 104(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -152; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x68, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d
+        movq      %r14, 96(%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -160; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x60, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $2, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+        movups    64(%rsp), %xmm9
+        cfi_restore(26)
+        movups    48(%rsp), %xmm10
+        cfi_restore(27)
+        movups    32(%rsp), %xmm11
+        cfi_restore(28)
+        movups    16(%rsp), %xmm12
+        cfi_restore(29)
+        movups    (%rsp), %xmm13
+        cfi_restore(30)
+        movq      88(%rsp), %rsi
+        cfi_restore(4)
+        movq      80(%rsp), %rdi
+        cfi_restore(5)
+        movq      112(%rsp), %r12
+        cfi_restore(12)
+        movq      104(%rsp), %r13
+        cfi_restore(13)
+        movq      96(%rsp), %r14
+        cfi_restore(14)
+        movups    192(%rsp), %xmm8
+
+/* Go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -144; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x70, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -152; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x68, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -160; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x60, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movsd     128(%rsp,%r14,8), %xmm0
+        call      acos@PLT
+        movsd     %xmm0, 192(%rsp,%r14,8)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVbN2v_acos_sse4)
+
+        .section .rodata, "a"
+        .align 64
+
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+    __declspec(align(64)) VUINT32 SgnBit[2][2];
+    __declspec(align(64)) VUINT32 OneHalf[2][2];
+    __declspec(align(64)) VUINT32 SmallNorm[2][2];
+    __declspec(align(64)) VUINT32 dRsqrtMsk[2][2];
+    __declspec(align(64)) VUINT32 MOne[2][2];
+    __declspec(align(64)) VUINT32 HalfMask[2][2];
+    __declspec(align(64)) VUINT32 Two[2][2];
+    __declspec(align(64)) VUINT32 sqrt_coeff[4][2][2];
+    __declspec(align(64)) VUINT32 poly_coeff[12][2][2];
+    __declspec(align(64)) VUINT32 PiL[2][2];
+    __declspec(align(64)) VUINT32 PiH[2][2];
+    __declspec(align(64)) VUINT32 Pi2L[2][2];
+    __declspec(align(64)) VUINT32 Pi2H[2][2];
+    __declspec(align(64)) VUINT32 Zero[2][2];
+    __declspec(align(64)) VUINT32 SgnMask[2][2];
+    __declspec(align(64)) VUINT32 NanMask[2][2];
+    __declspec(align(64)) VUINT32 ep_coeff[6][2][2];
+    /* scalar part follows */
+    __declspec(align(64)) VUINT32 dInfs[2][2];
+    __declspec(align(64)) VUINT32 dOnes[2][2];
+    __declspec(align(64)) VUINT32 dZeros[2][2];
+} __svml_dacos_data_internal_t;
+#endif
+__svml_dacos_data_internal:
+        /*== SgnBit ==*/
+        .quad 0x8000000000000000, 0x8000000000000000
+        /*== OneHalf ==*/
+        .align 64
+        .quad 0x3fe0000000000000, 0x3fe0000000000000
+        /*== SmallNorm ==*/
+        .align 64
+        .quad 0x3000000000000000, 0x3000000000000000
+        /*== dRsqrtMsk ==*/
+        .align 64
+        .quad 0xffffff0000000000, 0xffffff0000000000
+        /*== MOne ==*/
+        .align 64
+        .quad 0xbff0000000000000, 0xbff0000000000000
+        /*== HalfMask ==*/
+        .align 64
+        .quad 0xfffffffffc000000, 0xfffffffffc000000
+        /*== Two ==*/
+        .align 64
+        .quad 0x4000000000000000, 0x4000000000000000
+        /*== sqrt_coeff[4] ==*/
+        .align 64
+        .quad 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+        /*== poly_coeff[12] ==*/
+        .align 64
+        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+        .quad 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+        /*== PiL ==*/
+        .align 64
+        .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07
+        /*== PiH ==*/
+        .align 64
+        .quad 0x400921fb54442d18, 0x400921fb54442d18
+        /*== Pi2L ==*/
+        .align 64
+        .quad 0x3c91a62633145c07, 0x3c91a62633145c07
+        /*== Pi2H ==*/
+        .align 64
+        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
+        /*== Zero ==*/
+        .align 64
+        .quad 0x0000000000000000, 0x0000000000000000
+        /*== sgn mask ==*/
+        .align 64
+        .quad 0x8000000000000000, 0x8000000000000000
+        /*== NaN mask ==*/
+        .align 64
+        .quad 0xfffc000000000000, 0xfffc000000000000
+        /*== ep_coeff[6] ==*/
+        .align 64
+        .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
+        .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
+        .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
+        .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
+        .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
+        .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
+        /* scalar part follows */
+        /*== dInfs = DP infinity, +/- ==*/
+        .align 64
+        .quad 0x7ff0000000000000, 0xfff0000000000000
+        /*== dOnes = DP one, +/- ==*/
+        .align 64
+        .quad 0x3ff0000000000000, 0xbff0000000000000
+        /*== dZeros = DP zero +/- ==*/
+        .align 64
+        .quad 0x0000000000000000, 0x8000000000000000
+        .align 64
+        .type	__svml_dacos_data_internal,@object
+        .size	__svml_dacos_data_internal,1472
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
new file mode 100644
index 0000000000..750f71c81c
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
@@ -0,0 +1,20 @@ 
+/* SSE version of vectorized acos, vector length is 4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVdN4v_acos _ZGVdN4v_acos_sse_wrapper
+#include "../svml_d_acos4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
new file mode 100644
index 0000000000..6453e7ebe2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
@@ -0,0 +1,27 @@ 
+/* Multiple versions of vectorized acos, vector length is 4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVdN4v_acos
+#include "ifunc-mathvec-avx2.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVdN4v_acos, __GI__ZGVdN4v_acos, __redirect__ZGVdN4v_acos)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
new file mode 100644
index 0000000000..684d501a3d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
@@ -0,0 +1,368 @@ 
+/* Function acos vectorized with AVX2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
+
+/* Offsets for data table __svml_dacos_data_internal
+ */
+#define _SgnBit                       	0
+#define _OneHalf                      	64
+#define _SmallNorm                    	128
+#define _dRsqrtMsk                    	192
+#define _MOne                         	256
+#define _HalfMask                     	320
+#define _Two                          	384
+#define _sqrt_coeff                   	448
+#define _poly_coeff                   	576
+#define _PiL                          	960
+#define _PiH                          	1024
+#define _Pi2L                         	1088
+#define _Pi2H                         	1152
+#define _Zero                         	1216
+#define _SgnMask                      	1280
+#define _NanMask                      	1344
+#define _ep_coeff                     	1408
+#define _dInfs                        	1600
+#define _dOnes                        	1664
+#define _dZeros                       	1728
+
+#include <sysdep.h>
+
+        .text
+	.section .text.avx2,"ax",@progbits
+ENTRY(_ZGVdN4v_acos_avx2)
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-64, %rsp
+        subq      $384, %rsp
+        vmovupd   __svml_dacos_data_internal(%rip), %ymm6
+        vmovupd   _OneHalf+__svml_dacos_data_internal(%rip), %ymm7
+        vmovups   %ymm8, 96(%rsp)
+        vmovups   %ymm10, 192(%rsp)
+        vmovups   %ymm9, 160(%rsp)
+        vmovups   %ymm11, 224(%rsp)
+        vmovups   %ymm12, 256(%rsp)
+        vmovups   %ymm13, 288(%rsp)
+        vmovups   %ymm15, 352(%rsp)
+        vmovups   %ymm14, 320(%rsp)
+        vmovapd   %ymm0, %ymm5
+
+/* x = -|arg| */
+        vorpd     %ymm5, %ymm6, %ymm4
+
+/* Y = 0.5 + 0.5*x, i.e. 0.5 - 0.5*|arg| */
+        vfmadd231pd %ymm4, %ymm7, %ymm7
+
+/* x^2 */
+        vmulpd    %ymm4, %ymm4, %ymm1
+
+/* NaN processed in special branch (so wind test passed) */
+        vcmpnge_uqpd _MOne+__svml_dacos_data_internal(%rip), %ymm4, %ymm0
+
+/* S ~ 2*sqrt(Y) */
+        vcmplt_oqpd _SmallNorm+__svml_dacos_data_internal(%rip), %ymm7, %ymm9
+        vaddpd    %ymm7, %ymm7, %ymm13
+        vminpd    %ymm7, %ymm1, %ymm2
+        vandpd    %ymm5, %ymm6, %ymm3
+        vcvtpd2ps %ymm7, %xmm6
+        vcmpnlt_uqpd %ymm7, %ymm2, %ymm1
+        vmovupd   _poly_coeff+192+__svml_dacos_data_internal(%rip), %ymm7
+        vrsqrtps  %xmm6, %xmm8
+        vmovupd   _poly_coeff+64+__svml_dacos_data_internal(%rip), %ymm6
+        vfmadd213pd _poly_coeff+224+__svml_dacos_data_internal(%rip), %ymm2, %ymm7
+        vcvtps2pd %xmm8, %ymm10
+        vfmadd213pd _poly_coeff+96+__svml_dacos_data_internal(%rip), %ymm2, %ymm6
+        vandnpd   %ymm10, %ymm9, %ymm11
+        vmulpd    %ymm11, %ymm11, %ymm12
+        vmulpd    %ymm13, %ymm11, %ymm15
+        vmovupd   _poly_coeff+128+__svml_dacos_data_internal(%rip), %ymm9
+        vmulpd    %ymm2, %ymm2, %ymm11
+        vmovupd   _poly_coeff+256+__svml_dacos_data_internal(%rip), %ymm10
+        vfmsub213pd _Two+__svml_dacos_data_internal(%rip), %ymm12, %ymm13
+        vmovupd   _poly_coeff+320+__svml_dacos_data_internal(%rip), %ymm12
+        vfmadd213pd _poly_coeff+160+__svml_dacos_data_internal(%rip), %ymm2, %ymm9
+        vmulpd    %ymm11, %ymm11, %ymm8
+        vfmadd213pd _poly_coeff+288+__svml_dacos_data_internal(%rip), %ymm2, %ymm10
+        vmulpd    %ymm13, %ymm15, %ymm14
+        vfmadd213pd _poly_coeff+352+__svml_dacos_data_internal(%rip), %ymm2, %ymm12
+        vfmadd213pd %ymm7, %ymm11, %ymm9
+        vmovmskpd %ymm0, %edx
+        vmovupd   _sqrt_coeff+__svml_dacos_data_internal(%rip), %ymm0
+        vfmadd213pd _sqrt_coeff+32+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
+        vfmadd213pd _sqrt_coeff+64+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
+        vfmadd213pd _sqrt_coeff+96+__svml_dacos_data_internal(%rip), %ymm13, %ymm0
+
+/* polynomial */
+        vmovupd   _poly_coeff+__svml_dacos_data_internal(%rip), %ymm13
+        vfnmadd213pd %ymm15, %ymm14, %ymm0
+        vfmadd213pd _poly_coeff+32+__svml_dacos_data_internal(%rip), %ymm2, %ymm13
+        vblendvpd %ymm1, %ymm0, %ymm4, %ymm4
+        vfmadd213pd %ymm6, %ymm11, %ymm13
+
+/* X<X^2 iff X<0 */
+        vcmplt_oqpd %ymm2, %ymm5, %ymm6
+        vfmadd213pd %ymm9, %ymm8, %ymm13
+        vfmadd213pd %ymm10, %ymm11, %ymm13
+        vfmadd213pd %ymm12, %ymm11, %ymm13
+        vmulpd    %ymm13, %ymm2, %ymm14
+        vxorpd    %ymm3, %ymm4, %ymm3
+        vandpd    _PiH+__svml_dacos_data_internal(%rip), %ymm1, %ymm2
+        vfmadd213pd %ymm3, %ymm3, %ymm14
+        vandpd    %ymm6, %ymm2, %ymm2
+        vandnpd   _Pi2H+__svml_dacos_data_internal(%rip), %ymm1, %ymm7
+        vaddpd    %ymm7, %ymm2, %ymm8
+        vaddpd    %ymm14, %ymm8, %ymm0
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        vmovups   96(%rsp), %ymm8
+        cfi_restore(91)
+        vmovups   160(%rsp), %ymm9
+        cfi_restore(92)
+        vmovups   192(%rsp), %ymm10
+        cfi_restore(93)
+        vmovups   224(%rsp), %ymm11
+        cfi_restore(94)
+        vmovups   256(%rsp), %ymm12
+        cfi_restore(95)
+        vmovups   288(%rsp), %ymm13
+        cfi_restore(96)
+        vmovups   320(%rsp), %ymm14
+        cfi_restore(97)
+        vmovups   352(%rsp), %ymm15
+        cfi_restore(98)
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovupd   %ymm5, 64(%rsp)	/* Spill the input lanes.  */
+        vmovupd   %ymm0, 128(%rsp)	/* Spill the fast-path results.  */
+
+/* Only reached with a non-zero mask in %edx: set up the per-lane
+ * scalar fallback loop.  */
+        xorl      %eax, %eax
+        vzeroupper
+        movq      %rsi, 8(%rsp)
+        movq      %rdi, (%rsp)
+        movq      %r12, 32(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
+        movl      %eax, %r12d	/* r12d = lane index, starting at 0.  */
+        movq      %r13, 24(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
+        movl      %edx, %r13d	/* r13d = mask of lanes needing acos.  */
+        movq      %r14, 16(%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $4, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+        movq      8(%rsp), %rsi
+        cfi_restore(4)
+        movq      (%rsp), %rdi
+        cfi_restore(5)
+        movq      32(%rsp), %r12
+        cfi_restore(12)
+        movq      24(%rsp), %r13
+        cfi_restore(13)
+        movq      16(%rsp), %r14
+        cfi_restore(14)
+        vmovupd   128(%rsp), %ymm0
+
+/* Go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movsd     64(%rsp,%r14,8), %xmm0
+        call      acos@PLT
+        movsd     %xmm0, 128(%rsp,%r14,8)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVdN4v_acos_avx2)
+
+        .section .rodata, "a"
+        .align 64
+/* Constant table __svml_dacos_data_internal.  Every vector entry is one 64-bit pattern repeated 4 times and each section is re-aligned to 64 bytes.  The C struct below is only seen by the compiler if __svml_dacos_data_internal_typedef is defined; it lists the same fields in the same order.  */
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+    __declspec(align(64)) VUINT32 SgnBit[4][2];
+    __declspec(align(64)) VUINT32 OneHalf[4][2];
+    __declspec(align(64)) VUINT32 SmallNorm[4][2];
+    __declspec(align(64)) VUINT32 dRsqrtMsk[4][2];
+    __declspec(align(64)) VUINT32 MOne[4][2];
+    __declspec(align(64)) VUINT32 HalfMask[4][2];
+    __declspec(align(64)) VUINT32 Two[4][2];
+    __declspec(align(64)) VUINT32 sqrt_coeff[4][4][2];
+    __declspec(align(64)) VUINT32 poly_coeff[12][4][2];
+    __declspec(align(64)) VUINT32 PiL[4][2];
+    __declspec(align(64)) VUINT32 PiH[4][2];
+    __declspec(align(64)) VUINT32 Pi2L[4][2];
+    __declspec(align(64)) VUINT32 Pi2H[4][2];
+    __declspec(align(64)) VUINT32 Zero[4][2];
+    __declspec(align(64)) VUINT32 SgnMask[4][2];
+    __declspec(align(64)) VUINT32 NanMask[4][2];
+    __declspec(align(64)) VUINT32 ep_coeff[6][4][2];
+    /* scalar part follows */
+    __declspec(align(64)) VUINT32 dInfs[2][2];
+    __declspec(align(64)) VUINT32 dOnes[2][2];
+    __declspec(align(64)) VUINT32 dZeros[2][2];
+} __svml_dacos_data_internal_t;
+#endif
+__svml_dacos_data_internal:
+        /*== SgnBit ==*/
+        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+        /*== OneHalf ==*/
+        .align 64
+        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
+        /*== SmallNorm ==*/
+        .align 64
+        .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
+        /*== dRsqrtMsk ==*/
+        .align 64
+        .quad 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000
+        /*== MOne ==*/
+        .align 64
+        .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
+        /*== HalfMask ==*/
+        .align 64
+        .quad 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000
+        /*== Two ==*/
+        .align 64
+        .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
+        /*== sqrt_coeff[4]: rows are stored in descending tag order, sqrt_coeff4 first ==*/
+        .align 64
+        .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+        /*== poly_coeff[12]: rows are stored in descending tag order, poly_coeff12 first ==*/
+        .align 64
+        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+        .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+        /*== PiL ==*/
+        .align 64
+        .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07
+        /*== PiH ==*/
+        .align 64
+        .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
+        /*== Pi2L ==*/
+        .align 64
+        .quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
+        /*== Pi2H ==*/
+        .align 64
+        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
+        /*== Zero ==*/
+        .align 64
+        .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+        /*== SgnMask (same bit pattern as SgnBit) ==*/
+        .align 64
+        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+        /*== NanMask ==*/
+        .align 64
+        .quad 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000
+        /*== ep_coeff[6]: rows are stored in descending tag order, ep_coeff6 first ==*/
+        .align 64
+        .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
+        .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
+        .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
+        .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
+        .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
+        .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
+        /* scalar part follows */
+        /*== dInfs = DP infinity, +/- ==*/
+        .align 64
+        .quad 0x7ff0000000000000, 0xfff0000000000000
+        /*== dOnes = DP one, +/- ==*/
+        .align 64
+        .quad 0x3ff0000000000000, 0xbff0000000000000
+        /*== dZeros = DP zero +/- ==*/
+        .align 64
+        .quad 0x0000000000000000, 0x8000000000000000
+        .align 64
+        .type	__svml_dacos_data_internal,@object
+        .size	__svml_dacos_data_internal,1792  /* 28 aligned 64-byte slots */
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
new file mode 100644
index 0000000000..4d64fd1c00
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
@@ -0,0 +1,20 @@ 
+/* AVX2 version of vectorized acos, vector length is 8.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVeN8v_acos _ZGVeN8v_acos_avx2_wrapper  /* every _ZGVeN8v_acos in the included source now expands to the _avx2_wrapper name */
+#include "../svml_d_acos8_core.S"  /* NOTE(review): not part of this hunk; presumably it defines _ZGVeN8v_acos - confirm */
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
new file mode 100644
index 0000000000..1e7d1865fb
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
@@ -0,0 +1,27 @@ 
+/* Multiple versions of vectorized acos, vector length is 8.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVeN8v_acos
+#include "ifunc-mathvec-avx512-skx.h"
+/* SYMBOL_NAME is set before the header above is included.  NOTE(review): REDIRECT_NAME and IFUNC_SELECTOR are not defined in this file; presumably the header derives them from SYMBOL_NAME and this line installs the ifunc resolver - confirm.  */
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+/* Only when SHARED is defined: declare the __GI_ name for the same symbol with hidden visibility.  */
+#ifdef SHARED
+__hidden_ver1 (_ZGVeN8v_acos, __GI__ZGVeN8v_acos, __redirect__ZGVeN8v_acos)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
new file mode 100644
index 0000000000..52832893ec
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
@@ -0,0 +1,386 @@ 
+/* Function acos vectorized with AVX-512.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
+
+/* Byte offsets of the entries of data table __svml_dacos_data_internal; consecutive entries are 64 bytes apart.
+ * _sqrt_coeff_N, _poly_coeff_N and _ep_coeff_N number the rows in storage order, so N = 1 is the first stored row (the one the table tags sqrt_coeff4, poly_coeff12 and ep_coeff6 respectively).  */
+#define _SgnBit                       	0
+#define _OneHalf                      	64
+#define _SmallNorm                    	128
+#define _dRsqrtMsk                    	192
+#define _MOne                         	256
+#define _HalfMask                     	320
+#define _Two                          	384
+#define _sqrt_coeff_1                 	448
+#define _sqrt_coeff_2                 	512
+#define _sqrt_coeff_3                 	576
+#define _sqrt_coeff_4                 	640
+#define _poly_coeff_1                 	704
+#define _poly_coeff_2                 	768
+#define _poly_coeff_3                 	832
+#define _poly_coeff_4                 	896
+#define _poly_coeff_5                 	960
+#define _poly_coeff_6                 	1024
+#define _poly_coeff_7                 	1088
+#define _poly_coeff_8                 	1152
+#define _poly_coeff_9                 	1216
+#define _poly_coeff_10                	1280
+#define _poly_coeff_11                	1344
+#define _poly_coeff_12                	1408
+#define _PiL                          	1472
+#define _PiH                          	1536
+#define _Pi2L                         	1600
+#define _Pi2H                         	1664
+#define _Zero                         	1728
+#define _SgnMask                      	1792
+#define _NanMask                      	1856
+#define _ep_coeff_1                   	1920
+#define _ep_coeff_2                   	1984
+#define _ep_coeff_3                   	2048
+#define _ep_coeff_4                   	2112
+#define _ep_coeff_5                   	2176
+#define _ep_coeff_6                   	2240
+#define _dInfs                        	2304
+#define _dOnes                        	2368
+#define _dZeros                       	2432
+
+#include <sysdep.h>
+
+        .text
+	.section .text.evex512,"ax",@progbits
+ENTRY(_ZGVeN8v_acos_skx)
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-64, %rsp
+        subq      $256, %rsp
+        vmovups   __svml_dacos_data_internal(%rip), %zmm7
+        vmovups   _OneHalf+__svml_dacos_data_internal(%rip), %zmm8
+/* In: %zmm0 = 8 doubles (copied to %zmm6).  Out: %zmm0 = acos per lane.
+ * S ~ 2*sqrt(Y) */
+        vmovups   _SmallNorm+__svml_dacos_data_internal(%rip), %zmm11
+        vmovups   _Two+__svml_dacos_data_internal(%rip), %zmm14
+        vmovups   _sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15
+        vmovups   _sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2
+        vmovups   _sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1
+        vmovups   _MOne+__svml_dacos_data_internal(%rip), %zmm10
+        vmovaps   %zmm0, %zmm6
+
+/* x = -|arg| */
+        vorpd     %zmm6, %zmm7, %zmm5
+        vandpd    %zmm6, %zmm7, %zmm4  /* zmm4 = sign bit of arg */
+
+/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg| */
+        vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8
+
+/* x^2 */
+        vmulpd    {rn-sae}, %zmm5, %zmm5, %zmm9
+        vrsqrt14pd %zmm8, %zmm12
+        vcmppd    $17, {sae}, %zmm11, %zmm8, %k2
+        vcmppd    $17, {sae}, %zmm10, %zmm5, %k0  /* k0: lanes with x < -1.0, handled by scalar acos below */
+        vmovups   _poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10
+        vmovups   _poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11
+        vminpd    {sae}, %zmm8, %zmm9, %zmm3
+        vmovups   _poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9
+        vxorpd    %zmm12, %zmm12, %zmm12{%k2}
+        vaddpd    {rn-sae}, %zmm8, %zmm8, %zmm0
+        vcmppd    $21, {sae}, %zmm8, %zmm3, %k1
+
+/* X<X^2 iff X<0 */
+        vcmppd    $17, {sae}, %zmm3, %zmm6, %k3
+        vmulpd    {rn-sae}, %zmm12, %zmm12, %zmm13
+        vmulpd    {rn-sae}, %zmm12, %zmm0, %zmm7
+        vmovups   _poly_coeff_4+__svml_dacos_data_internal(%rip), %zmm12
+
+/* polynomial */
+        vmovups   _poly_coeff_1+__svml_dacos_data_internal(%rip), %zmm8
+        vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0
+        vmovups   _sqrt_coeff_4+__svml_dacos_data_internal(%rip), %zmm13
+        vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12
+        vmovups   _poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm9
+        vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2
+        vmovups   _poly_coeff_9+__svml_dacos_data_internal(%rip), %zmm15
+        vmulpd    {rn-sae}, %zmm0, %zmm7, %zmm14
+        vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2
+        vmovups   _poly_coeff_2+__svml_dacos_data_internal(%rip), %zmm1
+        kmovw     %k1, %eax
+        kmovw     %k3, %ecx
+        kmovw     %k0, %edx
+        vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2
+        vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1
+        vmovups   _poly_coeff_10+__svml_dacos_data_internal(%rip), %zmm8
+        vmulpd    {rn-sae}, %zmm3, %zmm3, %zmm0
+        vfnmadd213pd {rn-sae}, %zmm7, %zmm14, %zmm2
+        vmovups   _poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm7
+        vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8
+        vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1
+        vblendmpd %zmm2, %zmm5, %zmm2{%k1}
+        vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7
+        vmovups   _poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm10
+        vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
+        andl      %eax, %ecx
+        vmovups   _poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
+        kmovw     %ecx, %k2
+        vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7
+        vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11
+        vmulpd    {rn-sae}, %zmm0, %zmm0, %zmm10
+        vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1
+        vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1
+        vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
+        vmovups   _Pi2H+__svml_dacos_data_internal(%rip), %zmm0
+        vmulpd    {rn-sae}, %zmm3, %zmm1, %zmm1
+        vxorpd    %zmm4, %zmm2, %zmm3
+        vxorpd    %zmm0, %zmm0, %zmm0{%k1}
+        vfmadd213pd {rn-sae}, %zmm3, %zmm3, %zmm1
+        vorpd     _PiH+__svml_dacos_data_internal(%rip), %zmm0, %zmm0{%k2}
+        vaddpd    {rn-sae}, %zmm1, %zmm0, %zmm0
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovups   %zmm6, 128(%rsp)  /* original arguments, one slot per lane */
+        vmovups   %zmm0, 192(%rsp)  /* vector-path results, patched per lane below */
+
+/* Only reached via the jne above, so the mask in %edx
+ * is nonzero: go straight to the per-lane scan.  */
+        xorl      %eax, %eax
+        vzeroupper
+        kmovw     %k4, 24(%rsp)  /* NOTE(review): k4-k7, %rsi and %rdi are not callee-saved in the SysV ABI; confirm these saves are needed */
+        kmovw     %k5, 16(%rsp)
+        kmovw     %k6, 8(%rsp)
+        kmovw     %k7, (%rsp)
+        movq      %rsi, 40(%rsp)
+        movq      %rdi, 32(%rsp)
+        movq      %r12, 64(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d  /* r12d = lane index, starts at 0 */
+        movq      %r13, 56(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d  /* r13d = special-lane mask taken from k0 */
+        movq      %r14, 48(%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $8, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+        kmovw     24(%rsp), %k4
+        cfi_restore(122)
+        kmovw     16(%rsp), %k5
+        cfi_restore(123)
+        kmovw     8(%rsp), %k6
+        cfi_restore(124)
+        kmovw     (%rsp), %k7
+        cfi_restore(125)
+        vmovups   192(%rsp), %zmm0
+        movq      40(%rsp), %rsi
+        cfi_restore(4)
+        movq      32(%rsp), %rdi
+        cfi_restore(5)
+        movq      64(%rsp), %r12
+        cfi_restore(12)
+        movq      56(%rsp), %r13
+        cfi_restore(13)
+        movq      48(%rsp), %r14
+        cfi_restore(14)
+
+/* Go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movsd     128(%rsp,%r14,8), %xmm0  /* argument of the flagged lane */
+        call      acos@PLT
+        movsd     %xmm0, 192(%rsp,%r14,8)  /* overwrite that lane of the saved result */
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVeN8v_acos_skx)
+
+        .section .rodata, "a"
+        .align 64
+/* Constant table read by _ZGVeN8v_acos_skx through the offset macros defined near the top of this file.  Every vector entry is one 64-bit pattern repeated 8 times (64 bytes).  The C struct below is only seen by the compiler if __svml_dacos_data_internal_typedef is defined; it lists the same fields in the same order.  */
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+    __declspec(align(64)) VUINT32 SgnBit[8][2];
+    __declspec(align(64)) VUINT32 OneHalf[8][2];
+    __declspec(align(64)) VUINT32 SmallNorm[8][2];
+    __declspec(align(64)) VUINT32 dRsqrtMsk[8][2];
+    __declspec(align(64)) VUINT32 MOne[8][2];
+    __declspec(align(64)) VUINT32 HalfMask[8][2];
+    __declspec(align(64)) VUINT32 Two[8][2];
+    __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2];
+    __declspec(align(64)) VUINT32 poly_coeff[12][8][2];
+    __declspec(align(64)) VUINT32 PiL[8][2];
+    __declspec(align(64)) VUINT32 PiH[8][2];
+    __declspec(align(64)) VUINT32 Pi2L[8][2];
+    __declspec(align(64)) VUINT32 Pi2H[8][2];
+    __declspec(align(64)) VUINT32 Zero[8][2];
+    __declspec(align(64)) VUINT32 SgnMask[8][2];
+    __declspec(align(64)) VUINT32 NanMask[8][2];
+    __declspec(align(64)) VUINT32 ep_coeff[6][8][2];
+    /* scalar part follows */
+    __declspec(align(64)) VUINT32 dInfs[2][2];
+    __declspec(align(64)) VUINT32 dOnes[2][2];
+    __declspec(align(64)) VUINT32 dZeros[2][2];
+} __svml_dacos_data_internal_t;
+#endif
+__svml_dacos_data_internal:
+        /*== SgnBit ==*/
+        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+        /*== OneHalf ==*/
+        .align 64
+        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
+        /*== SmallNorm ==*/
+        .align 64
+        .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
+        /*== dRsqrtMsk ==*/
+        .align 64
+        .quad 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000, 0xffffff0000000000
+        /*== MOne ==*/
+        .align 64
+        .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
+        /*== HalfMask ==*/
+        .align 64
+        .quad 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000, 0xfffffffffc000000
+        /*== Two ==*/
+        .align 64
+        .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
+        /*== sqrt_coeff[4]: the first row sits at offset _sqrt_coeff_1 although it is tagged sqrt_coeff4 ==*/
+        .align 64
+        .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+        /*== poly_coeff[12]: the first row sits at offset _poly_coeff_1 although it is tagged poly_coeff12 ==*/
+        .align 64
+        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+        .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+        /*== PiL ==*/
+        .align 64
+        .quad 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07, 0x3ca1a62633145c07
+        /*== PiH ==*/
+        .align 64
+        .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
+        /*== Pi2L ==*/
+        .align 64
+        .quad 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07, 0x3c91a62633145c07
+        /*== Pi2H ==*/
+        .align 64
+        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
+        /*== Zero ==*/
+        .align 64
+        .quad 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+        /*== SgnMask (same bit pattern as SgnBit) ==*/
+        .align 64
+        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+        /*== NanMask ==*/
+        .align 64
+        .quad 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000, 0xfffc000000000000
+        /*== ep_coeff[6]: the first row sits at offset _ep_coeff_1 although it is tagged ep_coeff6 ==*/
+        .align 64
+        .quad 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E, 0x3fa36C5AF645A11E /* ep_coeff6 */
+        .quad 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282, 0x3f8CE147EA9E9282 /* ep_coeff5 */
+        .quad 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155, 0x3fa056B4151FA155 /* ep_coeff4 */
+        .quad 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54, 0x3fa6C8ED2A4CCE54 /* ep_coeff3 */
+        .quad 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A, 0x3fb33399EBF85B6A /* ep_coeff2 */
+        .quad 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45, 0x3fc5555480C83A45 /* ep_coeff1 */
+        /* scalar part follows */
+        /*== dInfs = DP infinity, +/- ==*/
+        .align 64
+        .quad 0x7ff0000000000000, 0xfff0000000000000
+        /*== dOnes = DP one, +/- ==*/
+        .align 64
+        .quad 0x3ff0000000000000, 0xbff0000000000000
+        /*== dZeros = DP zero +/- ==*/
+        .align 64
+        .quad 0x0000000000000000, 0x8000000000000000
+        .align 64
+        .type	__svml_dacos_data_internal,@object
+        .size	__svml_dacos_data_internal,2496  /* 39 aligned 64-byte slots; last offset macro _dZeros = 2432 */
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
new file mode 100644
index 0000000000..1ff0cfc8d5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
@@ -0,0 +1,20 @@ 
+/* AVX2 version of vectorized acosf.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVeN16v_acosf _ZGVeN16v_acosf_avx2_wrapper  /* every _ZGVeN16v_acosf in the included source now expands to the _avx2_wrapper name */
+#include "../svml_s_acosf16_core.S"  /* NOTE(review): not part of this hunk; presumably it defines _ZGVeN16v_acosf - confirm */
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
new file mode 100644
index 0000000000..fcf05782c5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
@@ -0,0 +1,28 @@ 
+/* Multiple versions of vectorized acosf, vector length is 16.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVeN16v_acosf /* must be set before the selector header is included */
+#include "ifunc-mathvec-avx512-skx.h" /* presumably provides REDIRECT_NAME and IFUNC_SELECTOR -- TODO confirm */
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVeN16v_acosf, __GI__ZGVeN16v_acosf,
+	       __redirect__ZGVeN16v_acosf)
+  __attribute__ ((visibility ("hidden")));
+#endif /* SHARED */
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
new file mode 100644
index 0000000000..d30b04a607
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
@@ -0,0 +1,332 @@ 
+/* Function acosf vectorized with AVX-512.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ *
+ */
+
+/* Offsets for data table __svml_sacos_data_internal
+ */
+#define _SgnBit                       	0
+#define _OneHalf                      	64
+#define _sRsqrtMsk                    	128
+#define _SmallNorm                    	192
+#define _MOne                         	256
+#define _HalfMask                     	320
+#define _SQMask                       	384
+#define _Two                          	448
+#define _sqrt_coeff_1                 	512
+#define _sqrt_coeff_2                 	576
+#define _poly_coeff_1                 	640
+#define _poly_coeff_2                 	704
+#define _poly_coeff_3                 	768
+#define _poly_coeff_4                 	832
+#define _poly_coeff_5                 	896
+#define _Pi2H                         	960
+#define _Pi2L                         	1024
+#define _PiH                          	1088
+#define _PiL                          	1152
+#define _Zero                         	1216
+#define _SgnMask                      	1280
+#define _NanMask                      	1344
+#define _ep_coeff_1                   	1408
+#define _ep_coeff_2                   	1472
+#define _ep_coeff_3                   	1536
+
+#include <sysdep.h>
+
+        .text
+	.section .text.evex512,"ax",@progbits
+ENTRY(_ZGVeN16v_acosf_skx) /* in: zmm0 = 16 floats; out: zmm0 = acosf of each lane */
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-64, %rsp /* 64-byte aligned frame ... */
+        subq      $256, %rsp /* ... touched only by the special-inputs path */
+        vmovups   __svml_sacos_data_internal(%rip), %zmm5 /* zmm5 = SgnBit (table offset 0) */
+        vmovups   _OneHalf+__svml_sacos_data_internal(%rip), %zmm6 /* zmm6 = 0.5 */
+
+/* Constants for the special-input test and for SQ ~ 2*sqrt(Y) */
+        vmovups   _SmallNorm+__svml_sacos_data_internal(%rip), %zmm9
+        vmovups   _MOne+__svml_sacos_data_internal(%rip), %zmm8 /* zmm8 = -1.0 */
+        vmovups   _Two+__svml_sacos_data_internal(%rip), %zmm12 /* zmm12 = 2.0 */
+        vmovups   _sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13
+        vmovaps   %zmm0, %zmm4 /* zmm4 = arg, kept for the sign test and the scalar fallback */
+
+/* x = -|arg| in zmm3; zmm2 = sign bit of arg only */
+        vorps     %zmm4, %zmm5, %zmm3
+        vandps    %zmm4, %zmm5, %zmm2
+        vmovups   _sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0
+
+/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg| */
+        vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6 /* zmm6 = zmm6*zmm3 + zmm6 */
+
+/* x^2 */
+        vmulps    {rn-sae}, %zmm3, %zmm3, %zmm7
+        vrsqrt14ps %zmm6, %zmm10 /* zmm10 ~ 1/sqrt(Y) */
+        vcmpps    $17, {sae}, %zmm9, %zmm6, %k2 /* k2 = (Y < SmallNorm) */
+        vcmpps    $22, {sae}, %zmm3, %zmm8, %k0 /* k0 = !(-1 <= x): |arg| > 1 or NaN */
+        vmovups   _poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9
+        vminps    {sae}, %zmm6, %zmm7, %zmm1 /* zmm1 = R2 = min(x^2, Y) */
+        vmovups   _poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7
+        vxorps    %zmm10, %zmm10, %zmm10{%k2} /* force rsqrt to 0 where Y is tiny */
+        vaddps    {rn-sae}, %zmm6, %zmm6, %zmm14 /* zmm14 = 2*Y */
+        vmulps    {rn-sae}, %zmm1, %zmm1, %zmm8 /* zmm8 = R2^2 */
+        vmulps    {rn-sae}, %zmm10, %zmm10, %zmm11 /* zmm11 = rsqrt^2 */
+        vmulps    {rn-sae}, %zmm10, %zmm14, %zmm5 /* zmm5 = Sh = 2*Y*rsqrt ~ 2*sqrt(Y) */
+        vcmpps    $21, {sae}, %zmm6, %zmm1, %k1 /* k1 = !(R2 < Y): Y was picked, i.e. SelMask */
+
+/* k3 = (arg < R2); only used AND-ed with k1 below (X<X^2 iff X<0) */
+        vcmpps    $17, {sae}, %zmm1, %zmm4, %k3
+
+/* polynomial in R2, interleaved with the refinement of 2*sqrt(Y) */
+        vmovups   _poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6
+        vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14 /* zmm14 = E = rsqrt^2*2Y - 2 */
+        vmovups   _poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11
+        vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9 /* zmm9 = c3*R2 + c4 */
+        vmovups   _poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10
+        vmovups   _Pi2H+__svml_sacos_data_internal(%rip), %zmm12 /* zmm12 = Pi2H */
+        vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0 /* zmm0 = s1*E + s2 */
+        vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11 /* zmm11 = c1*R2 + c2 */
+        vmulps    {rn-sae}, %zmm14, %zmm5, %zmm15 /* zmm15 = Sh*E */
+        vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11 /* zmm11 = R2^2*zmm11 + zmm9 */
+        vxorps    %zmm12, %zmm12, %zmm12{%k1} /* additive constant: 0 in SelMask lanes, Pi2H elsewhere */
+        vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0 /* zmm0 = SQ = Sh - Sh*E*(s1*E + s2) */
+        vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11 /* zmm11 = R2*zmm11 + c5 */
+        kmovw     %k1, %eax
+        kmovw     %k3, %ecx
+        kmovw     %k0, %edx /* edx = per-lane special-input mask */
+        vmulps    {rn-sae}, %zmm1, %zmm11, %zmm13 /* zmm13 = R2*poly */
+        vblendmps %zmm0, %zmm3, %zmm0{%k1} /* zmm0 = SelMask ? SQ : x */
+        vxorps    %zmm2, %zmm0, %zmm1 /* zmm1 = t = zmm0 with arg's sign bit xor-ed in */
+        andl      %eax, %ecx
+        kmovw     %ecx, %k2 /* k2 = SelMask && (arg < R2) */
+        vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13 /* zmm13 = t + t*R2*poly */
+        vorps     _PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k2} /* constant becomes PiH in k2 lanes */
+        vaddps    {rn-sae}, %zmm13, %zmm12, %zmm0 /* result = constant + zmm13 */
+        testl     %edx, %edx
+
+/* Any lane flagged in edx: go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Common exit: drop the aligned frame and return;
+ * the result is expected in zmm0 on arrival here
+ */
+
+L(EXIT):
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Special-inputs branch.  Reached only through the jne above, so
+ * the lane mask in %edx is non-zero and ZF is clear on entry
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovups   %zmm4, 128(%rsp) /* spill the inputs; read back one lane at a time */
+        vmovups   %zmm0, 192(%rsp) /* spill the fast-path results; patched per lane */
+
+/* Fall straight into the fix-up loop setup: a conditional jump to L(EXIT)
+ * on ZF could never be taken here, since vmovups leaves the flags alone.  */
+        xorl      %eax, %eax
+        vzeroupper /* zmm0 and zmm4 were spilled above, so nothing live is lost */
+        kmovw     %k4, 24(%rsp)
+        kmovw     %k5, 16(%rsp)
+        kmovw     %k6, 8(%rsp)
+        kmovw     %k7, (%rsp)
+        movq      %rsi, 40(%rsp)
+        movq      %rdi, 32(%rsp)
+        movq      %r12, 64(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d /* r12d = lane index, starts at 0 */
+        movq      %r13, 56(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d /* r13d = special-input lane mask */
+        movq      %r14, 48(%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask bits check:
+ * CF = bit number r12d (lane index) of r13d (lane mask)
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Bit set: this lane needs the scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs processing loop:
+ * advance to the next lane; L(SCALAR_MATH_CALL) also jumps back here
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $16, %r12d
+
+/* Lanes left (index < 16): check the next bit in the range mask */
+        jl        L(RANGEMASK_CHECK)
+        kmovw     24(%rsp), %k4
+        cfi_restore(122)
+        kmovw     16(%rsp), %k5
+        cfi_restore(123)
+        kmovw     8(%rsp), %k6
+        cfi_restore(124)
+        kmovw     (%rsp), %k7
+        cfi_restore(125)
+        vmovups   192(%rsp), %zmm0 /* reload the result vector, including patched lanes */
+        movq      40(%rsp), %rsi
+        cfi_restore(4)
+        movq      32(%rsp), %rdi
+        cfi_restore(5)
+        movq      64(%rsp), %r12
+        cfi_restore(12)
+        movq      56(%rsp), %r13
+        cfi_restore(13)
+        movq      48(%rsp), %r14
+        cfi_restore(14)
+
+/* All lanes handled and registers restored: go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -200; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x38, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -208; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x30, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process one special input lane (index in r12d)
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d /* r14 = lane index, used to address both spill areas */
+        movss     128(%rsp,%r14,4), %xmm0 /* load this lane's input from the spilled arguments */
+        call      acosf@PLT
+        movss     %xmm0, 192(%rsp,%r14,4) /* overwrite this lane in the spilled results */
+
+/* Continue with the next lane */
+        jmp       L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVeN16v_acosf_skx)
+
+        .section .rodata, "a"
+        .align 64
+
+#ifdef __svml_sacos_data_internal_typedef /* layout description; NOTE(review): macro is not defined in this file -- presumably never compiled */
+typedef unsigned int VUINT32;
+typedef struct {
+    __declspec(align(64)) VUINT32 SgnBit[16][1];
+    __declspec(align(64)) VUINT32 OneHalf[16][1];
+    __declspec(align(64)) VUINT32 sRsqrtMsk[16][1];
+    __declspec(align(64)) VUINT32 SmallNorm[16][1];
+    __declspec(align(64)) VUINT32 MOne[16][1];
+    __declspec(align(64)) VUINT32 HalfMask[16][1];
+    __declspec(align(64)) VUINT32 SQMask[16][1];
+    __declspec(align(64)) VUINT32 Two[16][1];
+    __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1];
+    __declspec(align(64)) VUINT32 poly_coeff[5][16][1];
+    __declspec(align(64)) VUINT32 Pi2H[16][1];
+    __declspec(align(64)) VUINT32 Pi2L[16][1];
+    __declspec(align(64)) VUINT32 PiH[16][1];
+    __declspec(align(64)) VUINT32 PiL[16][1];
+    __declspec(align(64)) VUINT32 Zero[16][1];
+    __declspec(align(64)) VUINT32 SgnMask[16][1];
+    __declspec(align(64)) VUINT32 NanMask[16][1];
+    __declspec(align(64)) VUINT32 ep_coeff[3][16][1];
+} __svml_sacos_data_internal_t;
+#endif
+__svml_sacos_data_internal:
+        /*== SgnBit: sign-bit mask, table offset 0 ==*/
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== OneHalf: 0.5f ==*/
+        .align 64
+        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+        /*== sRsqrtMsk: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
+        /*== SmallNorm: 2^-32; where Y is below it the rsqrt estimate is zeroed ==*/
+        .align 64
+        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+        /*== MOne: -1.0f, bound used by the special-input test ==*/
+        .align 64
+        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+        /*== HalfMask: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
+        /*== SQMask: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
+        /*== Two: 2.0f ==*/
+        .align 64
+        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
+        /*== sqrt_coeff[2]: correction polynomial for 2*sqrt(Y) ==*/
+        .align 64
+        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2; loaded via _sqrt_coeff_1 */
+        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1; loaded via _sqrt_coeff_2 */
+        /*== poly_coeff[5]: rows run from highest-order to lowest-order term ==*/
+        .align 64
+        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5; loaded via _poly_coeff_1 */
+        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4; loaded via _poly_coeff_2 */
+        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3; loaded via _poly_coeff_3 */
+        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2; loaded via _poly_coeff_4 */
+        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1; loaded via _poly_coeff_5 */
+        /*== Pi2H: pi/2 rounded to float ==*/
+        .align 64
+        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+        /*== Pi2L: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
+        /*== PiH: pi rounded to float ==*/
+        .align 64
+        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+        /*== PiL: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
+        /*== Zero: not referenced by the code in this file ==*/
+        .align 64
+        .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
+        /*== SgnMask: not referenced by the code in this file ==*/
+        .align 64
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== NanMask: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
+        /*== ep_coeff[3]: not referenced by the code in this file ==*/
+        .align 64
+        .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
+        .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
+        .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
+        .align 64
+        .type	__svml_sacos_data_internal,@object
+        .size	__svml_sacos_data_internal,1600
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
new file mode 100644
index 0000000000..f94b3eb01a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
@@ -0,0 +1,20 @@ 
+/* SSE2 version of vectorized acosf, vector length is 4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVbN4v_acosf _ZGVbN4v_acosf_sse2 /* presumably renames the symbol defined by the file included below -- TODO confirm */
+#include "../svml_s_acosf4_core.S" /* generic 4-lane implementation; not part of this hunk */
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
new file mode 100644
index 0000000000..6f9a5c1082
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
@@ -0,0 +1,28 @@ 
+/* Multiple versions of vectorized acosf, vector length is 4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVbN4v_acosf /* must be set before the selector header is included */
+#include "ifunc-mathvec-sse4_1.h" /* presumably provides REDIRECT_NAME and IFUNC_SELECTOR -- TODO confirm */
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVbN4v_acosf, __GI__ZGVbN4v_acosf,
+	       __redirect__ZGVbN4v_acosf)
+  __attribute__ ((visibility ("hidden")));
+#endif /* SHARED */
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
new file mode 100644
index 0000000000..acfdc348aa
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
@@ -0,0 +1,351 @@ 
+/* Function acosf vectorized with SSE4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ *
+ */
+
+/* Offsets for data table __svml_sacos_data_internal
+ */
+#define _SgnBit                       	0
+#define _OneHalf                      	64
+#define _sRsqrtMsk                    	128
+#define _SmallNorm                    	192
+#define _MOne                         	256
+#define _HalfMask                     	320
+#define _SQMask                       	384
+#define _Two                          	448
+#define _sqrt_coeff                   	512
+#define _poly_coeff                   	576
+#define _Pi2H                         	704
+#define _Pi2L                         	768
+#define _PiH                          	832
+#define _PiL                          	896
+#define _Zero                         	960
+#define _SgnMask                      	1024
+#define _NanMask                      	1088
+#define _ep_coeff                     	1152
+
+#include <sysdep.h>
+
+        .text
+	.section .text.sse4,"ax",@progbits
+ENTRY(_ZGVbN4v_acosf_sse4) /* in: xmm0 = 4 floats; out: xmm0 = acosf of each lane */
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-64, %rsp /* 64-byte aligned frame */
+        subq      $320, %rsp /* holds the xmm8 save slot and the special-path spill areas */
+        movaps    %xmm0, %xmm6 /* xmm6 = arg, kept for the sign test and the scalar fallback */
+
+/*
+ * Load SgnBit (xmm5) and 0.5 (xmm0).  Further down, SQ ~ 2*sqrt(Y)
+ * is built as Sh - Sl (to 24+ bits per the original note)
+ */
+        movups    __svml_sacos_data_internal(%rip), %xmm5
+        movups    _OneHalf+__svml_sacos_data_internal(%rip), %xmm0
+
+/* x = -|arg| in xmm7 */
+        movaps    %xmm5, %xmm7
+        orps      %xmm6, %xmm7
+
+/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg|; the add is the addps below */
+        movaps    %xmm0, %xmm2
+        mulps     %xmm7, %xmm2
+
+/* x^2 */
+        movaps    %xmm7, %xmm3
+        mulps     %xmm7, %xmm3
+        addps     %xmm2, %xmm0 /* xmm0 = Y */
+        movups    _MOne+__svml_sacos_data_internal(%rip), %xmm4 /* xmm4 = -1.0 */
+        andps     %xmm6, %xmm5 /* xmm5 = sign bit of arg only */
+        cmpnleps  %xmm7, %xmm4 /* xmm4 = !(-1 <= x): |arg| > 1 or NaN */
+        minps     %xmm0, %xmm3 /* xmm3 = R2 = min(x^2, Y) */
+
+/* SQ ~ 2*sqrt(Y) */
+        rsqrtps   %xmm0, %xmm1 /* xmm1 ~ 1/sqrt(Y) */
+        movmskps  %xmm4, %edx /* edx = per-lane special-input mask */
+        movaps    %xmm0, %xmm4
+        movaps    %xmm3, %xmm2
+        movups    %xmm8, 160(%rsp) /* save xmm8, used as scratch below; reloaded at L(EXIT) */
+        cmpltps   _SmallNorm+__svml_sacos_data_internal(%rip), %xmm4 /* xmm4 = (Y < SmallNorm) */
+        cmpnltps  %xmm0, %xmm2 /* xmm2 = !(R2 < Y): Y was picked, i.e. SelMask */
+        addps     %xmm0, %xmm0 /* xmm0 = 2*Y */
+        andnps    %xmm1, %xmm4 /* xmm4 = rsqrt, forced to 0 where Y is tiny */
+        movaps    %xmm4, %xmm8
+        mulps     %xmm4, %xmm8 /* xmm8 = rsqrt^2 */
+        mulps     %xmm0, %xmm4 /* xmm4 = Sh = 2*Y*rsqrt */
+        mulps     %xmm8, %xmm0 /* xmm0 = 2*Y*rsqrt^2 */
+        movups    _sqrt_coeff+__svml_sacos_data_internal(%rip), %xmm1
+
+/* polynomial in R2, interleaved with the refinement of 2*sqrt(Y) */
+        movups    _poly_coeff+__svml_sacos_data_internal(%rip), %xmm8
+        mulps     %xmm3, %xmm8
+        subps     _Two+__svml_sacos_data_internal(%rip), %xmm0 /* xmm0 = E = 2*Y*rsqrt^2 - 2 */
+        mulps     %xmm0, %xmm1
+        addps     _poly_coeff+16+__svml_sacos_data_internal(%rip), %xmm8 /* xmm8 = p0*R2 + p1 */
+        mulps     %xmm4, %xmm0 /* xmm0 = Sh*E */
+        addps     _sqrt_coeff+16+__svml_sacos_data_internal(%rip), %xmm1 /* xmm1 = s0*E + s1 */
+        mulps     %xmm0, %xmm1 /* xmm1 = Sl = Sh*E*(s0*E + s1) */
+        movaps    %xmm3, %xmm0
+        mulps     %xmm3, %xmm0 /* xmm0 = R2^2 */
+        subps     %xmm1, %xmm4 /* xmm4 = SQ = Sh - Sl */
+        mulps     %xmm0, %xmm8
+        movups    _poly_coeff+32+__svml_sacos_data_internal(%rip), %xmm1
+        andps     %xmm2, %xmm4 /* keep SQ only in SelMask lanes */
+        mulps     %xmm3, %xmm1
+        movups    _PiH+__svml_sacos_data_internal(%rip), %xmm0
+        andps     %xmm2, %xmm0 /* PiH candidate only in SelMask lanes */
+        addps     _poly_coeff+48+__svml_sacos_data_internal(%rip), %xmm1 /* xmm1 = p2*R2 + p3 */
+        addps     %xmm8, %xmm1
+
+/* xmm8 = (arg < R2), combined with SelMask via xmm0 (X<X^2 iff X<0) */
+        movaps    %xmm6, %xmm8
+        cmpltps   %xmm3, %xmm8
+        mulps     %xmm3, %xmm1
+        andps     %xmm8, %xmm0 /* xmm0 = PiH where SelMask && arg < R2, else 0 */
+        movaps    %xmm2, %xmm8
+        andnps    %xmm7, %xmm8 /* xmm8 = x in non-SelMask lanes */
+        addps     _poly_coeff+64+__svml_sacos_data_internal(%rip), %xmm1
+        mulps     %xmm3, %xmm1 /* xmm1 = R2*poly */
+        orps      %xmm4, %xmm8 /* xmm8 = SelMask ? SQ : x */
+        pxor      %xmm5, %xmm8 /* xmm8 = t: xor in arg's sign bit */
+        movaps    %xmm2, %xmm3
+        mulps     %xmm8, %xmm1
+        andnps    _Pi2H+__svml_sacos_data_internal(%rip), %xmm3 /* xmm3 = Pi2H in non-SelMask lanes */
+        addps     %xmm1, %xmm8 /* xmm8 = t + t*R2*poly */
+        addps     %xmm3, %xmm0 /* xmm0 = additive constant per lane */
+        addps     %xmm8, %xmm0 /* result = constant + xmm8 */
+        testl     %edx, %edx
+
+/* Any lane flagged in edx: go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Common exit: reload xmm8, drop the aligned frame and return;
+ * the result is expected in xmm0 on arrival here
+ */
+
+L(EXIT):
+        movups    160(%rsp), %xmm8
+        cfi_restore(25)
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Special-inputs branch: spill inputs, fast-path results and
+ * the registers saved around the scalar calls, then set up the lane loop
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        movups    %xmm6, 192(%rsp) /* spill the inputs; read back one lane at a time */
+        movups    %xmm0, 256(%rsp) /* spill the fast-path results; patched per lane */
+        xorl      %eax, %eax
+        movups    %xmm9, 96(%rsp)
+        movups    %xmm10, 80(%rsp)
+        movups    %xmm11, 64(%rsp)
+        movups    %xmm12, 48(%rsp)
+        movups    %xmm13, 32(%rsp)
+        movups    %xmm14, 16(%rsp)
+        movups    %xmm15, (%rsp)
+        movq      %rsi, 120(%rsp)
+        movq      %rdi, 112(%rsp)
+        movq      %r12, 144(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d /* r12d = lane index, starts at 0 */
+        movq      %r13, 136(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d /* r13d = special-input lane mask */
+        movq      %r14, 128(%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask bits check:
+ * CF = bit number r12d (lane index) of r13d (lane mask)
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Bit set: this lane needs the scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs processing loop:
+ * advance to the next lane; L(SCALAR_MATH_CALL) also jumps back here
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $4, %r12d
+
+/* Lanes left (index < 4): check the next bit in the range mask */
+        jl        L(RANGEMASK_CHECK)
+        movups    96(%rsp), %xmm9
+        cfi_restore(26)
+        movups    80(%rsp), %xmm10
+        cfi_restore(27)
+        movups    64(%rsp), %xmm11
+        cfi_restore(28)
+        movups    48(%rsp), %xmm12
+        cfi_restore(29)
+        movups    32(%rsp), %xmm13
+        cfi_restore(30)
+        movups    16(%rsp), %xmm14
+        cfi_restore(31)
+        movups    (%rsp), %xmm15
+        cfi_restore(32)
+        movq      120(%rsp), %rsi
+        cfi_restore(4)
+        movq      112(%rsp), %rdi
+        cfi_restore(5)
+        movq      144(%rsp), %r12
+        cfi_restore(12)
+        movq      136(%rsp), %r13
+        cfi_restore(13)
+        movq      128(%rsp), %r14
+        cfi_restore(14)
+        movups    256(%rsp), %xmm0 /* reload the result vector, including patched lanes */
+
+/* All lanes handled and registers restored: go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process one special input lane (index in r12d)
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d /* r14 = lane index, used to address both spill areas */
+        movss     192(%rsp,%r14,4), %xmm0 /* load this lane's input from the spilled arguments */
+        call      acosf@PLT
+        movss     %xmm0, 256(%rsp,%r14,4) /* overwrite this lane in the spilled results */
+
+/* Continue with the next lane */
+        jmp       L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVbN4v_acosf_sse4)
+
+        .section .rodata, "a"
+        .align 64
+
+#ifdef __svml_sacos_data_internal_typedef /* layout description; NOTE(review): macro is not defined in this file -- presumably never compiled */
+typedef unsigned int VUINT32;
+typedef struct {
+    __declspec(align(64)) VUINT32 SgnBit[4][1];
+    __declspec(align(64)) VUINT32 OneHalf[4][1];
+    __declspec(align(64)) VUINT32 sRsqrtMsk[4][1];
+    __declspec(align(64)) VUINT32 SmallNorm[4][1];
+    __declspec(align(64)) VUINT32 MOne[4][1];
+    __declspec(align(64)) VUINT32 HalfMask[4][1];
+    __declspec(align(64)) VUINT32 SQMask[4][1];
+    __declspec(align(64)) VUINT32 Two[4][1];
+    __declspec(align(64)) VUINT32 sqrt_coeff[2][4][1];
+    __declspec(align(64)) VUINT32 poly_coeff[5][4][1];
+    __declspec(align(64)) VUINT32 Pi2H[4][1];
+    __declspec(align(64)) VUINT32 Pi2L[4][1];
+    __declspec(align(64)) VUINT32 PiH[4][1];
+    __declspec(align(64)) VUINT32 PiL[4][1];
+    __declspec(align(64)) VUINT32 Zero[4][1];
+    __declspec(align(64)) VUINT32 SgnMask[4][1];
+    __declspec(align(64)) VUINT32 NanMask[4][1];
+    __declspec(align(64)) VUINT32 ep_coeff[3][4][1];
+} __svml_sacos_data_internal_t;
+#endif
+__svml_sacos_data_internal:
+        /*== SgnBit: sign-bit mask, table offset 0 ==*/
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== OneHalf: 0.5f ==*/
+        .align 64
+        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+        /*== sRsqrtMsk: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
+        /*== SmallNorm: 2^-32; where Y is below it the rsqrt estimate is zeroed ==*/
+        .align 64
+        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+        /*== MOne: -1.0f, bound used by the special-input test ==*/
+        .align 64
+        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+        /*== HalfMask: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
+        /*== SQMask: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
+        /*== Two: 2.0f ==*/
+        .align 64
+        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000
+        /*== sqrt_coeff[2]: correction polynomial for 2*sqrt(Y) ==*/
+        .align 64
+        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2; at _sqrt_coeff+0 */
+        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1; at _sqrt_coeff+16 */
+        /*== poly_coeff[5]: rows run from highest-order to lowest-order term ==*/
+        .align 64
+        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5; at _poly_coeff+0 */
+        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4; at _poly_coeff+16 */
+        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3; at _poly_coeff+32 */
+        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2; at _poly_coeff+48 */
+        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1; at _poly_coeff+64 */
+        /*== Pi2H: pi/2 rounded to float ==*/
+        .align 64
+        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+        /*== Pi2L: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
+        /*== PiH: pi rounded to float ==*/
+        .align 64
+        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+        /*== PiL: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
+        /*== Zero: not referenced by the code in this file ==*/
+        .align 64
+        .long 0x00000000, 0x00000000, 0x00000000, 0x00000000
+        /*== SgnMask: not referenced by the code in this file ==*/
+        .align 64
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== NanMask: not referenced by the code in this file ==*/
+        .align 64
+        .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
+        /*== ep_coeff[3]: not referenced by the code in this file ==*/
+        .align 64
+        .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
+        .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
+        .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
+        .align 64
+        .type	__svml_sacos_data_internal,@object
+        .size	__svml_sacos_data_internal,1216
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
new file mode 100644
index 0000000000..583ef54fee
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
@@ -0,0 +1,20 @@ 
+/* SSE version of vectorized acosf, vector length is 8.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVdN8v_acosf _ZGVdN8v_acosf_sse_wrapper
+#include "../svml_s_acosf8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
new file mode 100644
index 0000000000..dd360a9479
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
@@ -0,0 +1,28 @@ 
+/* Multiple versions of vectorized acosf, vector length is 8.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVdN8v_acosf
+#include "ifunc-mathvec-avx2.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVdN8v_acosf, __GI__ZGVdN8v_acosf,
+	       __redirect__ZGVdN8v_acosf)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
new file mode 100644
index 0000000000..6d800f9aa4
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
@@ -0,0 +1,332 @@ 
+/* Function acosf vectorized with AVX2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ *
+ */
+
+/* Offsets for data table __svml_sacos_data_internal
+ */
+#define _SgnBit                       	0
+#define _OneHalf                      	64
+#define _sRsqrtMsk                    	128
+#define _SmallNorm                    	192
+#define _MOne                         	256
+#define _HalfMask                     	320
+#define _SQMask                       	384
+#define _Two                          	448
+#define _sqrt_coeff                   	512
+#define _poly_coeff                   	576
+#define _Pi2H                         	768
+#define _Pi2L                         	832
+#define _PiH                          	896
+#define _PiL                          	960
+#define _Zero                         	1024
+#define _SgnMask                      	1088
+#define _NanMask                      	1152
+#define _ep_coeff                     	1216
+
+#include <sysdep.h>
+
+        .text
+	.section .text.avx2,"ax",@progbits
+ENTRY(_ZGVdN8v_acosf_avx2)
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-64, %rsp
+        subq      $384, %rsp
+
+/*
+ * 2*sqrt(X) ~ Sh - Sl  (to 24+ bits)
+ * SQ ~ 2*sqrt(X)
+ */
+        vmovups   __svml_sacos_data_internal(%rip), %ymm6
+        vmovups   _OneHalf+__svml_sacos_data_internal(%rip), %ymm7
+        vmovups   %ymm8, 288(%rsp)
+        vmovups   %ymm15, 352(%rsp)
+        vmovups   %ymm9, 96(%rsp)
+        vmovups   _poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm15
+        vmovups   %ymm10, 160(%rsp)
+        vmovups   %ymm11, 192(%rsp)
+        vmovups   %ymm14, 320(%rsp)
+        vmovups   %ymm13, 256(%rsp)
+        vmovups   %ymm12, 224(%rsp)
+        vmovaps   %ymm0, %ymm5
+
+/* x = -|arg| */
+        vorps     %ymm5, %ymm6, %ymm4
+
+/* Y = 0.5 + 0.5*x = 0.5 - 0.5*|arg| */
+        vfmadd231ps %ymm4, %ymm7, %ymm7
+
+/* x^2 */
+        vmulps    %ymm4, %ymm4, %ymm1
+        vcmpnge_uqps _MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm0
+
+/* SQ ~ 2*sqrt(Y) */
+        vaddps    %ymm7, %ymm7, %ymm11
+        vminps    %ymm7, %ymm1, %ymm2
+        vrsqrtps  %ymm7, %ymm8
+        vfmadd213ps _poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm15
+        vmulps    %ymm2, %ymm2, %ymm14
+        vcmpnlt_uqps %ymm7, %ymm2, %ymm1
+        vandps    %ymm5, %ymm6, %ymm3
+        vcmplt_oqps _SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm6
+        vandnps   %ymm8, %ymm6, %ymm9
+
+/* polynomial */
+        vmovups   _poly_coeff+__svml_sacos_data_internal(%rip), %ymm6
+        vmulps    %ymm9, %ymm9, %ymm10
+        vmulps    %ymm11, %ymm9, %ymm13
+        vfmadd213ps _poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm6
+        vfmsub213ps _Two+__svml_sacos_data_internal(%rip), %ymm10, %ymm11
+        vfmadd213ps %ymm15, %ymm14, %ymm6
+        vmulps    %ymm11, %ymm13, %ymm12
+        vfmadd213ps _poly_coeff+128+__svml_sacos_data_internal(%rip), %ymm2, %ymm6
+        vmovmskps %ymm0, %edx
+        vmovups   _sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0
+        vmulps    %ymm6, %ymm2, %ymm9
+
+/* X<X^2 iff X<0 */
+        vcmplt_oqps %ymm2, %ymm5, %ymm6
+        vfmadd213ps _sqrt_coeff+32+__svml_sacos_data_internal(%rip), %ymm11, %ymm0
+        vfnmadd213ps %ymm13, %ymm12, %ymm0
+        vblendvps %ymm1, %ymm0, %ymm4, %ymm4
+        vxorps    %ymm3, %ymm4, %ymm3
+        vandps    _PiH+__svml_sacos_data_internal(%rip), %ymm1, %ymm2
+        vfmadd213ps %ymm3, %ymm3, %ymm9
+        vandps    %ymm6, %ymm2, %ymm2
+        vandnps   _Pi2H+__svml_sacos_data_internal(%rip), %ymm1, %ymm7
+        vaddps    %ymm7, %ymm2, %ymm8
+        vaddps    %ymm9, %ymm8, %ymm0
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        vmovups   288(%rsp), %ymm8
+        cfi_restore(91)
+        vmovups   96(%rsp), %ymm9
+        cfi_restore(92)
+        vmovups   160(%rsp), %ymm10
+        cfi_restore(93)
+        vmovups   192(%rsp), %ymm11
+        cfi_restore(94)
+        vmovups   224(%rsp), %ymm12
+        cfi_restore(95)
+        vmovups   256(%rsp), %ymm13
+        cfi_restore(96)
+        vmovups   320(%rsp), %ymm14
+        cfi_restore(97)
+        vmovups   352(%rsp), %ymm15
+        cfi_restore(98)
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Branch to process special inputs: spill the input (ymm5) and the
+ * vector result (ymm0) so flagged lanes can be redone with scalar acosf
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovups   %ymm5, 64(%rsp)
+        vmovups   %ymm0, 128(%rsp)
+
+/* NOTE(review): this je is never taken - reached only via jne after testl, and vmovups leaves flags unchanged */
+        je        L(EXIT)
+        xorl      %eax, %eax
+        vzeroupper
+        movq      %rsi, 8(%rsp)
+        movq      %rdi, (%rsp)
+        movq      %r12, 32(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
+        movl      %eax, %r12d
+        movq      %r13, 24(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
+        movl      %edx, %r13d
+        movq      %r14, 16(%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $8, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+        movq      8(%rsp), %rsi
+        cfi_restore(4)
+        movq      (%rsp), %rdi
+        cfi_restore(5)
+        movq      32(%rsp), %r12
+        cfi_restore(12)
+        movq      24(%rsp), %r13
+        cfi_restore(13)
+        movq      16(%rsp), %r14
+        cfi_restore(14)
+        vmovups   128(%rsp), %ymm0
+
+/* Go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -352; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xfe, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -360; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x98, 0xfe, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -368; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x90, 0xfe, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movss     64(%rsp,%r14,4), %xmm0
+        call      acosf@PLT
+        movss     %xmm0, 128(%rsp,%r14,4)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVdN8v_acosf_avx2)
+
+        .section .rodata, "a"
+        .align 64
+
+#ifdef __svml_sacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+    __declspec(align(64)) VUINT32 SgnBit[8][1];
+    __declspec(align(64)) VUINT32 OneHalf[8][1];
+    __declspec(align(64)) VUINT32 sRsqrtMsk[8][1];
+    __declspec(align(64)) VUINT32 SmallNorm[8][1];
+    __declspec(align(64)) VUINT32 MOne[8][1];
+    __declspec(align(64)) VUINT32 HalfMask[8][1];
+    __declspec(align(64)) VUINT32 SQMask[8][1];
+    __declspec(align(64)) VUINT32 Two[8][1];
+    __declspec(align(64)) VUINT32 sqrt_coeff[2][8][1];
+    __declspec(align(64)) VUINT32 poly_coeff[5][8][1];
+    __declspec(align(64)) VUINT32 Pi2H[8][1];
+    __declspec(align(64)) VUINT32 Pi2L[8][1];
+    __declspec(align(64)) VUINT32 PiH[8][1];
+    __declspec(align(64)) VUINT32 PiL[8][1];
+    __declspec(align(64)) VUINT32 Zero[8][1];
+    __declspec(align(64)) VUINT32 SgnMask[8][1];
+    __declspec(align(64)) VUINT32 NanMask[8][1];
+    __declspec(align(64)) VUINT32 ep_coeff[3][8][1];
+} __svml_sacos_data_internal_t;
+#endif
+__svml_sacos_data_internal:
+        /*== SgnBit ==*/
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== OneHalf ==*/
+        .align 64
+        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+        /*== sRsqrtMsk ==*/
+        .align 64
+        .long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
+        /*== SmallNorm ==*/
+        .align 64
+        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+        /*== MOne ==*/
+        .align 64
+        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+        /*== HalfMask ==*/
+        .align 64
+        .long 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000, 0xffffe000
+        /*== SQMask ==*/
+        .align 64
+        .long 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800, 0xfffff800
+        /*== Two ==*/
+        .align 64
+        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
+        /*== sqrt_coeff[2] ==*/
+        .align 64
+        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
+        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
+        /*== poly_coeff[5] ==*/
+        .align 64
+        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
+        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
+        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
+        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
+        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
+        /*== Pi2H ==*/
+        .align 64
+        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+        /*== Pi2L ==*/
+        .align 64
+        .long 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E, 0xB33BBD2E
+        /*== PiH ==*/
+        .align 64
+        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+        /*== PiL ==*/
+        .align 64
+        .long 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E, 0xB3BBBD2E
+        /*== zero ==*/
+        .align 64
+        .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
+        /*== sgn mask ==*/
+        .align 64
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== nan mask ==*/
+        .align 64
+        .long 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000, 0xffc00000
+        /*== ep_coeff[3] ==*/
+        .align 64
+        .long 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE, 0x3dC4C6AE /* coeff2 */
+        .long 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2, 0x3e2876B2 /* coeff1 */
+        .long 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3, 0x380561A3 /* coeff0 */
+        .align 64
+        .type	__svml_sacos_data_internal,@object
+        .size	__svml_sacos_data_internal,1344
diff --git a/sysdeps/x86_64/fpu/svml_d_acos2_core.S b/sysdeps/x86_64/fpu/svml_d_acos2_core.S
new file mode 100644
index 0000000000..9656478b2d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos2_core.S
@@ -0,0 +1,29 @@ 
+/* Function acos vectorized with SSE2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVbN2v_acos)
+WRAPPER_IMPL_SSE2 acos
+END (_ZGVbN2v_acos)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVbN2v_acos)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core.S b/sysdeps/x86_64/fpu/svml_d_acos4_core.S
new file mode 100644
index 0000000000..e99cb4ae78
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos4_core.S
@@ -0,0 +1,29 @@ 
+/* Function acos vectorized with AVX2, wrapper version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVdN4v_acos)
+WRAPPER_IMPL_AVX _ZGVbN2v_acos
+END (_ZGVdN4v_acos)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVdN4v_acos)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
new file mode 100644
index 0000000000..7cbcbc965c
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
@@ -0,0 +1,25 @@ 
+/* Function acos vectorized in AVX ISA as wrapper to SSE4 ISA version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVcN4v_acos)
+WRAPPER_IMPL_AVX _ZGVbN2v_acos
+END (_ZGVcN4v_acos)
diff --git a/sysdeps/x86_64/fpu/svml_d_acos8_core.S b/sysdeps/x86_64/fpu/svml_d_acos8_core.S
new file mode 100644
index 0000000000..e26b30d81a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos8_core.S
@@ -0,0 +1,25 @@ 
+/* Function acos vectorized with AVX-512, wrapper to AVX2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVeN8v_acos)
+WRAPPER_IMPL_AVX512 _ZGVdN4v_acos
+END (_ZGVeN8v_acos)
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf16_core.S b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S
new file mode 100644
index 0000000000..70e046d492
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S
@@ -0,0 +1,25 @@ 
+/* Function acosf vectorized with AVX-512. Wrapper to AVX2 version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVeN16v_acosf)
+WRAPPER_IMPL_AVX512 _ZGVdN8v_acosf
+END (_ZGVeN16v_acosf)
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf4_core.S b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S
new file mode 100644
index 0000000000..36354b32b5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S
@@ -0,0 +1,29 @@ 
+/* Function acosf vectorized with SSE2, wrapper version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVbN4v_acosf)
+WRAPPER_IMPL_SSE2 acosf
+END (_ZGVbN4v_acosf)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVbN4v_acosf)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S
new file mode 100644
index 0000000000..f08864a511
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S
@@ -0,0 +1,29 @@ 
+/* Function acosf vectorized with AVX2, wrapper version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVdN8v_acosf)
+WRAPPER_IMPL_AVX _ZGVbN4v_acosf
+END (_ZGVdN8v_acosf)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVdN8v_acosf)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
new file mode 100644
index 0000000000..f3ed4d8e78
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
@@ -0,0 +1,25 @@ 
+/* Function acosf vectorized in AVX ISA as wrapper to SSE4 ISA version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+        .text
+ENTRY (_ZGVcN8v_acosf)
+WRAPPER_IMPL_AVX _ZGVbN4v_acosf
+END (_ZGVcN8v_acosf)
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
new file mode 100644
index 0000000000..4f74b4260a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
@@ -0,0 +1 @@ 
+#include "test-double-libmvec-acos.c"
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
new file mode 100644
index 0000000000..4f74b4260a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
@@ -0,0 +1 @@ 
+#include "test-double-libmvec-acos.c"
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
new file mode 100644
index 0000000000..4f74b4260a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
@@ -0,0 +1 @@ 
+#include "test-double-libmvec-acos.c"
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c
new file mode 100644
index 0000000000..e38b8ce821
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c
@@ -0,0 +1,3 @@ 
+#define LIBMVEC_TYPE double
+#define LIBMVEC_FUNC acos
+#include "test-vector-abi-arg1.h"
diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
index ed932fc98d..0abc7d2021 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
 VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
 VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
 
 #define VEC_INT_TYPE __m128i
 
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
index 3a6e37044f..dda093b914 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
@@ -30,6 +30,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin)
 VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
 VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
 
 #ifndef __ILP32__
 # define VEC_INT_TYPE __m256i
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
index 99db4e7616..f3230463bb 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin)
 VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
 VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
 
 #define VEC_INT_TYPE __m128i
 
diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
index 251d429ac0..cf9f52faf0 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin)
 VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
 VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
 
 #ifndef __ILP32__
 # define VEC_INT_TYPE __m512i
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
new file mode 100644
index 0000000000..1e6474dfa2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
@@ -0,0 +1 @@ 
+#include "test-float-libmvec-acosf.c"
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
new file mode 100644
index 0000000000..1e6474dfa2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
@@ -0,0 +1 @@ 
+#include "test-float-libmvec-acosf.c"
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
new file mode 100644
index 0000000000..1e6474dfa2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
@@ -0,0 +1 @@ 
+#include "test-float-libmvec-acosf.c"
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
new file mode 100644
index 0000000000..fb47f974fd
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
@@ -0,0 +1,3 @@ 
+#define LIBMVEC_TYPE float
+#define LIBMVEC_FUNC acosf
+#include "test-vector-abi-arg1.h"
diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
index c1d14cd79e..abbd3ed870 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf)
 VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
 VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
 
 #define VEC_INT_TYPE __m512i
 
diff --git a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
index d23c372060..8a24027952 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
 VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
 VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
 
 #define VEC_INT_TYPE __m128i
 
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
index 3152cffb0c..aff0442606 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
@@ -30,6 +30,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf)
 VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
 VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
 
 /* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf.  */
 #undef VECTOR_WRAPPER_fFF
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
index a8492abfef..913584d111 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf)
 VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
 VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
 
 #define VEC_INT_TYPE __m128i