[v1] x86: Optimize svml_s_atanhf_core_{ss4|avx2|avx512}.S

Message ID 20220201191050.169899-1-goldstein.w.n@gmail.com
State Accepted, archived

Checks

Context                 Check    Description
dj/TryBot-apply_patch   success  Patch applied to master at the time it was sent
dj/TryBot-32bit         success  Build for i686

Commit Message

Noah Goldstein Feb. 1, 2022, 7:10 p.m. UTC
  No bug.

Optimizations are:
    1. Reduce code size
        avx512: -58 bytes
        avx2:   -53 bytes
        sse4:   -54 bytes
    2. Reduce rodata size
        avx512: -128 bytes
        avx2:   -32 bytes
        sse4:   -16 bytes
    3. Remove register save/restores and stack adjustment from the
       fast path.
    4. Slightly improve instruction selection/scheduling where
       possible.
    5. Slightly improve register choices to remove redundant moves
       and/or use registers that get smaller instruction
       encodings (avx2/sse4 only).

The result is ~7% speedup for avx2/sse4 and ~17% speedup for avx512.
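
As a sketch of the slow-path structure behind (3): special lanes are now
handled by a bit-scan loop over the compare mask, so the fast path needs
no register saves at all. A minimal C model (assuming a simplified,
scalar-only stand-in for the asm's SPECIAL_VALUES_LOOP; the helper name
is made up for illustration):

    #include <math.h>

    /* Reprocess the lanes flagged in MASK with the scalar atanhf.
       __builtin_ctz mirrors the asm's tzcnt/bsf; mask &= mask - 1
       clears the lowest set bit, mirroring blsr.  */
    static void
    process_special_lanes (const float *src, float *dst, unsigned int mask)
    {
      while (mask != 0)
        {
          int i = __builtin_ctz (mask);  /* Lowest flagged lane.  */
          dst[i] = atanhf (src[i]);      /* Scalar callout.  */
          mask &= mask - 1;              /* Clear that bit (blsr).  */
        }
    }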

Results from geomean of 40 benchtest runs:

        Function, New Time, Old Time, New / Old
 _ZGVbN4v_atanhf,   22.492,   24.143,     0.932
 _ZGVcN8v_atanhf,   23.606,   25.231,     0.936
 _ZGVdN8v_atanhf,   15.768,   16.841,     0.936
_ZGVeN16v_atanhf,   11.434,   13.816,     0.828
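
Each time above is itself the geometric mean over the 40 runs; a minimal
C sketch of that aggregation (a standalone helper for illustration, not
glibc's actual benchtest scripts):

    #include <math.h>

    /* Geometric mean of N timings: exp of the arithmetic mean of logs.  */
    static double
    geomean (const double *times, int n)
    {
      double log_sum = 0.0;
      for (int i = 0; i < n; i++)
        log_sum += log (times[i]);
      return exp (log_sum / n);
    }

New / Old is then geomean (new_runs, 40) / geomean (old_runs, 40), e.g.
the 0.932 reported for _ZGVbN4v_atanhf.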

All math and mathvec tests are passing.
---
 .../multiarch/svml_s_atanhf16_core_avx512.S   | 662 +++++++++---------
 .../fpu/multiarch/svml_s_atanhf4_core_sse4.S  | 576 +++++++--------
 .../fpu/multiarch/svml_s_atanhf8_core_avx2.S  | 545 +++++++-------
 3 files changed, 834 insertions(+), 949 deletions(-)
  

Comments

Noah Goldstein Feb. 1, 2022, 9:50 p.m. UTC | #1
On Tue, Feb 1, 2022 at 1:10 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug.
>
> Optimizations are:
>     1. Reduce code size
>         avx512: -58 bytes
>         avx2:   -53 bytes
>         sse4:   -54 bytes
>     2. Reduce rodata size
>         avx512: -128 bytes
>         avx2:   -32 bytes
>         sse4:   -16 bytes
>     3. Remove register save/restores and stack adjustment from the
>        fast path.
>     4. Slightly improve instruction selection/scheduling where
>        possible.
>     5. Slightly improve register choices to remove redundant moves
>        and/or use registers that get smaller instruction
>        encodings (avx2/sse4 only).
>
> The result is ~7% speedup for avx2/sse4 and ~17% speedup for avx512.
>
> Results from geomean of 40 benchtest runs:
>
>         Function, New Time, Old Time, New / Old
>  _ZGVbN4v_atanhf,   22.492,   24.143,     0.932
>  _ZGVcN8v_atanhf,   23.606,   25.231,     0.936
>  _ZGVdN8v_atanhf,   15.768,   16.841,     0.936
> _ZGVeN16v_atanhf,   11.434,   13.816,     0.828

Note, the avx512 version increases the maximum ULP error from 1.4 to 2.4.
That appears to be within the range documented by:

commit 3e63b15d43ea6f61effcf92324e47e981bd7d0a8
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date:   Tue Jan 18 07:07:44 2022 -0800

    x86_64: Document libmvec vector functions accuracy [BZ #28766]

    Document maximum 4 ulps accuracy for x86_64 libmvec functions.
    This fixes BZ #28766.

and reducing the precision here provides about 10% of the 17% speedup.

Is this an acceptable tradeoff?
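
For reference, ULP error counts the distance between the computed result
and a correctly rounded reference, in units in the last place of the
reference. A minimal single-point C sketch (a simplified check, not
glibc's libm-test harness; it assumes a double-precision reference):

    #include <math.h>

    /* Error of GOT against reference REF, in units in the last place of
       REF rounded to float.  nextafterf gives the spacing between
       adjacent floats near the reference.  */
    static double
    ulp_error (float got, double ref)
    {
      float rf = (float) ref;
      double one_ulp = (double) nextafterf (rf, INFINITY) - (double) rf;
      return fabs ((double) got - ref) / one_ulp;
    }

So 2.4 ULP means the avx512 result can land up to ~2.4 float spacings
from the correctly rounded answer, within the 4 ulps documented by the
commit quoted above.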

>
> All math and mathvec tests are passing.
> ---
>  .../multiarch/svml_s_atanhf16_core_avx512.S   | 662 +++++++++---------
>  .../fpu/multiarch/svml_s_atanhf4_core_sse4.S  | 576 +++++++--------
>  .../fpu/multiarch/svml_s_atanhf8_core_avx2.S  | 545 +++++++-------
>  3 files changed, 834 insertions(+), 949 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> index f863f4f959..fbd84b2c8e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> @@ -31,363 +31,343 @@
>   *
>   */
>
> -/* Offsets for data table __svml_satanh_data_internal_avx512
> - */
> -#define Log_tbl_H                      0
> -#define Log_tbl_L                      128
> -#define One                            256
> -#define AbsMask                        320
> -#define AddB5                          384
> -#define RcpBitMask                     448
> -#define poly_coeff3                    512
> -#define poly_coeff2                    576
> -#define poly_coeff1                    640
> -#define poly_coeff0                    704
> -#define Half                           768
> -#define L2H                            832
> -#define L2L                            896
> +
> +    /* Offsets for data table __svml_satanh_data_internal_avx512.  */
> +#define AbsMask        0
> +#define One    64
> +#define AddB5  128
> +#define RcpBitMask     192
> +#define Log_tbl_L_lo   256
> +#define Log_tbl_L_hi   320
> +#define Log_tbl_H_lo   384
> +#define Log_tbl_H_hi   448
> +#define L2H    512
> +#define L2L    576
> +#define poly_coeff3    640
> +#define poly_coeff2    704
> +#define poly_coeff1    768
>
>  #include <sysdep.h>
> +#define TANHF_DATA(x)  (x)     +       __svml_satanh_data_internal_avx512
>
> -        .text
> -       .section .text.exex512,"ax",@progbits
> +       .text
> +       .section .text.exex512, "ax", @progbits
>  ENTRY(_ZGVeN16v_atanhf_skx)
> -        pushq     %rbp
> -        cfi_def_cfa_offset(16)
> -        movq      %rsp, %rbp
> -        cfi_def_cfa(6, 16)
> -        cfi_offset(6, -16)
> -        andq      $-64, %rsp
> -        subq      $192, %rsp
> -        vmovups   One+__svml_satanh_data_internal_avx512(%rip), %zmm4
> -
> -/* round reciprocals to 1+5b mantissas */
> -        vmovups   AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
> -        vmovups   RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
> -        vmovaps   %zmm0, %zmm11
> -        vandps    AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> -
> -/* 1+y */
> -        vaddps    {rn-sae}, %zmm4, %zmm6, %zmm9
> -
> -/* 1-y */
> -        vsubps    {rn-sae}, %zmm6, %zmm4, %zmm8
> -        vxorps    %zmm6, %zmm11, %zmm10
> -
> -/* Yp_high */
> -        vsubps    {rn-sae}, %zmm4, %zmm9, %zmm2
> -
> -/* -Ym_high */
> -        vsubps    {rn-sae}, %zmm4, %zmm8, %zmm5
> -
> -/* RcpP ~ 1/Yp */
> -        vrcp14ps  %zmm9, %zmm12
> -
> -/* RcpM ~ 1/Ym */
> -        vrcp14ps  %zmm8, %zmm13
> -
> -/* input outside (-1, 1) ? */
> -        vcmpps    $21, {sae}, %zmm4, %zmm6, %k0
> -        vpaddd    %zmm14, %zmm12, %zmm15
> -        vpaddd    %zmm14, %zmm13, %zmm0
> -
> -/* Yp_low */
> -        vsubps    {rn-sae}, %zmm2, %zmm6, %zmm3
> -        vandps    %zmm1, %zmm15, %zmm7
> -        vandps    %zmm1, %zmm0, %zmm12
> -
> -/* Ym_low */
> -        vaddps    {rn-sae}, %zmm5, %zmm6, %zmm5
> -
> -/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
> -        vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> -
> -/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
> -        vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> -        vmovups   Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
> -        vmovups   Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
> -
> -/* exponents */
> -        vgetexpps {sae}, %zmm7, %zmm15
> -        vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> -
> -/* Table lookups */
> -        vmovups   __svml_satanh_data_internal_avx512(%rip), %zmm6
> -        vgetexpps {sae}, %zmm12, %zmm14
> -        vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> -
> -/* Prepare table index */
> -        vpsrld    $18, %zmm7, %zmm3
> -        vpsrld    $18, %zmm12, %zmm2
> -        vmovups   Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
> -        vmovups   poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
> -
> -/* Km-Kp */
> -        vsubps    {rn-sae}, %zmm15, %zmm14, %zmm1
> -        kmovw     %k0, %edx
> -        vmovaps   %zmm3, %zmm0
> -        vpermi2ps %zmm13, %zmm8, %zmm3
> -        vpermt2ps %zmm13, %zmm2, %zmm8
> -        vpermi2ps %zmm7, %zmm6, %zmm0
> -        vpermt2ps %zmm7, %zmm2, %zmm6
> -        vsubps    {rn-sae}, %zmm3, %zmm8, %zmm5
> -
> -/* K*L2H + Th */
> -        vmovups   L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
> -
> -/* K*L2L + Tl */
> -        vmovups   L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
> -
> -/* polynomials */
> -        vmovups   poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
> -        vmovups   poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
> -
> -/* table values */
> -        vsubps    {rn-sae}, %zmm0, %zmm6, %zmm0
> -        vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> -        vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> -        vmovups   poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
> -        vmovaps   %zmm3, %zmm2
> -        vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> -        vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> -        vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> -        vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> -        vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> -        vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> -
> -/* (K*L2L + Tl) + Rp*PolyP */
> -        vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> -        vorps     Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> -
> -/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
> -        vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> -        vaddps    {rn-sae}, %zmm3, %zmm0, %zmm4
> -        vmulps    {rn-sae}, %zmm9, %zmm4, %zmm0
> -        testl     %edx, %edx
> -
> -/* Go to special inputs processing branch */
> -        jne       L(SPECIAL_VALUES_BRANCH)
> -                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
> -
> -/* Restore registers
> - * and exit the function
> - */
> +       vandps  TANHF_DATA(AbsMask)(%rip), %zmm0, %zmm6
> +       vmovups TANHF_DATA(One)(%rip), %zmm4
>
> -L(EXIT):
> -        movq      %rbp, %rsp
> -        popq      %rbp
> -        cfi_def_cfa(7, 8)
> -        cfi_restore(6)
> -        ret
> -        cfi_def_cfa(6, 16)
> -        cfi_offset(6, -16)
> -
> -/* Branch to process
> - * special inputs
> - */
> +       /* 1+y.  */
> +       vaddps  {rn-sae}, %zmm4, %zmm6, %zmm9
>
> -L(SPECIAL_VALUES_BRANCH):
> -        vmovups   %zmm11, 64(%rsp)
> -        vmovups   %zmm0, 128(%rsp)
> -                                # LOE rbx r12 r13 r14 r15 edx zmm0
> -
> -        xorl      %eax, %eax
> -                                # LOE rbx r12 r13 r14 r15 eax edx
> -
> -        vzeroupper
> -        movq      %r12, 16(%rsp)
> -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -        movl      %eax, %r12d
> -        movq      %r13, 8(%rsp)
> -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -        movl      %edx, %r13d
> -        movq      %r14, (%rsp)
> -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -                                # LOE rbx r15 r12d r13d
> -
> -/* Range mask
> - * bits check
> - */
> +       /* 1-y.  */
> +       vsubps  {rn-sae}, %zmm6, %zmm4, %zmm8
>
> -L(RANGEMASK_CHECK):
> -        btl       %r12d, %r13d
> +       /* round reciprocals to 1+5b mantissas.  */
> +       vmovups TANHF_DATA(AddB5)(%rip), %zmm14
> +       vmovups TANHF_DATA(RcpBitMask)(%rip), %zmm1
>
> -/* Call scalar math function */
> -        jc        L(SCALAR_MATH_CALL)
> -                                # LOE rbx r15 r12d r13d
> +       /* RcpP ~ 1/Yp.  */
> +       vrcp14ps %zmm9, %zmm12
>
> -/* Special inputs
> - * processing loop
> - */
> +       /* RcpM ~ 1/Ym.  */
> +       vrcp14ps %zmm8, %zmm13
>
> -L(SPECIAL_VALUES_LOOP):
> -        incl      %r12d
> -        cmpl      $16, %r12d
> -
> -/* Check bits in range mask */
> -        jl        L(RANGEMASK_CHECK)
> -                                # LOE rbx r15 r12d r13d
> -
> -        movq      16(%rsp), %r12
> -        cfi_restore(12)
> -        movq      8(%rsp), %r13
> -        cfi_restore(13)
> -        movq      (%rsp), %r14
> -        cfi_restore(14)
> -        vmovups   128(%rsp), %zmm0
> -
> -/* Go to exit */
> -        jmp       L(EXIT)
> -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -                                # LOE rbx r12 r13 r14 r15 zmm0
> -
> -/* Scalar math fucntion call
> - * to process special input
> - */
> +       /* Yp_high.  */
> +       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> +
> +       /* -Ym_high.  */
> +       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> +
> +
> +       /* Add the rounding bit (AddB5) to the reciprocal estimates.  */
> +       vpaddd  %zmm14, %zmm12, %zmm15
> +       vpaddd  %zmm14, %zmm13, %zmm12
> +
> +       /* Yp_low.  */
> +       vsubps  {rn-sae}, %zmm2, %zmm6, %zmm3
> +       vandps  %zmm1, %zmm15, %zmm7
> +       vandps  %zmm1, %zmm12, %zmm12
> +
> +       /* Ym_low.  */
> +       vaddps  {rn-sae}, %zmm5, %zmm6, %zmm5
> +
> +       /* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low.  */
> +       vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> +
> +       /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low.  */
> +       vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
>
> -L(SCALAR_MATH_CALL):
> -        movl      %r12d, %r14d
> -        movss     64(%rsp,%r14,4), %xmm0
> -        call      atanhf@PLT
> -                                # LOE rbx r14 r15 r12d r13d xmm0
> +       vmovups TANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
> +       vmovups TANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
>
> -        movss     %xmm0, 128(%rsp,%r14,4)
> +       /* exponents.  */
> +       vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> +       vgetexpps {sae}, %zmm7, %zmm15
>
> -/* Process special inputs in loop */
> -        jmp       L(SPECIAL_VALUES_LOOP)
> -                                # LOE rbx r15 r12d r13d
> +
> +       /* exponents.  */
> +       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
> +       vgetexpps {sae}, %zmm12, %zmm14
> +
> +
> +       /* Prepare table index.  */
> +       vpsrld  $18, %zmm7, %zmm3
> +       vpsrld  $18, %zmm12, %zmm2
> +       vmovups TANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
> +       vmovups TANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
> +       /* Table lookups.  */
> +
> +       vmovaps %zmm3, %zmm5
> +       vpermi2ps %zmm13, %zmm10, %zmm3
> +       vpermt2ps %zmm13, %zmm2, %zmm10
> +       vpermi2ps %zmm7, %zmm11, %zmm5
> +       vpermt2ps %zmm7, %zmm2, %zmm11
> +       vsubps  {rn-sae}, %zmm15, %zmm14, %zmm1 /* Km - Kp.  */
> +       vsubps  {rn-sae}, %zmm3, %zmm10, %zmm7
> +
> +       /* K*L2H + Th.  */
> +       vmovups TANHF_DATA(L2H)(%rip), %zmm2
> +
> +       /* K*L2L + Tl.  */
> +       vmovups TANHF_DATA(L2L)(%rip), %zmm3
> +
> +       /* table values.  */
> +       vsubps  {rn-sae}, %zmm5, %zmm11, %zmm5
> +       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
> +       vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
> +       /* polynomials.  */
> +       vmovups TANHF_DATA(poly_coeff3)(%rip), %zmm7
> +       vmovups TANHF_DATA(poly_coeff2)(%rip), %zmm10
> +       vmovaps %zmm10, %zmm14
> +       /* vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10.  */
> +       vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
> +       vmovups TANHF_DATA(poly_coeff1)(%rip), %zmm12
> +       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
> +       vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
> +       vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
> +       vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
> +
> +       /* (K*L2L + Tl) + Rp*PolyP.  */
> +       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
> +       /* zmm12 = +-0.5 (0.5 with the sign bit of x or'ed in).  */
> +       vandps  %zmm12, %zmm4, %zmm12
> +       vpternlogq $246, %zmm0, %zmm6, %zmm12
> +
> +       /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM.  */
> +       vfnmadd213ps {rn-sae}, %zmm10, %zmm8, %zmm14
> +       vaddps  {rn-sae}, %zmm14, %zmm5, %zmm8
> +       /* Is the input outside (-1, 1)?  */
> +       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> +       kmovw   %k0, %edx
> +       testl   %edx, %edx
> +
> +       /* Go to special inputs processing branch.  */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm0
> +
> +       ret
> +
> +       /* Branch to process special inputs.  */
> +L(SPECIAL_VALUES_BRANCH):
> +       pushq   %rbp
> +       /* Save callee-saved registers to preserve state across the
> +          atanhf calls below.  */
> +       pushq   %r13
> +       pushq   %r12
> +       movq    %rsp, %rbp
> +
> +       /* Align stack and make room for 2x zmm vectors.  */
> +       andq    $-64, %rsp
> +       addq    $-128, %rsp
> +       vmulps  {rn-sae}, %zmm12, %zmm8, %zmm1
> +       vmovaps %zmm1, (%rsp)
> +       vmovaps %zmm0, 64(%rsp)
> +
> +       vzeroupper
> +
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by an atanhf call.  */
> +       movl    %edx, %r13d
> +L(SPECIAL_VALUES_LOOP):
> +       /* Use r12 as the index of the special value, preserved across calls
> +          to atanhf.  Technically a callee-saved register isn't needed here,
> +          as the offset from rsp is always in [0, 60], so rsp could instead
> +          be restored by realigning to 64.  The tradeoff is one extra
> +          save/restore vs. two extra instructions in the loop.  */
> +       xorl    %r12d, %r12d
> +       tzcntl  %r13d, %r12d
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   64(%rsp, %r12, 4), %xmm0
> +       call    atanhf@PLT
> +
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return.  `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee-save restoration.  */
> +       movss   %xmm0, (%rsp, %r12, 4)
> +
> +       blsr    %r13d, %r13d
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +
> +       /* All results have been written to (%rsp).  */
> +       vmovaps (%rsp), %zmm0
> +       /* Restore rsp.  */
> +       movq    %rbp, %rsp
> +       /* Restore callee save registers.  */
> +       popq    %r12
> +       popq    %r13
> +       popq    %rbp
> +       ret
>  END(_ZGVeN16v_atanhf_skx)
>
> -        .section .rodata, "a"
> -        .align 64
> +       .section .rodata, "a"
> +       .align  64
>
>  #ifdef __svml_satanh_data_internal_avx512_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> -        __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
> -        __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
> -        __declspec(align(64)) VUINT32 One[16][1];
> -        __declspec(align(64)) VUINT32 AbsMask[16][1];
> -        __declspec(align(64)) VUINT32 AddB5[16][1];
> -        __declspec(align(64)) VUINT32 RcpBitMask[16][1];
> -        __declspec(align(64)) VUINT32 poly_coeff3[16][1];
> -        __declspec(align(64)) VUINT32 poly_coeff2[16][1];
> -        __declspec(align(64)) VUINT32 poly_coeff1[16][1];
> -        __declspec(align(64)) VUINT32 poly_coeff0[16][1];
> -        __declspec(align(64)) VUINT32 Half[16][1];
> -        __declspec(align(64)) VUINT32 L2H[16][1];
> -        __declspec(align(64)) VUINT32 L2L[16][1];
> -    } __svml_satanh_data_internal_avx512;
> +       typedef unsigned int VUINT32;
> +       typedef struct {
> +               __declspec (align(64)) VUINT32 AbsMask[16][1];
> +               __declspec (align(64)) VUINT32 One[16][1];
> +               __declspec (align(64)) VUINT32 AddB5[16][1];
> +               __declspec (align(64)) VUINT32 RcpBitMask[16][1];
> +               __declspec (align(64)) VUINT32 Log_tbl_L_lo[16][1];
> +               __declspec (align(64)) VUINT32 Log_tbl_L_hi[16][1];
> +               __declspec (align(64)) VUINT32 Log_tbl_H_lo[16][1];
> +               __declspec (align(64)) VUINT32 Log_tbl_H_hi[16][1];
> +               __declspec (align(64)) VUINT32 L2H[16][1];
> +               __declspec (align(64)) VUINT32 L2L[16][1];
> +               __declspec (align(64)) VUINT32 poly_coeff3[16][1];
> +               __declspec (align(64)) VUINT32 poly_coeff2[16][1];
> +               __declspec (align(64)) VUINT32 poly_coeff1[16][1];
> +       } __svml_satanh_data_internal_avx512;
>  #endif
>  __svml_satanh_data_internal_avx512:
> -        /*== Log_tbl_H ==*/
> -        .long 0x00000000
> -        .long 0x3cfc0000
> -        .long 0x3d780000
> -        .long 0x3db78000
> -        .long 0x3df10000
> -        .long 0x3e14c000
> -        .long 0x3e300000
> -        .long 0x3e4a8000
> -        .long 0x3e648000
> -        .long 0x3e7dc000
> -        .long 0x3e8b4000
> -        .long 0x3e974000
> -        .long 0x3ea30000
> -        .long 0x3eae8000
> -        .long 0x3eb9c000
> -        .long 0x3ec4e000
> -        .long 0x3ecfa000
> -        .long 0x3eda2000
> -        .long 0x3ee48000
> -        .long 0x3eeea000
> -        .long 0x3ef8a000
> -        .long 0x3f013000
> -        .long 0x3f05f000
> -        .long 0x3f0aa000
> -        .long 0x3f0f4000
> -        .long 0x3f13d000
> -        .long 0x3f184000
> -        .long 0x3f1ca000
> -        .long 0x3f20f000
> -        .long 0x3f252000
> -        .long 0x3f295000
> -        .long 0x3f2d7000
> -        /*== Log_tbl_L ==*/
> -        .align 64
> -        .long 0x00000000
> -        .long 0x3726c39e
> -        .long 0x38a30c01
> -        .long 0x37528ae5
> -        .long 0x38e0edc5
> -        .long 0xb8ab41f8
> -        .long 0xb7cf8f58
> -        .long 0x3896a73d
> -        .long 0xb5838656
> -        .long 0x380c36af
> -        .long 0xb8235454
> -        .long 0x3862bae1
> -        .long 0x38c5e10e
> -        .long 0x38dedfac
> -        .long 0x38ebfb5e
> -        .long 0xb8e63c9f
> -        .long 0xb85c1340
> -        .long 0x38777bcd
> -        .long 0xb6038656
> -        .long 0x37d40984
> -        .long 0xb8b85028
> -        .long 0xb8ad5a5a
> -        .long 0x3865c84a
> -        .long 0x38c3d2f5
> -        .long 0x383ebce1
> -        .long 0xb8a1ed76
> -        .long 0xb7a332c4
> -        .long 0xb779654f
> -        .long 0xb8602f73
> -        .long 0x38f85db0
> -        .long 0x37b4996f
> -        .long 0xb8bfb3ca
> -        /*== One ==*/
> -        .align 64
> -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -        /*== AbsMask ==*/
> -        .align 64
> -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> -        /*== AddB5 ==*/
> -        .align 64
> -        .long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> -        /*== RcpBitMask ==*/
> -        .align 64
> -        .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> -        /*== poly_coeff3 ==*/
> -        .align 64
> -        .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> -        /*== poly_coeff2 ==*/
> -        .align 64
> -        .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> -        /*== poly_coeff1 ==*/
> -        .align 64
> -        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> -        /*== poly_coeff0 ==*/
> -        .align 64
> -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -        /*== Half ==*/
> -        .align 64
> -        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> -        /*== L2H = log(2)_high ==*/
> -        .align 64
> -        .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> -        /*== L2L = log(2)_low ==*/
> -        .align 64
> -        .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> -        .align 64
> -        .type  __svml_satanh_data_internal_avx512,@object
> -        .size  __svml_satanh_data_internal_avx512,.-__svml_satanh_data_internal_avx512
> +       /* AbsMask.  */
> +       .align  64
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       /* One.  */
> +       .align  64
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       /* AddB5.  */
> +       .align  64
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       /* RcpBitMask.  */
> +       .align  64
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       /* Log_tbl_L_lo.  */
> +       .align  64
> +       .long   0x00000000
> +       .long   0x3726c39e
> +       .long   0x38a30c01
> +       .long   0x37528ae5
> +       .long   0x38e0edc5
> +       .long   0xb8ab41f8
> +       .long   0xb7cf8f58
> +       .long   0x3896a73d
> +       .long   0xb5838656
> +       .long   0x380c36af
> +       .long   0xb8235454
> +       .long   0x3862bae1
> +       .long   0x38c5e10e
> +       .long   0x38dedfac
> +       .long   0x38ebfb5e
> +       .long   0xb8e63c9f
> +       /* Log_tbl_L_hi.  */
> +       .align  64
> +       .long   0xb85c1340
> +       .long   0x38777bcd
> +       .long   0xb6038656
> +       .long   0x37d40984
> +       .long   0xb8b85028
> +       .long   0xb8ad5a5a
> +       .long   0x3865c84a
> +       .long   0x38c3d2f5
> +       .long   0x383ebce1
> +       .long   0xb8a1ed76
> +       .long   0xb7a332c4
> +       .long   0xb779654f
> +       .long   0xb8602f73
> +       .long   0x38f85db0
> +       .long   0x37b4996f
> +       .long   0xb8bfb3ca
> +       /* Log_tbl_H_lo.  */
> +       .align  64
> +       .long   0x00000000
> +       .long   0x3cfc0000
> +       .long   0x3d780000
> +       .long   0x3db78000
> +       .long   0x3df10000
> +       .long   0x3e14c000
> +       .long   0x3e300000
> +       .long   0x3e4a8000
> +       .long   0x3e648000
> +       .long   0x3e7dc000
> +       .long   0x3e8b4000
> +       .long   0x3e974000
> +       .long   0x3ea30000
> +       .long   0x3eae8000
> +       .long   0x3eb9c000
> +       .long   0x3ec4e000
> +       /* Log_tbl_H_hi.  */
> +       .align  64
> +       .long   0x3ecfa000
> +       .long   0x3eda2000
> +       .long   0x3ee48000
> +       .long   0x3eeea000
> +       .long   0x3ef8a000
> +       .long   0x3f013000
> +       .long   0x3f05f000
> +       .long   0x3f0aa000
> +       .long   0x3f0f4000
> +       .long   0x3f13d000
> +       .long   0x3f184000
> +       .long   0x3f1ca000
> +       .long   0x3f20f000
> +       .long   0x3f252000
> +       .long   0x3f295000
> +       .long   0x3f2d7000
> +       /* L2H = log(2)_high.  */
> +       .align  64
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       /* L2L = log(2)_low.  */
> +       .align  64
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       /* poly_coeff3.  */
> +       .align  64
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       /* poly_coeff2.  */
> +       .align  64
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       /* poly_coeff1.  */
> +       .align  64
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .align  64
> +       .type   __svml_satanh_data_internal_avx512, @object
> +       .size   __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> index 7927e01f0c..e1a8a28a3d 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> @@ -28,334 +28,278 @@
>   *   atanh(-1) = -INF
>   *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
>   *
> - */
> -
> -/* Offsets for data table __svml_satanh_data_internal
> - */
> -#define SgnMask                        0
> -#define sOne                           16
> -#define sPoly                          32
> -#define iBrkValue                      160
> -#define iOffExpoMask                   176
> -#define sHalf                          192
> -#define sSign                          208
> -#define sTopMask12                     224
> -#define TinyRange                      240
> -#define sLn2                           256
> +*/
>
> -#include <sysdep.h>
>
> -        .text
> -       .section .text.sse4,"ax",@progbits
> -ENTRY(_ZGVbN4v_atanhf_sse4)
> -        subq      $72, %rsp
> -        cfi_def_cfa_offset(80)
> -        movaps    %xmm0, %xmm5
> -
> -/* Load constants including One = 1 */
> -        movups    sOne+__svml_satanh_data_internal(%rip), %xmm4
> -        movaps    %xmm5, %xmm3
> -
> -/* Strip off the sign, so treat X as positive until right at the end */
> -        movups    SgnMask+__svml_satanh_data_internal(%rip), %xmm7
> -        movaps    %xmm4, %xmm8
> -        andps     %xmm5, %xmm7
> -        movaps    %xmm4, %xmm10
> -        movups    sTopMask12+__svml_satanh_data_internal(%rip), %xmm11
> -        movaps    %xmm4, %xmm14
> -        movaps    %xmm11, %xmm9
> +       /* Offsets for data table __svml_satanh_data_internal.  */
> +#define sOne   0
> +#define SgnMask        16
> +#define sTopMask12     32
> +#define iBrkValue      48
> +#define iOffExpoMask   64
> +#define sPoly  80
> +#define sLn2   208
> +#define TinyRange      224
>
> -/*
> - * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> - * the upper part UHi being <= 12 bits long. Then we have
> - * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> - */
> -        movaps    %xmm7, %xmm12
> -
> -/*
> - * Check whether |X| < 1, in which case we use the main function.
> - * Otherwise set the rangemask so that the callout will get used.
> - * Note that this will also use the callout for NaNs since not(NaN < 1).
> - */
> -        movaps    %xmm7, %xmm6
> -        movaps    %xmm7, %xmm2
> -        cmpnltps  %xmm4, %xmm6
> -        cmpltps   TinyRange+__svml_satanh_data_internal(%rip), %xmm2
> -        mulps     %xmm5, %xmm3
> -        subps     %xmm7, %xmm8
> -        addps     %xmm7, %xmm12
> -        movmskps  %xmm6, %edx
> -        subps     %xmm8, %xmm10
> -        addps     %xmm5, %xmm3
> -        subps     %xmm7, %xmm10
> -        andps     %xmm8, %xmm9
> -
> -/*
> - * Now we feed into the log1p code, using H in place of _VARG1 and
> - * later incorporating L into the reduced argument.
> - * compute 1+x as high, low parts
> - */
> -        movaps    %xmm4, %xmm7
> -
> -/*
> - * Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E
> - * The first FMR is exact (we force R to 12 bits just in case it
> - * isn't already, to make absolutely sure), and since E is ~ 2^-12,
> - * the rounding error in the other one is acceptable.
> - */
> -        rcpps     %xmm9, %xmm15
> -        subps     %xmm9, %xmm8
> -        andps     %xmm11, %xmm15
>
> -/*
> - * Split V as well into upper 12 bits and lower part, so that we can get
> - * a preliminary quotient estimate without rounding error.
> - */
> -        andps     %xmm12, %xmm11
> -        mulps     %xmm15, %xmm9
> -        addps     %xmm8, %xmm10
> -        subps     %xmm11, %xmm12
> -
> -/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
> -        mulps     %xmm15, %xmm11
> -        mulps     %xmm15, %xmm10
> -        subps     %xmm9, %xmm14
> -        mulps     %xmm12, %xmm15
> -        subps     %xmm10, %xmm14
> -
> -/* Compute D = E + E^2 */
> -        movaps    %xmm14, %xmm13
> -        movaps    %xmm4, %xmm8
> -        mulps     %xmm14, %xmm13
> -
> -/* reduction: compute r,n */
> -        movdqu    iBrkValue+__svml_satanh_data_internal(%rip), %xmm9
> -        addps     %xmm13, %xmm14
> +#include <sysdep.h>
> +#define TANHF_DATA(x)  (x)     +       __svml_satanh_data_internal
>
> -/*
> - * Compute R * (VHi + VLo) * (1 + E + E^2)
> - * = R *  (VHi + VLo) * (1 + D)
> - * = QHi + (QHi * D + QLo + QLo * D)
> - */
> -        movaps    %xmm14, %xmm0
> -        mulps     %xmm15, %xmm14
> -        mulps     %xmm11, %xmm0
> -        addps     %xmm14, %xmm15
> -        movdqu    iOffExpoMask+__svml_satanh_data_internal(%rip), %xmm12
> -        movaps    %xmm4, %xmm14
> -
> -/* Record the sign for eventual reincorporation. */
> -        movups    sSign+__svml_satanh_data_internal(%rip), %xmm1
> -        addps     %xmm15, %xmm0
> +       .text
> +       .section .text.sse4, "ax", @progbits
> +ENTRY(_ZGVbN4v_atanhf_sse4)
> +       movaps  %xmm0, %xmm5
> +
> +       /* Load constants including One = 1.  */
> +       movups  TANHF_DATA(sOne)(%rip), %xmm4
> +       movaps  %xmm5, %xmm3
> +
> +       /* Strip off the sign, so treat X as positive until right at the end.
> +        */
> +       movups  TANHF_DATA(SgnMask)(%rip), %xmm1
> +       movaps  %xmm4, %xmm2
> +       andps   %xmm1, %xmm0
> +       movaps  %xmm4, %xmm10
> +       movups  TANHF_DATA(sTopMask12)(%rip), %xmm11
> +       movaps  %xmm4, %xmm14
> +       movaps  %xmm11, %xmm9
> +
> +
> +       /* Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> +          the upper part UHi being <= 12 bits long. Then we have atanh(X) = 1/2 *
> +          log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).  */
> +       movaps  %xmm0, %xmm6
> +       mulps   %xmm5, %xmm3
> +       subps   %xmm0, %xmm2
> +       addps   %xmm0, %xmm6
> +       subps   %xmm2, %xmm10
> +       addps   %xmm5, %xmm3
> +       subps   %xmm0, %xmm10
> +       andps   %xmm2, %xmm9
> +
> +
> +       /* Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E.  The
> +          first FMR is exact (we force R to 12 bits just in case it isn't
> +          already, to make absolutely sure), and since E is ~ 2^-12, the
> +          rounding error in the other one is acceptable.  */
> +       rcpps   %xmm9, %xmm7
> +       subps   %xmm9, %xmm2
> +       andps   %xmm11, %xmm7
> +
> +
> +       /* Split V as well into upper 12 bits and lower part, so that we can get
> +          a preliminary quotient estimate without rounding error.  */
> +
> +       andps   %xmm6, %xmm11
> +       mulps   %xmm7, %xmm9
> +       addps   %xmm2, %xmm10
> +       subps   %xmm11, %xmm6
> +
> +       /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.
> +        */
> +       mulps   %xmm7, %xmm11
> +       mulps   %xmm7, %xmm10
> +       subps   %xmm9, %xmm14
> +       mulps   %xmm6, %xmm7
> +       subps   %xmm10, %xmm14
> +
> +       /* Compute D = E + E^2.  */
> +       movaps  %xmm14, %xmm13
> +       movaps  %xmm4, %xmm8
> +       mulps   %xmm14, %xmm13
> +
> +       /* reduction: compute r,n.  */
> +       movdqu  TANHF_DATA(iBrkValue)(%rip), %xmm9
> +       addps   %xmm13, %xmm14
> +
> +       /*
> +        * Compute R * (VHi + VLo) * (1 + E + E^2)
> +        * = R *  (VHi + VLo) * (1 + D)
> +        * = QHi + (QHi * D + QLo + QLo * D)
> +        */
> +       movaps  %xmm14, %xmm2
> +       mulps   %xmm7, %xmm14
> +       mulps   %xmm11, %xmm2
> +       addps   %xmm14, %xmm7
> +       movdqu  TANHF_DATA(iOffExpoMask)(%rip), %xmm12
> +       movaps  %xmm4, %xmm14
> +
> +       /* xmm2 = QHi * D + QLo + QLo * D.  */
> +       addps   %xmm7, %xmm2
> +
> +
> +       /* Now finally accumulate the high and low parts of the argument to
> +          log1p, H + L, with a final compensated summation.  */
> +       movaps  %xmm2, %xmm6
> +       /* Record the sign: xmm1 = sign bit of x; it is or'ed into the
> +          tiny-path result later so atanh(-0) comes out correctly.  */
> +       andnps  %xmm5, %xmm1
> +       movaps  %xmm4, %xmm7
> +       addps   %xmm11, %xmm6
> +       maxps   %xmm6, %xmm7
> +       minps   %xmm6, %xmm8
> +       subps   %xmm6, %xmm11
> +       movaps  %xmm7, %xmm10
> +       addps   %xmm8, %xmm10
> +       addps   %xmm11, %xmm2
> +       subps   %xmm10, %xmm7
> +       psubd   %xmm9, %xmm10
> +       addps   %xmm8, %xmm7
> +       pand    %xmm10, %xmm12
> +       psrad   $23, %xmm10
> +       cvtdq2ps %xmm10, %xmm13
> +       addps   %xmm7, %xmm2
> +
> +       /* final reconstruction.  */
> +       pslld   $23, %xmm10
> +       paddd   %xmm9, %xmm12
> +       psubd   %xmm10, %xmm14
> +
> +       /* polynomial evaluation.  */
> +       subps   %xmm4, %xmm12
> +       mulps   %xmm14, %xmm2
> +       movups  TANHF_DATA(sPoly)(%rip), %xmm7
> +       addps   %xmm12, %xmm2
> +       mulps   %xmm2, %xmm7
> +
> +
> +       /* Finally, halve the result and reincorporate the sign.  */
> +       addps   TANHF_DATA(sPoly + 16)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   TANHF_DATA(sPoly + 32)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   TANHF_DATA(sPoly + 48)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   TANHF_DATA(sPoly + 64)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   TANHF_DATA(sPoly + 80)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       addps   TANHF_DATA(sPoly + 96)(%rip), %xmm7
> +       mulps   %xmm2, %xmm7
> +       movaps  TANHF_DATA(sPoly + 112)(%rip), %xmm6
> +       addps   %xmm6, %xmm7
> +       mulps   %xmm2, %xmm7
> +       mulps   %xmm2, %xmm7
> +       mulps   TANHF_DATA(sLn2)(%rip), %xmm13
> +       /* We can build `sHalf` with `sPoly & sOne`.  */
> +       andps   %xmm4, %xmm6
> +       orps    %xmm1, %xmm3
> +       xorps   %xmm6, %xmm1
> +
> +       addps   %xmm2, %xmm7
> +       addps   %xmm13, %xmm7
> +       mulps   %xmm7, %xmm1
> +
> +       /* Check whether |X| < 1, in which case we use the main function.
> +          Otherwise set the rangemask so that the callout will get used. Note that
> +          this will also use the callout for NaNs since not(NaN < 1).  */
> +       cmpleps %xmm0, %xmm4
> +       movmskps %xmm4, %edx
> +       cmpltps TANHF_DATA(TinyRange)(%rip), %xmm0
> +
> +       andps   %xmm0, %xmm3
> +       andnps  %xmm1, %xmm0
> +       orps    %xmm3, %xmm0
> +
> +       testl   %edx, %edx
> +       /* Go to special inputs processing branch.  */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +
> +       /* No registers to restore on fast path.  */
> +       ret
> +
> +       /* Branch to process special inputs.  */
> +L(SPECIAL_VALUES_BRANCH):
> +       subq    $56, %rsp
>
> -/*
> - * Now finally accumulate the high and low parts of the
> - * argument to log1p, H + L, with a final compensated summation.
> - */
> -        movaps    %xmm0, %xmm6
> -        andps     %xmm5, %xmm1
> -
> -/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
> -        orps      %xmm1, %xmm3
> -        addps     %xmm11, %xmm6
> -        maxps     %xmm6, %xmm7
> -        minps     %xmm6, %xmm8
> -        subps     %xmm6, %xmm11
> -        movaps    %xmm7, %xmm10
> -        andps     %xmm2, %xmm3
> -        addps     %xmm8, %xmm10
> -        addps     %xmm11, %xmm0
> -        subps     %xmm10, %xmm7
> -        psubd     %xmm9, %xmm10
> -        addps     %xmm7, %xmm8
> -        pand      %xmm10, %xmm12
> -        psrad     $23, %xmm10
> -        cvtdq2ps  %xmm10, %xmm13
> -        addps     %xmm8, %xmm0
> -
> -/* final reconstruction */
> -        mulps     sLn2+__svml_satanh_data_internal(%rip), %xmm13
> -        pslld     $23, %xmm10
> -        paddd     %xmm9, %xmm12
> -        psubd     %xmm10, %xmm14
> -
> -/* polynomial evaluation */
> -        subps     %xmm4, %xmm12
> -        mulps     %xmm0, %xmm14
> -        movups    sPoly+112+__svml_satanh_data_internal(%rip), %xmm0
> -        addps     %xmm12, %xmm14
> -        mulps     %xmm14, %xmm0
> -
> -/* Finally, halve the result and reincorporate the sign */
> -        movups    sHalf+__svml_satanh_data_internal(%rip), %xmm4
> -        pxor      %xmm1, %xmm4
> -        addps     sPoly+96+__svml_satanh_data_internal(%rip), %xmm0
> -        mulps     %xmm14, %xmm0
> -        addps     sPoly+80+__svml_satanh_data_internal(%rip), %xmm0
> -        mulps     %xmm14, %xmm0
> -        addps     sPoly+64+__svml_satanh_data_internal(%rip), %xmm0
> -        mulps     %xmm14, %xmm0
> -        addps     sPoly+48+__svml_satanh_data_internal(%rip), %xmm0
> -        mulps     %xmm14, %xmm0
> -        addps     sPoly+32+__svml_satanh_data_internal(%rip), %xmm0
> -        mulps     %xmm14, %xmm0
> -        addps     sPoly+16+__svml_satanh_data_internal(%rip), %xmm0
> -        mulps     %xmm14, %xmm0
> -        addps     sPoly+__svml_satanh_data_internal(%rip), %xmm0
> -        mulps     %xmm14, %xmm0
> -        mulps     %xmm14, %xmm0
> -        addps     %xmm0, %xmm14
> -        movaps    %xmm2, %xmm0
> -        addps     %xmm13, %xmm14
> -        mulps     %xmm14, %xmm4
> -        andnps    %xmm4, %xmm0
> -        orps      %xmm3, %xmm0
> -        testl     %edx, %edx
> -
> -/* Go to special inputs processing branch */
> -        jne       L(SPECIAL_VALUES_BRANCH)
> -                                # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
> -
> -/* Restore registers
> - * and exit the function
> - */
> -
> -L(EXIT):
> -        addq      $72, %rsp
> -        cfi_def_cfa_offset(8)
> -        ret
> -        cfi_def_cfa_offset(80)
> -
> -/* Branch to process
> - * special inputs
> - */
> +       movups  %xmm5, (%rsp)
> +       movups  %xmm0, 16(%rsp)
>
> -L(SPECIAL_VALUES_BRANCH):
> -        movups    %xmm5, 32(%rsp)
> -        movups    %xmm0, 48(%rsp)
> -                                # LOE rbx rbp r12 r13 r14 r15 edx
> -
> -        xorl      %eax, %eax
> -        movq      %r12, 16(%rsp)
> -        cfi_offset(12, -64)
> -        movl      %eax, %r12d
> -        movq      %r13, 8(%rsp)
> -        cfi_offset(13, -72)
> -        movl      %edx, %r13d
> -        movq      %r14, (%rsp)
> -        cfi_offset(14, -80)
> -                                # LOE rbx rbp r15 r12d r13d
> -
> -/* Range mask
> - * bits check
> - */
> -
> -L(RANGEMASK_CHECK):
> -        btl       %r12d, %r13d
> -
> -/* Call scalar math function */
> -        jc        L(SCALAR_MATH_CALL)
> -                                # LOE rbx rbp r15 r12d r13d
> -
> -/* Special inputs
> - * processing loop
> - */
> +       movq    %r12, 32(%rsp)
> +       movq    %r13, 40(%rsp)
>
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by an atanhf call.  */
> +       movl    %edx, %r13d
>  L(SPECIAL_VALUES_LOOP):
> -        incl      %r12d
> -        cmpl      $4, %r12d
> -
> -/* Check bits in range mask */
> -        jl        L(RANGEMASK_CHECK)
> -                                # LOE rbx rbp r15 r12d r13d
> -
> -        movq      16(%rsp), %r12
> -        cfi_restore(12)
> -        movq      8(%rsp), %r13
> -        cfi_restore(13)
> -        movq      (%rsp), %r14
> -        cfi_restore(14)
> -        movups    48(%rsp), %xmm0
> -
> -/* Go to exit */
> -        jmp       L(EXIT)
> -        cfi_offset(12, -64)
> -        cfi_offset(13, -72)
> -        cfi_offset(14, -80)
> -                                # LOE rbx rbp r12 r13 r14 r15 xmm0
> -
> -/* Scalar math fucntion call
> - * to process special input
> - */
> -
> -L(SCALAR_MATH_CALL):
> -        movl      %r12d, %r14d
> -        movss     32(%rsp,%r14,4), %xmm0
> -        call      atanhf@PLT
> -                                # LOE rbx rbp r14 r15 r12d r13d xmm0
> -
> -        movss     %xmm0, 48(%rsp,%r14,4)
> -
> -/* Process special inputs in loop */
> -        jmp       L(SPECIAL_VALUES_LOOP)
> -                                # LOE rbx rbp r15 r12d r13d
> +       /* Use r12 as the index of the special value, preserved across calls
> +          to atanhf.  Technically a callee-saved register isn't needed here,
> +          as the offset from rsp is always in [0, 12], so rsp could instead
> +          be restored by realigning to 64.  The tradeoff is one extra
> +          save/restore vs. two extra instructions in the loop.  */
> +       xorl    %r12d, %r12d
> +       bsfl    %r13d, %r12d
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   (%rsp, %r12, 4), %xmm0
> +       call    atanhf@PLT
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return.  `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee-save restoration.  */
> +       movss   %xmm0, 16(%rsp, %r12, 4)
> +
> +       leal    -1(%r13), %eax
> +       andl    %eax, %r13d
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +
> +       /* All results have been written to 16(%rsp).  */
> +       movups  16(%rsp), %xmm0
> +       movq    32(%rsp), %r12
> +       movq    40(%rsp), %r13
> +       addq    $56, %rsp
> +       ret
>  END(_ZGVbN4v_atanhf_sse4)
>
> -        .section .rodata, "a"
> -        .align 16
> +       .section .rodata, "a"
> +       .align  16
>
>  #ifdef __svml_satanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> -        __declspec(align(16)) VUINT32 SgnMask[4][1];
> -        __declspec(align(16)) VUINT32 sOne[4][1];
> -        __declspec(align(16)) VUINT32 sPoly[8][4][1];
> -        __declspec(align(16)) VUINT32 iBrkValue[4][1];
> -        __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
> -        __declspec(align(16)) VUINT32 sHalf[4][1];
> -        __declspec(align(16)) VUINT32 sSign[4][1];
> -        __declspec(align(16)) VUINT32 sTopMask12[4][1];
> -        __declspec(align(16)) VUINT32 TinyRange[4][1];
> -        __declspec(align(16)) VUINT32 sLn2[4][1];
> -} __svml_satanh_data_internal;
> +       typedef unsigned int VUINT32;
> +       typedef struct {
> +               __declspec (align(16)) VUINT32 sOne[4][1];
> +               __declspec (align(16)) VUINT32 SgnMask[4][1];
> +               __declspec (align(16)) VUINT32 sTopMask12[4][1];
> +               __declspec (align(16)) VUINT32 iBrkValue[4][1];
> +               __declspec (align(16)) VUINT32 iOffExpoMask[4][1];
> +               __declspec (align(16)) VUINT32 sPoly[8][4][1];
> +               __declspec (align(16)) VUINT32 sLn2[4][1];
> +               __declspec (align(16)) VUINT32 TinyRange[4][1];
> +       } __svml_satanh_data_internal;
>  #endif
> +
>  __svml_satanh_data_internal:
> -        /*== SgnMask ==*/
> -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> -        /*== sOne = SP 1.0 ==*/
> -        .align 16
> -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -        /*== sPoly[] = SP polynomial ==*/
> -        .align 16
> -        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> -        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /*  3.3333265781402587890625000e-01 P1 */
> -        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> -        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /*  2.0007920265197753906250000e-01 P3 */
> -        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> -        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /*  1.4042308926582336425781250e-01 P5 */
> -        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> -        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /*  1.3820238411426544189453125e-01 P7 */
> -        /*== iBrkValue = SP 2/3 ==*/
> -        .align 16
> -        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> -        /*== iOffExpoMask = SP significand mask ==*/
> -        .align 16
> -        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> -        /*== sHalf ==*/
> -        .align 16
> -        .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> -        /*== sSign ==*/
> -        .align 16
> -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
> -        /*== sTopMask12 ==*/
> -        .align 16
> -        .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> -        /*== TinyRange ==*/
> -        .align 16
> -        .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> -        /*== sLn2 = SP ln(2) ==*/
> -        .align 16
> -        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> -        .align 16
> -        .type  __svml_satanh_data_internal,@object
> -        .size  __svml_satanh_data_internal,.-__svml_satanh_data_internal
> +       /* sOne = SP 1.0.  */
> +       .align  16
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       /* SgnMask.  */
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       /* sTopMask12.  */
> +       .align  16
> +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> +       /* iBrkValue = SP 2/3.  */
> +       .align  16
> +       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> +       /* iOffExpoMask = SP significand mask.  */
> +       .align  16
> +       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> +
> +       /* sPoly[] = SP polynomial.  */
> +       .align  16
> +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed          /* 1.3820238411426544189453125e-01 P7.  */
> +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3          /* -1.5122179687023162841796875e-01 P6.  */
> +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12          /* 1.4042308926582336425781250e-01 P5.  */
> +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37          /* -1.6472326219081878662109375e-01 P4.  */
> +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190          /* 2.0007920265197753906250000e-01 P3.  */
> +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e          /* -2.5004237890243530273437500e-01 P2.  */
> +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94          /* 3.3333265781402587890625000e-01 P1.  */
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000          /* -5.0000000000000000000000000e-01 P0.  */
> +
> +       /* sLn2 = SP ln(2).  */
> +       .align  16
> +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       /* TinyRange.  */
> +       .align  16
> +       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> +       .align  16
> +       .type   __svml_satanh_data_internal, @object
> +       .size   __svml_satanh_data_internal, .-__svml_satanh_data_internal
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> index e67fb5dc92..982029e648 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> @@ -30,306 +30,267 @@
>   *
>   */
>
> -/* Offsets for data table __svml_satanh_data_internal
> - */
> -#define SgnMask                        0
> -#define sOne                           32
> -#define sPoly                          64
> -#define iBrkValue                      320
> -#define iOffExpoMask                   352
> -#define sHalf                          384
> -#define sSign                          416
> -#define sTopMask12                     448
> -#define TinyRange                      480
> -#define sLn2                           512
> +
> +       /* Offsets for data table __svml_satanh_data_internal.  */
> +#define SgnMask        0
> +#define sOne   32
> +#define sTopMask12     64
> +#define TinyRange      96
> +#define iBrkValue      128
> +#define iOffExpoMask   160
> +#define sPoly  192
> +#define sLn2   448
> +#define sHalf  480
>
>  #include <sysdep.h>
> +#define TANHF_DATA(x)  (x)     +       __svml_satanh_data_internal
>
> -        .text
> -       .section .text.avx2,"ax",@progbits
> +       .text
> +       .section .text.avx2, "ax", @progbits
>  ENTRY(_ZGVdN8v_atanhf_avx2)
> -        pushq     %rbp
> -        cfi_def_cfa_offset(16)
> -        movq      %rsp, %rbp
> -        cfi_def_cfa(6, 16)
> -        cfi_offset(6, -16)
> -        andq      $-32, %rsp
> -        subq      $96, %rsp
> -
> -/* Load constants including One = 1 */
> -        vmovups   sOne+__svml_satanh_data_internal(%rip), %ymm5
> -        vmovups   sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
> -        vmovaps   %ymm0, %ymm6
> -
> -/* Strip off the sign, so treat X as positive until right at the end */
> -        vandps    SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
> -        vsubps    %ymm10, %ymm5, %ymm1
> -
> -/*
> - * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> - * the upper part UHi being <= 12 bits long. Then we have
> - * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> - */
> -        vaddps    %ymm10, %ymm10, %ymm14
> -
> -/*
> - * Check whether |X| < 1, in which case we use the main function.
> - * Otherwise set the rangemask so that the callout will get used.
> - * Note that this will also use the callout for NaNs since not(NaN < 1).
> - */
> -        vcmpnlt_uqps %ymm5, %ymm10, %ymm7
> -        vsubps    %ymm1, %ymm5, %ymm9
> -        vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
> -        vrcpps    %ymm1, %ymm11
> -        vsubps    %ymm10, %ymm9, %ymm12
> -        vandps    %ymm13, %ymm11, %ymm0
> -
> -/* No need to split sU when FMA is available */
> -        vfnmadd213ps %ymm5, %ymm0, %ymm1
> -        vmovaps   %ymm6, %ymm8
> -        vfmadd213ps %ymm6, %ymm6, %ymm8
> -        vfnmadd231ps %ymm0, %ymm12, %ymm1
> -
> -/*
> - * Split V as well into upper 12 bits and lower part, so that we can get
> - * a preliminary quotient estimate without rounding error.
> - */
> -        vandps    %ymm13, %ymm14, %ymm15
> -        vmovmskps %ymm7, %edx
> -        vsubps    %ymm15, %ymm14, %ymm7
> -
> -/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
> -        vmulps    %ymm15, %ymm0, %ymm10
> -
> -/* Compute D = E + E^2 */
> -        vfmadd213ps %ymm1, %ymm1, %ymm1
> -
> -/* Record the sign for eventual reincorporation. */
> -        vandps    sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
> -
> -/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
> -        vorps     %ymm3, %ymm8, %ymm2
> -        vmulps    %ymm7, %ymm0, %ymm8
> -
> -/*
> - * Compute R * (VHi + VLo) * (1 + E + E^2)
> - * = R *  (VHi + VLo) * (1 + D)
> - * = QHi + (QHi * D + QLo + QLo * D)
> - */
> -        vmulps    %ymm1, %ymm10, %ymm9
> -        vfmadd213ps %ymm8, %ymm8, %ymm1
> -        vaddps    %ymm1, %ymm9, %ymm1
> -
> -/* reduction: compute r,n */
> -        vmovups   iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
> -
> -/*
> - * Now finally accumulate the high and low parts of the
> - * argument to log1p, H + L, with a final compensated summation.
> - */
> -        vaddps    %ymm1, %ymm10, %ymm12
> -        vsubps    %ymm12, %ymm10, %ymm11
> -
> -/*
> - * Now we feed into the log1p code, using H in place of _VARG1 and
> - * later incorporating L into the reduced argument.
> - * compute 1+x as high, low parts
> - */
> -        vmaxps    %ymm12, %ymm5, %ymm13
> -        vminps    %ymm12, %ymm5, %ymm14
> -        vaddps    %ymm11, %ymm1, %ymm0
> -        vaddps    %ymm14, %ymm13, %ymm1
> -        vpsubd    %ymm9, %ymm1, %ymm7
> -        vsubps    %ymm1, %ymm13, %ymm15
> -        vpsrad    $23, %ymm7, %ymm10
> -        vpand     iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
> -        vaddps    %ymm15, %ymm14, %ymm13
> -        vpslld    $23, %ymm10, %ymm11
> -        vpaddd    %ymm9, %ymm8, %ymm15
> -        vaddps    %ymm13, %ymm0, %ymm14
> -        vcvtdq2ps %ymm10, %ymm0
> -        vpsubd    %ymm11, %ymm5, %ymm12
> -
> -/* polynomial evaluation */
> -        vsubps    %ymm5, %ymm15, %ymm5
> -        vmulps    %ymm14, %ymm12, %ymm1
> -        vaddps    %ymm5, %ymm1, %ymm5
> -        vmovups   sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
> -        vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -        vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -        vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -        vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -        vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -        vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -        vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> -        vmulps    %ymm1, %ymm5, %ymm7
> -        vfmadd213ps %ymm5, %ymm5, %ymm7
> -
> -/* final reconstruction */
> -        vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
> -
> -/* Finally, halve the result and reincorporate the sign */
> -        vxorps    sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
> -        vmulps    %ymm0, %ymm3, %ymm0
> -        vblendvps %ymm4, %ymm2, %ymm0, %ymm0
> -        testl     %edx, %edx
> -
> -/* Go to special inputs processing branch */
> -        jne       L(SPECIAL_VALUES_BRANCH)
> -                                # LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
> -
> -/* Restore registers
> - * and exit the function
> - */
> -
> -L(EXIT):
> -        movq      %rbp, %rsp
> -        popq      %rbp
> -        cfi_def_cfa(7, 8)
> -        cfi_restore(6)
> -        ret
> -        cfi_def_cfa(6, 16)
> -        cfi_offset(6, -16)
> -
> -/* Branch to process
> - * special inputs
> - */
> -
> +       /* Strip off the sign, so treat X as positive until right at the end.
> +        */
> +       vmovaps TANHF_DATA(SgnMask)(%rip), %ymm2
> +       vandps  %ymm2, %ymm0, %ymm3
> +       /* Load constants including One = 1.  */
> +       vmovups TANHF_DATA(sOne)(%rip), %ymm5
> +       vsubps  %ymm3, %ymm5, %ymm1
> +       vmovups TANHF_DATA(sTopMask12)(%rip), %ymm4
> +
> +       vrcpps  %ymm1, %ymm7
> +       vsubps  %ymm1, %ymm5, %ymm9
> +       vandps  %ymm4, %ymm7, %ymm6
> +       vsubps  %ymm3, %ymm9, %ymm7
> +
> +       /* No need to split sU when FMA is available.  */
> +       vfnmadd213ps %ymm5, %ymm6, %ymm1
> +       vmovaps %ymm0, %ymm8
> +       vfmadd213ps %ymm0, %ymm0, %ymm0
> +       vfnmadd231ps %ymm6, %ymm7, %ymm1
> +
> +       /* Check whether |X| < 1, in which case we use the main function.
> +          Otherwise set the rangemask so that the callout will get used. Note that
> +          this will also use the callout for NaNs since not(NaN < 1).  */
> +       vcmpnlt_uqps %ymm5, %ymm3, %ymm14
> +       vcmplt_oqps TANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
> +
> +       /* Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> +          the upper part UHi being <= 12 bits long. Then we have atanh(X) = 1/2 *
> +          log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).  */
> +       vaddps  %ymm3, %ymm3, %ymm3
> +
> +       /* Split V as well into upper 12 bits and lower part, so that we can get
> +          a preliminary quotient estimate without rounding error.  */
> +       vandps  %ymm4, %ymm3, %ymm4
> +       vsubps  %ymm4, %ymm3, %ymm7
> +
> +       /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.
> +        */
> +       vmulps  %ymm4, %ymm6, %ymm4
> +
> +       /* Compute D = E + E^2.  */
> +       vfmadd213ps %ymm1, %ymm1, %ymm1
> +
> +       /* Record the sign for eventual reincorporation.  */
> +       vandnps %ymm8, %ymm2, %ymm3
> +
> +       /* Or the sign bit in with the tiny result to handle atanh(-0)
> +          correctly.  */
> +       vorps   %ymm3, %ymm0, %ymm13
> +       vmulps  %ymm7, %ymm6, %ymm2
> +
> +       /*
> +          Compute R * (VHi + VLo) * (1 + E + E^2)
> +          = R *  (VHi + VLo) * (1 + D)
> +          = QHi + (QHi * D + QLo + QLo * D)
> +        */
> +       /* If less precision is acceptable, the `vmulps %ymm1, %ymm4, %ymm6;
> +          vaddps %ymm1, %ymm6, %ymm1` below can be replaced with `vfmadd231ps
> +          %ymm1, %ymm4, %ymm4`.  */
> +       vmulps  %ymm1, %ymm4, %ymm6
> +       vfmadd213ps %ymm2, %ymm2, %ymm1
> +       vaddps  %ymm1, %ymm6, %ymm1
> +
> +       /* Now finally accumulate the high and low parts of the argument to
> +          log1p, H + L, with a final compensated summation.  */
> +       vaddps  %ymm1, %ymm4, %ymm2
> +
> +       /* reduction: compute r,n.  */
> +       vmovups TANHF_DATA(iBrkValue)(%rip), %ymm9
> +
> +       /* Now we feed into the log1p code, using H in place of _VARG1 and later
> +          incorporating L into the reduced argument. compute 1+x as high, low
> +          parts.  */
> +       vmaxps  %ymm2, %ymm5, %ymm0
> +       vminps  %ymm2, %ymm5, %ymm6
> +
> +       /* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`).  */
> +       vsubps  %ymm2, %ymm4, %ymm2
> +       vaddps  %ymm6, %ymm0, %ymm4
> +       vpsubd  %ymm9, %ymm4, %ymm7
> +       vsubps  %ymm4, %ymm0, %ymm4
> +       vaddps  %ymm2, %ymm1, %ymm2
> +       vmovaps TANHF_DATA(iOffExpoMask)(%rip), %ymm1
> +
> +       vandps  %ymm1, %ymm7, %ymm0
> +       vaddps  %ymm4, %ymm6, %ymm4
> +       vandnps %ymm7, %ymm1, %ymm6
> +       vmovups TANHF_DATA(sPoly)(%rip), %ymm1
> +       vpaddd  %ymm9, %ymm0, %ymm0
> +       vaddps  %ymm4, %ymm2, %ymm4
> +       vpsubd  %ymm6, %ymm5, %ymm6
> +
> +       /* polynomial evaluation.  */
> +       vsubps  %ymm5, %ymm0, %ymm2
> +       vfmadd231ps %ymm4, %ymm6, %ymm2
> +       vfmadd213ps TANHF_DATA(sPoly + 32)(%rip), %ymm2, %ymm1
> +       vfmadd213ps TANHF_DATA(sPoly + 64)(%rip), %ymm2, %ymm1
> +       vfmadd213ps TANHF_DATA(sPoly + 96)(%rip), %ymm2, %ymm1
> +       vfmadd213ps TANHF_DATA(sPoly + 128)(%rip), %ymm2, %ymm1
> +       vfmadd213ps TANHF_DATA(sPoly + 160)(%rip), %ymm2, %ymm1
> +       vfmadd213ps TANHF_DATA(sPoly + 192)(%rip), %ymm2, %ymm1
> +       vfmadd213ps TANHF_DATA(sPoly + 224)(%rip), %ymm2, %ymm1
> +
> +       vmulps  %ymm1, %ymm2, %ymm1
> +       vfmadd213ps %ymm2, %ymm2, %ymm1
> +
> +       /* final reconstruction.  */
> +       vpsrad  $23, %ymm7, %ymm6
> +       vcvtdq2ps %ymm6, %ymm2
> +       vfmadd132ps TANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
> +
> +       /* Finally, halve the result and reincorporate the sign.  */
> +       vxorps  TANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
> +       vmulps  %ymm2, %ymm3, %ymm2
> +       vmovmskps %ymm14, %edx
> +       testl   %edx, %edx
> +
> +       vblendvps %ymm15, %ymm13, %ymm2, %ymm0
> +       /* Go to special inputs processing branch.  */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +
> +       /* No registers to restore on fast path.  */
> +       ret
> +
> +
> +       /* Branch to process special inputs.  */
>  L(SPECIAL_VALUES_BRANCH):
> -        vmovups   %ymm6, 32(%rsp)
> -        vmovups   %ymm0, 64(%rsp)
> -                                # LOE rbx r12 r13 r14 r15 edx ymm0
> -
> -        xorl      %eax, %eax
> -                                # LOE rbx r12 r13 r14 r15 eax edx
> -
> -        vzeroupper
> -        movq      %r12, 16(%rsp)
> -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> -        movl      %eax, %r12d
> -        movq      %r13, 8(%rsp)
> -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> -        movl      %edx, %r13d
> -        movq      %r14, (%rsp)
> -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> -                                # LOE rbx r15 r12d r13d
> -
> -/* Range mask
> - * bits check
> - */
> -
> -L(RANGEMASK_CHECK):
> -        btl       %r12d, %r13d
> -
> -/* Call scalar math function */
> -        jc        L(SCALAR_MATH_CALL)
> -                                # LOE rbx r15 r12d r13d
> -
> -/* Special inputs
> - * processing loop
> - */
> -
> +       pushq   %rbp
> +       /* Need callee-saved registers to preserve state across atanhf
> +          calls.  */
> +       pushq   %r12
> +       pushq   %r13
> +       movq    %rsp, %rbp
> +
> +       /* Align stack and make room for 2x ymm vectors.  */
> +       andq    $-32, %rsp
> +       addq    $-64, %rsp
> +
> +       /* Save the already computed results (ymm0 holds the blended
> +          fast-path values at this point).  */
> +       vmovups %ymm0, (%rsp)
> +       /* Save the original input (ymm8, copied from ymm0 on entry).  */
> +       vmovups %ymm8, 32(%rsp)
> +
> +       vzeroupper
> +
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by an atanhf call.  */
> +       movl    %edx, %r13d
>  L(SPECIAL_VALUES_LOOP):
> -        incl      %r12d
> -        cmpl      $8, %r12d
> -
> -/* Check bits in range mask */
> -        jl        L(RANGEMASK_CHECK)
> -                                # LOE rbx r15 r12d r13d
> -
> -        movq      16(%rsp), %r12
> -        cfi_restore(12)
> -        movq      8(%rsp), %r13
> -        cfi_restore(13)
> -        movq      (%rsp), %r14
> -        cfi_restore(14)
> -        vmovups   64(%rsp), %ymm0
> -
> -/* Go to exit */
> -        jmp       L(EXIT)
> -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> -                                # LOE rbx r12 r13 r14 r15 ymm0
> -
> -/* Scalar math fucntion call
> - * to process special input
> - */
> -
> -L(SCALAR_MATH_CALL):
> -        movl      %r12d, %r14d
> -        movss     32(%rsp,%r14,4), %xmm0
> -        call      atanhf@PLT
> -                                # LOE rbx r14 r15 r12d r13d xmm0
> -
> -        movss     %xmm0, 64(%rsp,%r14,4)
> -
> -/* Process special inputs in loop */
> -        jmp       L(SPECIAL_VALUES_LOOP)
> -                                # LOE rbx r15 r12d r13d
> +       /* Use r12 as the index of the special value; it is preserved across
> +          calls to atanhf. We technically don't need a callee-saved register
> +          here, as the offset from rsp is always in [0, 28], so rsp could be
> +          restored by realigning to 64. Essentially the tradeoff is one extra
> +          save/restore vs. two extra instructions in the loop.  */
> +       xorl    %r12d, %r12d
> +       tzcntl  %r13d, %r12d
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   32(%rsp, %r12, 4), %xmm0
> +       call    atanhf@PLT
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee-save restoration.  */
> +       movss   %xmm0, (%rsp, %r12, 4)
> +
> +       blsr    %r13d, %r13d
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +
> +       /* All results have been written back to (%rsp).  */
> +       vmovups (%rsp), %ymm0
> +       movq    %rbp, %rsp
> +       popq    %r13
> +       popq    %r12
> +       popq    %rbp
> +       ret
>  END(_ZGVdN8v_atanhf_avx2)
>
> -        .section .rodata, "a"
> -        .align 32
> -
> +       .section .rodata, "a"
> +       .align  32
>  #ifdef __svml_satanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> -        __declspec(align(32)) VUINT32 SgnMask[8][1];
> -        __declspec(align(32)) VUINT32 sOne[8][1];
> -        __declspec(align(32)) VUINT32 sPoly[8][8][1];
> -        __declspec(align(32)) VUINT32 iBrkValue[8][1];
> -        __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
> -        __declspec(align(32)) VUINT32 sHalf[8][1];
> -        __declspec(align(32)) VUINT32 sSign[8][1];
> -        __declspec(align(32)) VUINT32 sTopMask12[8][1];
> -        __declspec(align(32)) VUINT32 TinyRange[8][1];
> -        __declspec(align(32)) VUINT32 sLn2[8][1];
> -} __svml_satanh_data_internal;
> +       typedef unsigned int VUINT32;
> +       typedef struct{
> +       __declspec (align(32))VUINT32 SgnMask[8][1];
> +       __declspec (align(32))VUINT32 sOne[8][1];
> +       __declspec (align(32))VUINT32 sTopMask12[8][1];
> +       __declspec (align(32))VUINT32 TinyRange[8][1];
> +       __declspec (align(32))VUINT32 iBrkValue[8][1];
> +       __declspec (align(32))VUINT32 iOffExpoMask[8][1];
> +       __declspec (align(32))VUINT32 sPoly[8][8][1];
> +       __declspec (align(32))VUINT32 sLn2[8][1];
> +       __declspec (align(32))VUINT32 sHalf[8][1];
> +       }__svml_satanh_data_internal;
>  #endif
>  __svml_satanh_data_internal:
> -        /*== SgnMask ==*/
> -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> -        /*== sOne = SP 1.0 ==*/
> -        .align 32
> -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -        /*== sPoly[] = SP polynomial ==*/
> -        .align 32
> -        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> -        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /*  3.3333265781402587890625000e-01 P1 */
> -        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> -        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /*  2.0007920265197753906250000e-01 P3 */
> -        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> -        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /*  1.4042308926582336425781250e-01 P5 */
> -        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> -        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /*  1.3820238411426544189453125e-01 P7 */
> -        /*== iBrkValue = SP 2/3 ==*/
> -        .align 32
> -        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> -        /*== iOffExpoMask = SP significand mask ==*/
> -        .align 32
> -        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> -        /*== sHalf ==*/
> -        .align 32
> -        .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> -        /*== sSign ==*/
> -        .align 32
> -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> -        /*== sTopMask12 ==*/
> -        .align 32
> -        .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> -        /*== TinyRange ==*/
> -        .align 32
> -        .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> -        /*== sLn2 = SP ln(2) ==*/
> -        .align 32
> -        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> -        .align 32
> -        .type  __svml_satanh_data_internal,@object
> -        .size  __svml_satanh_data_internal,.-__svml_satanh_data_internal
> +       /* SgnMask.  */
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       /* sOne = SP 1.0.  */
> +       .align  32
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       /* sTopMask12.  */
> +       .align  32
> +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> +       .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> +       /* TinyRange.  */
> +       .align  32
> +       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> +       .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> +       /* iBrkValue = SP 2/3.  */
> +       .align  32
> +       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> +       .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> +       /* iOffExpoMask = SP significand mask.  */
> +       .align  32
> +       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> +       .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> +       /* sPoly[] = SP polynomial.  */
> +       .align  32
> +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
> +       .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed          /* 1.3820238411426544189453125e-01 P7.  */
> +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
> +       .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3          /* -1.5122179687023162841796875e-01 P6.  */
> +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
> +       .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12          /* 1.4042308926582336425781250e-01 P5.  */
> +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
> +       .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37          /* -1.6472326219081878662109375e-01 P4.  */
> +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
> +       .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190          /* 2.0007920265197753906250000e-01 P3.  */
> +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
> +       .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e          /* -2.5004237890243530273437500e-01 P2.  */
> +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
> +       .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94          /* 3.3333265781402587890625000e-01 P1.  */
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000          /* -5.0000000000000000000000000e-01 P0.  */
> +       /* sLn2 = SP ln(2).  */
> +       .align  32
> +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> +       /* sHalf.  */
> +       .align  32
> +       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> +       .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> +       .align  32
> +       .type   __svml_satanh_data_internal, @object
> +       .size   __svml_satanh_data_internal, .-__svml_satanh_data_internal
> --
> 2.25.1
>
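
For reviewers following the math rather than the scheduling, a couple of
reference sketches. All three kernels reduce atanh to log1p via the
identity quoted in the comments, atanh(x) = 1/2 * log((1 + x) / (1 - x)).
A minimal scalar model in C (illustrative only, not the vector algorithm):

    #include <math.h>

    /* atanh(x) = 0.5 * log((1 + x) / (1 - x))
                = 0.5 * (log1p(x) - log1p(-x)), for |x| < 1.  */
    static float
    atanhf_ref (float x)
    {
      return 0.5f * (log1pf (x) - log1pf (-x));
    }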
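
The "final compensated summation" mentioned in the comments is the classic
Fast2Sum step; the preceding vmaxps/vminps pair establishes the |a| >= |b|
precondition it needs. Sketch (assuming round-to-nearest):

    /* Fast2Sum: returns s and writes the rounding error to *e;
       s + *e == a + b exactly, provided |a| >= |b|.  */
    static float
    fast2sum (float a, float b, float *e)
    {
      *e = 0.0f;
      float s = a + b;
      *e = (a - s) + b;
      return s;
    }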
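
And the new special-value dispatch walks the set bits of the lane mask
rather than testing every lane. The equivalent C pattern (handle_special,
src and dst are placeholder names):

    #include <stdint.h>
    #include <math.h>

    static void
    handle_special (uint32_t mask, const float *src, float *dst)
    {
      while (mask != 0)
        {
          unsigned i = __builtin_ctz (mask); /* tzcntl %r13d, %r12d */
          dst[i] = atanhf (src[i]);          /* call   atanhf@PLT   */
          mask &= mask - 1;                  /* blsr   %r13d, %r13d */
        }
    }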
  

Patch

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
index f863f4f959..fbd84b2c8e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
@@ -31,363 +31,343 @@ 
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal_avx512
- */
-#define Log_tbl_H                     	0
-#define Log_tbl_L                     	128
-#define One                           	256
-#define AbsMask                       	320
-#define AddB5                         	384
-#define RcpBitMask                    	448
-#define poly_coeff3                   	512
-#define poly_coeff2                   	576
-#define poly_coeff1                   	640
-#define poly_coeff0                   	704
-#define Half                          	768
-#define L2H                           	832
-#define L2L                           	896
+
+    /* Offsets for data table __svml_satanh_data_internal_avx512.  */
+#define AbsMask	0
+#define One	64
+#define AddB5	128
+#define RcpBitMask	192
+#define Log_tbl_L_lo	256
+#define Log_tbl_L_hi	320
+#define Log_tbl_H_lo	384
+#define Log_tbl_H_hi	448
+#define L2H	512
+#define L2L	576
+#define poly_coeff3	640
+#define poly_coeff2	704
+#define poly_coeff1	768
 
 #include <sysdep.h>
+#define TANHF_DATA(x)	(x)	+	__svml_satanh_data_internal_avx512
 
-        .text
-	.section .text.exex512,"ax",@progbits
+	.text
+	.section .text.exex512, "ax", @progbits
 ENTRY(_ZGVeN16v_atanhf_skx)
-        pushq     %rbp
-        cfi_def_cfa_offset(16)
-        movq      %rsp, %rbp
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-        andq      $-64, %rsp
-        subq      $192, %rsp
-        vmovups   One+__svml_satanh_data_internal_avx512(%rip), %zmm4
-
-/* round reciprocals to 1+5b mantissas */
-        vmovups   AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
-        vmovups   RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
-        vmovaps   %zmm0, %zmm11
-        vandps    AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
-
-/* 1+y */
-        vaddps    {rn-sae}, %zmm4, %zmm6, %zmm9
-
-/* 1-y */
-        vsubps    {rn-sae}, %zmm6, %zmm4, %zmm8
-        vxorps    %zmm6, %zmm11, %zmm10
-
-/* Yp_high */
-        vsubps    {rn-sae}, %zmm4, %zmm9, %zmm2
-
-/* -Ym_high */
-        vsubps    {rn-sae}, %zmm4, %zmm8, %zmm5
-
-/* RcpP ~ 1/Yp */
-        vrcp14ps  %zmm9, %zmm12
-
-/* RcpM ~ 1/Ym */
-        vrcp14ps  %zmm8, %zmm13
-
-/* input outside (-1, 1) ? */
-        vcmpps    $21, {sae}, %zmm4, %zmm6, %k0
-        vpaddd    %zmm14, %zmm12, %zmm15
-        vpaddd    %zmm14, %zmm13, %zmm0
-
-/* Yp_low */
-        vsubps    {rn-sae}, %zmm2, %zmm6, %zmm3
-        vandps    %zmm1, %zmm15, %zmm7
-        vandps    %zmm1, %zmm0, %zmm12
-
-/* Ym_low */
-        vaddps    {rn-sae}, %zmm5, %zmm6, %zmm5
-
-/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
-        vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
-
-/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
-        vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
-        vmovups   Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
-        vmovups   Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
-
-/* exponents */
-        vgetexpps {sae}, %zmm7, %zmm15
-        vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
-
-/* Table lookups */
-        vmovups   __svml_satanh_data_internal_avx512(%rip), %zmm6
-        vgetexpps {sae}, %zmm12, %zmm14
-        vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
-
-/* Prepare table index */
-        vpsrld    $18, %zmm7, %zmm3
-        vpsrld    $18, %zmm12, %zmm2
-        vmovups   Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
-        vmovups   poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
-
-/* Km-Kp */
-        vsubps    {rn-sae}, %zmm15, %zmm14, %zmm1
-        kmovw     %k0, %edx
-        vmovaps   %zmm3, %zmm0
-        vpermi2ps %zmm13, %zmm8, %zmm3
-        vpermt2ps %zmm13, %zmm2, %zmm8
-        vpermi2ps %zmm7, %zmm6, %zmm0
-        vpermt2ps %zmm7, %zmm2, %zmm6
-        vsubps    {rn-sae}, %zmm3, %zmm8, %zmm5
-
-/* K*L2H + Th */
-        vmovups   L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
-
-/* K*L2L + Tl */
-        vmovups   L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
-
-/* polynomials */
-        vmovups   poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
-        vmovups   poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
-
-/* table values */
-        vsubps    {rn-sae}, %zmm0, %zmm6, %zmm0
-        vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
-        vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
-        vmovups   poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
-        vmovaps   %zmm3, %zmm2
-        vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
-        vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
-        vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
-        vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
-        vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
-        vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
-
-/* (K*L2L + Tl) + Rp*PolyP */
-        vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
-        vorps     Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
-
-/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
-        vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
-        vaddps    {rn-sae}, %zmm3, %zmm0, %zmm4
-        vmulps    {rn-sae}, %zmm9, %zmm4, %zmm0
-        testl     %edx, %edx
-
-/* Go to special inputs processing branch */
-        jne       L(SPECIAL_VALUES_BRANCH)
-                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
-
-/* Restore registers
- * and exit the function
- */
+	vandps	TANHF_DATA(AbsMask)(%rip), %zmm0, %zmm6
+	vmovups	TANHF_DATA(One)(%rip), %zmm4
 
-L(EXIT):
-        movq      %rbp, %rsp
-        popq      %rbp
-        cfi_def_cfa(7, 8)
-        cfi_restore(6)
-        ret
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-
-/* Branch to process
- * special inputs
- */
+	/* 1+y.  */
+	vaddps	{rn-sae}, %zmm4, %zmm6, %zmm9
 
-L(SPECIAL_VALUES_BRANCH):
-        vmovups   %zmm11, 64(%rsp)
-        vmovups   %zmm0, 128(%rsp)
-                                # LOE rbx r12 r13 r14 r15 edx zmm0
-
-        xorl      %eax, %eax
-                                # LOE rbx r12 r13 r14 r15 eax edx
-
-        vzeroupper
-        movq      %r12, 16(%rsp)
-        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-        movl      %eax, %r12d
-        movq      %r13, 8(%rsp)
-        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-        movl      %edx, %r13d
-        movq      %r14, (%rsp)
-        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-                                # LOE rbx r15 r12d r13d
-
-/* Range mask
- * bits check
- */
+	/* 1-y.  */
+	vsubps	{rn-sae}, %zmm6, %zmm4, %zmm8
 
-L(RANGEMASK_CHECK):
-        btl       %r12d, %r13d
+	/* round reciprocals to 1+5b mantissas.  */
+	vmovups	TANHF_DATA(AddB5)(%rip), %zmm14
+	vmovups	TANHF_DATA(RcpBitMask)(%rip), %zmm1
 
-/* Call scalar math function */
-        jc        L(SCALAR_MATH_CALL)
-                                # LOE rbx r15 r12d r13d
+	/* RcpP ~ 1/Yp.  */
+	vrcp14ps %zmm9, %zmm12
 
-/* Special inputs
- * processing loop
- */
+	/* RcpM ~ 1/Ym.  */
+	vrcp14ps %zmm8, %zmm13
 
-L(SPECIAL_VALUES_LOOP):
-        incl      %r12d
-        cmpl      $16, %r12d
-
-/* Check bits in range mask */
-        jl        L(RANGEMASK_CHECK)
-                                # LOE rbx r15 r12d r13d
-
-        movq      16(%rsp), %r12
-        cfi_restore(12)
-        movq      8(%rsp), %r13
-        cfi_restore(13)
-        movq      (%rsp), %r14
-        cfi_restore(14)
-        vmovups   128(%rsp), %zmm0
-
-/* Go to exit */
-        jmp       L(EXIT)
-        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-                                # LOE rbx r12 r13 r14 r15 zmm0
-
-/* Scalar math fucntion call
- * to process special input
- */
+	/* Yp_high.  */
+	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2
+
+	/* -Ym_high.  */
+	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5
+
+
+	/* Apply the AddB5 rounding bias to both reciprocals.  */
+	vpaddd	%zmm14, %zmm12, %zmm15
+	vpaddd	%zmm14, %zmm13, %zmm12
+
+	/* Yp_low.  */
+	vsubps	{rn-sae}, %zmm2, %zmm6, %zmm3
+	vandps	%zmm1, %zmm15, %zmm7
+	vandps	%zmm1, %zmm12, %zmm12
+
+	/* Ym_low.  */
+	vaddps	{rn-sae}, %zmm5, %zmm6, %zmm5
+
+	/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low.  */
+	vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
+
+	/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low.  */
+	vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
 
-L(SCALAR_MATH_CALL):
-        movl      %r12d, %r14d
-        movss     64(%rsp,%r14,4), %xmm0
-        call      atanhf@PLT
-                                # LOE rbx r14 r15 r12d r13d xmm0
+	/* Table lookups.  */
+	vmovups	TANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
+	vmovups	TANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
 
-        movss     %xmm0, 128(%rsp,%r14,4)
+	/* exponents.  */
+	vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
+	vgetexpps {sae}, %zmm7, %zmm15
 
-/* Process special inputs in loop */
-        jmp       L(SPECIAL_VALUES_LOOP)
-                                # LOE rbx r15 r12d r13d
+
+	/* Same for the Ym side.  */
+	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
+	vgetexpps {sae}, %zmm12, %zmm14
+
+
+	/* Prepare table index.  */
+	vpsrld	$18, %zmm7, %zmm3
+	vpsrld	$18, %zmm12, %zmm2
+	vmovups	TANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
+	vmovups	TANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
+
+	vmovaps	%zmm3, %zmm5
+	vpermi2ps %zmm13, %zmm10, %zmm3
+	vpermt2ps %zmm13, %zmm2, %zmm10
+	vpermi2ps %zmm7, %zmm11, %zmm5
+	vpermt2ps %zmm7, %zmm2, %zmm11
+	/* Km-Kp.  */
+	vsubps	{rn-sae}, %zmm15, %zmm14, %zmm1
+	vsubps	{rn-sae}, %zmm3, %zmm10, %zmm7
+
+	/* K*L2H + Th.  */
+	vmovups	TANHF_DATA(L2H)(%rip), %zmm2
+
+	/* K*L2L + Tl.  */
+	vmovups	TANHF_DATA(L2L)(%rip), %zmm3
+
+	/* table values.  */
+	vsubps	{rn-sae}, %zmm5, %zmm11, %zmm5
+	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
+	vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
+	/* polynomials.  */
+	vmovups	TANHF_DATA(poly_coeff3)(%rip), %zmm7
+	vmovups	TANHF_DATA(poly_coeff2)(%rip), %zmm10
+	vmovaps	%zmm10, %zmm14
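+	/* PolyP deliberately omits the c3 term (the fma below is left
+	   commented out), trading a little accuracy for speed.  */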
+	// vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
+	vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
+	vmovups	TANHF_DATA(poly_coeff1)(%rip), %zmm12
+	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
+	vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
+	vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
+	vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
+
+	/* (K*L2L + Tl) + Rp*PolyP.  */
+	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
+
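+	/* Build Half (0x3f000000) as poly_coeff1 & One, then fold in the
+	   sign of x: zmm12 = Half | (x ^ |x|).  */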
+	vandps	%zmm12, %zmm4, %zmm12
+	vpternlogq $246, %zmm0, %zmm6, %zmm12
+
+	/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM.  */
+	vfnmadd213ps {rn-sae}, %zmm10, %zmm8, %zmm14
+	vaddps	{rn-sae}, %zmm14, %zmm5, %zmm8
+
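+	/* |x| >= 1 or NaN (predicate 21 == NLT_UQ): use the callout.  */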
+	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
+	kmovw	%k0, %edx
+	testl	%edx, %edx
+
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm0
+
+	ret
+
+	/* Branch to process special inputs.  */
+L(SPECIAL_VALUES_BRANCH):
+	pushq	%rbp
+	/* Need callee-saved registers to preserve state across atanhf
+	   calls.  */
+	pushq	%r13
+	pushq	%r12
+	movq	%rsp, %rbp
+
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
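+	/* Save the computed results at (%rsp) and the original input at
+	   64(%rsp) for the scalar loop below.  */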
+	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm1
+	vmovaps	%zmm1, (%rsp)
+	vmovaps	%zmm0, 64(%rsp)
+
+	vzeroupper
+
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call.  */
+	movl	%edx, %r13d
+L(SPECIAL_VALUES_LOOP):
+	/* Use r12 as the index of the special value; it is preserved across
+	   calls to atanhf. We technically don't need a callee-saved register
+	   here, as the offset from rsp is always in [0, 60], so rsp could be
+	   restored by realigning to 64. Essentially the tradeoff is one extra
+	   save/restore vs. two extra instructions in the loop.  */
+	xorl	%r12d, %r12d
+	tzcntl	%r13d, %r12d
+
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %r12, 4), %xmm0
+	call	atanhf@PLT
+
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee-save restoration.  */
+	movss	%xmm0, (%rsp, %r12, 4)
+
+	blsr	%r13d, %r13d
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+	/* All results have been written back to (%rsp).  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp.  */
+	movq	%rbp, %rsp
+	/* Restore callee save registers.  */
+	popq	%r12
+	popq	%r13
+	popq	%rbp
+	ret
 END(_ZGVeN16v_atanhf_skx)
 
-        .section .rodata, "a"
-        .align 64
+	.section .rodata, "a"
+	.align	64
 
 #ifdef __svml_satanh_data_internal_avx512_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-        __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
-        __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
-        __declspec(align(64)) VUINT32 One[16][1];
-        __declspec(align(64)) VUINT32 AbsMask[16][1];
-        __declspec(align(64)) VUINT32 AddB5[16][1];
-        __declspec(align(64)) VUINT32 RcpBitMask[16][1];
-        __declspec(align(64)) VUINT32 poly_coeff3[16][1];
-        __declspec(align(64)) VUINT32 poly_coeff2[16][1];
-        __declspec(align(64)) VUINT32 poly_coeff1[16][1];
-        __declspec(align(64)) VUINT32 poly_coeff0[16][1];
-        __declspec(align(64)) VUINT32 Half[16][1];
-        __declspec(align(64)) VUINT32 L2H[16][1];
-        __declspec(align(64)) VUINT32 L2L[16][1];
-    } __svml_satanh_data_internal_avx512;
+	typedef	unsigned int VUINT32;
+	typedef	struct{
+	__declspec (align(64))VUINT32 AbsMask[16][1];
+	__declspec (align(64))VUINT32 One[16][1];
+	__declspec (align(64))VUINT32 AddB5[16][1];
+	__declspec (align(64))VUINT32 RcpBitMask[16][1];
+	__declspec (align(64))VUINT32 Log_tbl_L_lo[16][1];
+	__declspec (align(64))VUINT32 Log_tbl_L_hi[16][1];
+	__declspec (align(64))VUINT32 Log_tbl_H_lo[16][1];
+	__declspec (align(64))VUINT32 Log_tbl_H_hi[16][1];
+	__declspec (align(64))VUINT32 L2H[16][1];
+	__declspec (align(64))VUINT32 L2L[16][1];
+	__declspec (align(64))VUINT32 poly_coeff3[16][1];
+	__declspec (align(64))VUINT32 poly_coeff2[16][1];
+	__declspec (align(64))VUINT32 poly_coeff1[16][1];
+	}__svml_satanh_data_internal_avx512;
 #endif
 __svml_satanh_data_internal_avx512:
-        /*== Log_tbl_H ==*/
-        .long 0x00000000
-        .long 0x3cfc0000
-        .long 0x3d780000
-        .long 0x3db78000
-        .long 0x3df10000
-        .long 0x3e14c000
-        .long 0x3e300000
-        .long 0x3e4a8000
-        .long 0x3e648000
-        .long 0x3e7dc000
-        .long 0x3e8b4000
-        .long 0x3e974000
-        .long 0x3ea30000
-        .long 0x3eae8000
-        .long 0x3eb9c000
-        .long 0x3ec4e000
-        .long 0x3ecfa000
-        .long 0x3eda2000
-        .long 0x3ee48000
-        .long 0x3eeea000
-        .long 0x3ef8a000
-        .long 0x3f013000
-        .long 0x3f05f000
-        .long 0x3f0aa000
-        .long 0x3f0f4000
-        .long 0x3f13d000
-        .long 0x3f184000
-        .long 0x3f1ca000
-        .long 0x3f20f000
-        .long 0x3f252000
-        .long 0x3f295000
-        .long 0x3f2d7000
-        /*== Log_tbl_L ==*/
-        .align 64
-        .long 0x00000000
-        .long 0x3726c39e
-        .long 0x38a30c01
-        .long 0x37528ae5
-        .long 0x38e0edc5
-        .long 0xb8ab41f8
-        .long 0xb7cf8f58
-        .long 0x3896a73d
-        .long 0xb5838656
-        .long 0x380c36af
-        .long 0xb8235454
-        .long 0x3862bae1
-        .long 0x38c5e10e
-        .long 0x38dedfac
-        .long 0x38ebfb5e
-        .long 0xb8e63c9f
-        .long 0xb85c1340
-        .long 0x38777bcd
-        .long 0xb6038656
-        .long 0x37d40984
-        .long 0xb8b85028
-        .long 0xb8ad5a5a
-        .long 0x3865c84a
-        .long 0x38c3d2f5
-        .long 0x383ebce1
-        .long 0xb8a1ed76
-        .long 0xb7a332c4
-        .long 0xb779654f
-        .long 0xb8602f73
-        .long 0x38f85db0
-        .long 0x37b4996f
-        .long 0xb8bfb3ca
-        /*== One ==*/
-        .align 64
-        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-        /*== AbsMask ==*/
-        .align 64
-        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-        /*== AddB5 ==*/
-        .align 64
-        .long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
-        /*== RcpBitMask ==*/
-        .align 64
-        .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
-        /*== poly_coeff3 ==*/
-        .align 64
-        .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
-        /*== poly_coeff2 ==*/
-        .align 64
-        .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
-        /*== poly_coeff1 ==*/
-        .align 64
-        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
-        /*== poly_coeff0 ==*/
-        .align 64
-        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-        /*== Half ==*/
-        .align 64
-        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
-        /*== L2H = log(2)_high ==*/
-        .align 64
-        .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
-        /*== L2L = log(2)_low ==*/
-        .align 64
-        .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
-        .align 64
-        .type	__svml_satanh_data_internal_avx512,@object
-        .size	__svml_satanh_data_internal_avx512,.-__svml_satanh_data_internal_avx512
+	/* AbsMask.  */
+	.align	64
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	/* One.  */
+	.align	64
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* AddB5.  */
+	.align	64
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
+	/* RcpBitMask.  */
+	.align	64
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	/* Log_tbl_L_lo.  */
+	.align	64
+	.long	0x00000000
+	.long	0x3726c39e
+	.long	0x38a30c01
+	.long	0x37528ae5
+	.long	0x38e0edc5
+	.long	0xb8ab41f8
+	.long	0xb7cf8f58
+	.long	0x3896a73d
+	.long	0xb5838656
+	.long	0x380c36af
+	.long	0xb8235454
+	.long	0x3862bae1
+	.long	0x38c5e10e
+	.long	0x38dedfac
+	.long	0x38ebfb5e
+	.long	0xb8e63c9f
+	/* Log_tbl_L_hi.  */
+	.align	64
+	.long	0xb85c1340
+	.long	0x38777bcd
+	.long	0xb6038656
+	.long	0x37d40984
+	.long	0xb8b85028
+	.long	0xb8ad5a5a
+	.long	0x3865c84a
+	.long	0x38c3d2f5
+	.long	0x383ebce1
+	.long	0xb8a1ed76
+	.long	0xb7a332c4
+	.long	0xb779654f
+	.long	0xb8602f73
+	.long	0x38f85db0
+	.long	0x37b4996f
+	.long	0xb8bfb3ca
+	/* Log_tbl_H_lo.  */
+	.align	64
+	.long	0x00000000
+	.long	0x3cfc0000
+	.long	0x3d780000
+	.long	0x3db78000
+	.long	0x3df10000
+	.long	0x3e14c000
+	.long	0x3e300000
+	.long	0x3e4a8000
+	.long	0x3e648000
+	.long	0x3e7dc000
+	.long	0x3e8b4000
+	.long	0x3e974000
+	.long	0x3ea30000
+	.long	0x3eae8000
+	.long	0x3eb9c000
+	.long	0x3ec4e000
+	/* Log_tbl_H_hi.  */
+	.align	64
+	.long	0x3ecfa000
+	.long	0x3eda2000
+	.long	0x3ee48000
+	.long	0x3eeea000
+	.long	0x3ef8a000
+	.long	0x3f013000
+	.long	0x3f05f000
+	.long	0x3f0aa000
+	.long	0x3f0f4000
+	.long	0x3f13d000
+	.long	0x3f184000
+	.long	0x3f1ca000
+	.long	0x3f20f000
+	.long	0x3f252000
+	.long	0x3f295000
+	.long	0x3f2d7000
+	/* L2H = log(2)_high.  */
+	.align	64
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	/* L2L = log(2)_low.  */
+	.align	64
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	/* poly_coeff3.  */
+	.align	64
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	/* poly_coeff2.  */
+	.align	64
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	/* poly_coeff1.  */
+	.align	64
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.align	64
+	.type	__svml_satanh_data_internal_avx512, @object
+	.size	__svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
index 7927e01f0c..e1a8a28a3d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
@@ -28,334 +28,278 @@ 
  *   atanh(-1) = -INF
  *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
  *
- */
-
-/* Offsets for data table __svml_satanh_data_internal
- */
-#define SgnMask                       	0
-#define sOne                          	16
-#define sPoly                         	32
-#define iBrkValue                     	160
-#define iOffExpoMask                  	176
-#define sHalf                         	192
-#define sSign                         	208
-#define sTopMask12                    	224
-#define TinyRange                     	240
-#define sLn2                          	256
+*/
 
-#include <sysdep.h>
 
-        .text
-	.section .text.sse4,"ax",@progbits
-ENTRY(_ZGVbN4v_atanhf_sse4)
-        subq      $72, %rsp
-        cfi_def_cfa_offset(80)
-        movaps    %xmm0, %xmm5
-
-/* Load constants including One = 1 */
-        movups    sOne+__svml_satanh_data_internal(%rip), %xmm4
-        movaps    %xmm5, %xmm3
-
-/* Strip off the sign, so treat X as positive until right at the end */
-        movups    SgnMask+__svml_satanh_data_internal(%rip), %xmm7
-        movaps    %xmm4, %xmm8
-        andps     %xmm5, %xmm7
-        movaps    %xmm4, %xmm10
-        movups    sTopMask12+__svml_satanh_data_internal(%rip), %xmm11
-        movaps    %xmm4, %xmm14
-        movaps    %xmm11, %xmm9
+	/* Offsets for data table __svml_satanh_data_internal.  */
+#define sOne	0
+#define SgnMask	16
+#define sTopMask12	32
+#define iBrkValue	48
+#define iOffExpoMask	64
+#define sPoly	80
+#define sLn2	208
+#define TinyRange	224
 
-/*
- * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
- * the upper part UHi being <= 12 bits long. Then we have
- * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
- */
-        movaps    %xmm7, %xmm12
-
-/*
- * Check whether |X| < 1, in which case we use the main function.
- * Otherwise set the rangemask so that the callout will get used.
- * Note that this will also use the callout for NaNs since not(NaN < 1).
- */
-        movaps    %xmm7, %xmm6
-        movaps    %xmm7, %xmm2
-        cmpnltps  %xmm4, %xmm6
-        cmpltps   TinyRange+__svml_satanh_data_internal(%rip), %xmm2
-        mulps     %xmm5, %xmm3
-        subps     %xmm7, %xmm8
-        addps     %xmm7, %xmm12
-        movmskps  %xmm6, %edx
-        subps     %xmm8, %xmm10
-        addps     %xmm5, %xmm3
-        subps     %xmm7, %xmm10
-        andps     %xmm8, %xmm9
-
-/*
- * Now we feed into the log1p code, using H in place of _VARG1 and
- * later incorporating L into the reduced argument.
- * compute 1+x as high, low parts
- */
-        movaps    %xmm4, %xmm7
-
-/*
- * Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E
- * The first FMR is exact (we force R to 12 bits just in case it
- * isn't already, to make absolutely sure), and since E is ~ 2^-12,
- * the rounding error in the other one is acceptable.
- */
-        rcpps     %xmm9, %xmm15
-        subps     %xmm9, %xmm8
-        andps     %xmm11, %xmm15
 
-/*
- * Split V as well into upper 12 bits and lower part, so that we can get
- * a preliminary quotient estimate without rounding error.
- */
-        andps     %xmm12, %xmm11
-        mulps     %xmm15, %xmm9
-        addps     %xmm8, %xmm10
-        subps     %xmm11, %xmm12
-
-/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
-        mulps     %xmm15, %xmm11
-        mulps     %xmm15, %xmm10
-        subps     %xmm9, %xmm14
-        mulps     %xmm12, %xmm15
-        subps     %xmm10, %xmm14
-
-/* Compute D = E + E^2 */
-        movaps    %xmm14, %xmm13
-        movaps    %xmm4, %xmm8
-        mulps     %xmm14, %xmm13
-
-/* reduction: compute r,n */
-        movdqu    iBrkValue+__svml_satanh_data_internal(%rip), %xmm9
-        addps     %xmm13, %xmm14
+#include <sysdep.h>
+#define TANHF_DATA(x)	(x)	+	__svml_satanh_data_internal
 
-/*
- * Compute R * (VHi + VLo) * (1 + E + E^2)
- * = R *  (VHi + VLo) * (1 + D)
- * = QHi + (QHi * D + QLo + QLo * D)
- */
-        movaps    %xmm14, %xmm0
-        mulps     %xmm15, %xmm14
-        mulps     %xmm11, %xmm0
-        addps     %xmm14, %xmm15
-        movdqu    iOffExpoMask+__svml_satanh_data_internal(%rip), %xmm12
-        movaps    %xmm4, %xmm14
-
-/* Record the sign for eventual reincorporation. */
-        movups    sSign+__svml_satanh_data_internal(%rip), %xmm1
-        addps     %xmm15, %xmm0
+	.text
+	.section .text.sse4, "ax", @progbits
+ENTRY(_ZGVbN4v_atanhf_sse4)
+	movaps	%xmm0, %xmm5
+
+	/* Load constants including One = 1.  */
+	movups	TANHF_DATA(sOne)(%rip), %xmm4
+	movaps	%xmm5, %xmm3
+
+	/* Strip off the sign, so treat X as positive until right at the end.
+	 */
+	movups	TANHF_DATA(SgnMask)(%rip), %xmm1
+	movaps	%xmm4, %xmm2
+	andps	%xmm1, %xmm0
+	movaps	%xmm4, %xmm10
+	movups	TANHF_DATA(sTopMask12)(%rip), %xmm11
+	movaps	%xmm4, %xmm14
+	movaps	%xmm11, %xmm9
+
+
+	/* Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
+	   the upper part UHi being <= 12 bits long. Then we have atanh(X) = 1/2 *
+	   log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).  */
+	movaps	%xmm0, %xmm6
+	mulps	%xmm5, %xmm3
+	subps	%xmm0, %xmm2
+	addps	%xmm0, %xmm6
+	subps	%xmm2, %xmm10
+	addps	%xmm5, %xmm3
+	subps	%xmm0, %xmm10
+	andps	%xmm2, %xmm9
+
+
+	/* Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E.  The
+	   first FMR is exact (we force R to 12 bits just in case it isn't
+	   already, to make absolutely sure), and since E is ~ 2^-12, the
+	   rounding error in the other one is acceptable.  */
+	rcpps	%xmm9, %xmm7
+	subps	%xmm9, %xmm2
+	andps	%xmm11, %xmm7
+
+
+	/* Split V as well into upper 12 bits and lower part, so that we can get
+	   a preliminary quotient estimate without rounding error.  */
+
+	andps	%xmm6, %xmm11
+	mulps	%xmm7, %xmm9
+	addps	%xmm2, %xmm10
+	subps	%xmm11, %xmm6
+
+	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.
+	 */
+	mulps	%xmm7, %xmm11
+	mulps	%xmm7, %xmm10
+	subps	%xmm9, %xmm14
+	mulps	%xmm6, %xmm7
+	subps	%xmm10, %xmm14
+
+	/* Compute D = E + E^2.  */
+	movaps	%xmm14, %xmm13
+	movaps	%xmm4, %xmm8
+	mulps	%xmm14, %xmm13
+
+	/* reduction: compute r,n.  */
+	movdqu	TANHF_DATA(iBrkValue)(%rip), %xmm9
+	addps	%xmm13, %xmm14
+
+	/*
+	 * Compute R * (VHi + VLo) * (1 + E + E^2)
+	 * = R *  (VHi + VLo) * (1 + D)
+	 * = QHi + (QHi * D + QLo + QLo * D)
+	 */
+	movaps	%xmm14, %xmm2
+	mulps	%xmm7, %xmm14
+	mulps	%xmm11, %xmm2
+	addps	%xmm14, %xmm7
+	movdqu	TANHF_DATA(iOffExpoMask)(%rip), %xmm12
+	movaps	%xmm4, %xmm14
+
+	/* xmm2 = QHi * D + (QLo + QLo * D).  */
+	addps	%xmm7, %xmm2
+
+
+	/* Now finally accumulate the high and low parts of the argument to
+	   log1p, H + L, with a final compensated summation.  */
+	movaps	%xmm2, %xmm6
+	/* Record the sign for eventual reincorporation.  */
+	andnps	%xmm5, %xmm1
+	movaps	%xmm4, %xmm7
+	addps	%xmm11, %xmm6
+	maxps	%xmm6, %xmm7
+	minps	%xmm6, %xmm8
+	subps	%xmm6, %xmm11
+	movaps	%xmm7, %xmm10
+	addps	%xmm8, %xmm10
+	addps	%xmm11, %xmm2
+	subps	%xmm10, %xmm7
+	psubd	%xmm9, %xmm10
+	addps	%xmm8, %xmm7
+	pand	%xmm10, %xmm12
+	psrad	$23, %xmm10
+	cvtdq2ps %xmm10, %xmm13
+	addps	%xmm7, %xmm2
+
+	/* final reconstruction.  */
+	pslld	$23, %xmm10
+	paddd	%xmm9, %xmm12
+	psubd	%xmm10, %xmm14
+
+	/* polynomial evaluation.  */
+	subps	%xmm4, %xmm12
+	mulps	%xmm14, %xmm2
+	movups	TANHF_DATA(sPoly)(%rip), %xmm7
+	addps	%xmm12, %xmm2
+	mulps	%xmm2, %xmm7
+
+
+	/* Continue the Horner evaluation through the remaining coefficients.  */
+	addps	TANHF_DATA(sPoly + 16)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	TANHF_DATA(sPoly + 32)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	TANHF_DATA(sPoly + 48)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	TANHF_DATA(sPoly + 64)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	TANHF_DATA(sPoly + 80)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	addps	TANHF_DATA(sPoly + 96)(%rip), %xmm7
+	mulps	%xmm2, %xmm7
+	movaps	TANHF_DATA(sPoly + 112)(%rip), %xmm6
+	addps	%xmm6, %xmm7
+	mulps	%xmm2, %xmm7
+	mulps	%xmm2, %xmm7
+	mulps	TANHF_DATA(sLn2)(%rip), %xmm13
+	/* Build sHalf (0x3f000000) as sPoly[P0] & sOne
+	   (0xbf000000 & 0x3f800000).  */
+	andps	%xmm4, %xmm6
+	/* Or the sign bit in with the tiny result to handle atanh(-0)
+	   correctly.  */
+	orps	%xmm1, %xmm3
+	xorps	%xmm6, %xmm1
+
+	addps	%xmm2, %xmm7
+	addps	%xmm13, %xmm7
+	/* Finally, halve the result and reincorporate the sign.  */
+	mulps	%xmm7, %xmm1
+
+	/* Check whether |X| < 1, in which case we use the main function.
+	   Otherwise set the rangemask so that the callout will get used. Note that
+	   this will also use the callout for NaNs since not(NaN < 1).  */
+	cmpleps	%xmm0, %xmm4
+	movmskps %xmm4, %edx
+	cmpltps	TANHF_DATA(TinyRange)(%rip), %xmm0
+
+	andps	%xmm0, %xmm3
+	andnps	%xmm1, %xmm0
+	orps	%xmm3, %xmm0
+
+	testl	%edx, %edx
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+
+	/* No registers to restore on fast path.  */
+	ret
+
+	/* Branch to process special inputs.  */
+L(SPECIAL_VALUES_BRANCH):
+	subq	$56, %rsp
 
-/*
- * Now finally accumulate the high and low parts of the
- * argument to log1p, H + L, with a final compensated summation.
- */
-        movaps    %xmm0, %xmm6
-        andps     %xmm5, %xmm1
-
-/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
-        orps      %xmm1, %xmm3
-        addps     %xmm11, %xmm6
-        maxps     %xmm6, %xmm7
-        minps     %xmm6, %xmm8
-        subps     %xmm6, %xmm11
-        movaps    %xmm7, %xmm10
-        andps     %xmm2, %xmm3
-        addps     %xmm8, %xmm10
-        addps     %xmm11, %xmm0
-        subps     %xmm10, %xmm7
-        psubd     %xmm9, %xmm10
-        addps     %xmm7, %xmm8
-        pand      %xmm10, %xmm12
-        psrad     $23, %xmm10
-        cvtdq2ps  %xmm10, %xmm13
-        addps     %xmm8, %xmm0
-
-/* final reconstruction */
-        mulps     sLn2+__svml_satanh_data_internal(%rip), %xmm13
-        pslld     $23, %xmm10
-        paddd     %xmm9, %xmm12
-        psubd     %xmm10, %xmm14
-
-/* polynomial evaluation */
-        subps     %xmm4, %xmm12
-        mulps     %xmm0, %xmm14
-        movups    sPoly+112+__svml_satanh_data_internal(%rip), %xmm0
-        addps     %xmm12, %xmm14
-        mulps     %xmm14, %xmm0
-
-/* Finally, halve the result and reincorporate the sign */
-        movups    sHalf+__svml_satanh_data_internal(%rip), %xmm4
-        pxor      %xmm1, %xmm4
-        addps     sPoly+96+__svml_satanh_data_internal(%rip), %xmm0
-        mulps     %xmm14, %xmm0
-        addps     sPoly+80+__svml_satanh_data_internal(%rip), %xmm0
-        mulps     %xmm14, %xmm0
-        addps     sPoly+64+__svml_satanh_data_internal(%rip), %xmm0
-        mulps     %xmm14, %xmm0
-        addps     sPoly+48+__svml_satanh_data_internal(%rip), %xmm0
-        mulps     %xmm14, %xmm0
-        addps     sPoly+32+__svml_satanh_data_internal(%rip), %xmm0
-        mulps     %xmm14, %xmm0
-        addps     sPoly+16+__svml_satanh_data_internal(%rip), %xmm0
-        mulps     %xmm14, %xmm0
-        addps     sPoly+__svml_satanh_data_internal(%rip), %xmm0
-        mulps     %xmm14, %xmm0
-        mulps     %xmm14, %xmm0
-        addps     %xmm0, %xmm14
-        movaps    %xmm2, %xmm0
-        addps     %xmm13, %xmm14
-        mulps     %xmm14, %xmm4
-        andnps    %xmm4, %xmm0
-        orps      %xmm3, %xmm0
-        testl     %edx, %edx
-
-/* Go to special inputs processing branch */
-        jne       L(SPECIAL_VALUES_BRANCH)
-                                # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
-
-/* Restore registers
- * and exit the function
- */
-
-L(EXIT):
-        addq      $72, %rsp
-        cfi_def_cfa_offset(8)
-        ret
-        cfi_def_cfa_offset(80)
-
-/* Branch to process
- * special inputs
- */
+	/* Save the original input (xmm5) and the computed results (xmm0).  */
+	movups	%xmm5, (%rsp)
+	movups	%xmm0, 16(%rsp)
 
-L(SPECIAL_VALUES_BRANCH):
-        movups    %xmm5, 32(%rsp)
-        movups    %xmm0, 48(%rsp)
-                                # LOE rbx rbp r12 r13 r14 r15 edx
-
-        xorl      %eax, %eax
-        movq      %r12, 16(%rsp)
-        cfi_offset(12, -64)
-        movl      %eax, %r12d
-        movq      %r13, 8(%rsp)
-        cfi_offset(13, -72)
-        movl      %edx, %r13d
-        movq      %r14, (%rsp)
-        cfi_offset(14, -80)
-                                # LOE rbx rbp r15 r12d r13d
-
-/* Range mask
- * bits check
- */
-
-L(RANGEMASK_CHECK):
-        btl       %r12d, %r13d
-
-/* Call scalar math function */
-        jc        L(SCALAR_MATH_CALL)
-                                # LOE rbx rbp r15 r12d r13d
-
-/* Special inputs
- * processing loop
- */
+	movq	%r12, 32(%rsp)
+	movq	%r13, 40(%rsp)
 
+	/* edx has a bit set for each lane with a special value that must be
+	   handled by a scalar atanhf call.  */
+	movl	%edx, %r13d
 L(SPECIAL_VALUES_LOOP):
-        incl      %r12d
-        cmpl      $4, %r12d
-
-/* Check bits in range mask */
-        jl        L(RANGEMASK_CHECK)
-                                # LOE rbx rbp r15 r12d r13d
-
-        movq      16(%rsp), %r12
-        cfi_restore(12)
-        movq      8(%rsp), %r13
-        cfi_restore(13)
-        movq      (%rsp), %r14
-        cfi_restore(14)
-        movups    48(%rsp), %xmm0
-
-/* Go to exit */
-        jmp       L(EXIT)
-        cfi_offset(12, -64)
-        cfi_offset(13, -72)
-        cfi_offset(14, -80)
-                                # LOE rbx rbp r12 r13 r14 r15 xmm0
-
-/* Scalar math fucntion call
- * to process special input
- */
-
-L(SCALAR_MATH_CALL):
-        movl      %r12d, %r14d
-        movss     32(%rsp,%r14,4), %xmm0
-        call      atanhf@PLT
-                                # LOE rbx rbp r14 r15 r12d r13d xmm0
-
-        movss     %xmm0, 48(%rsp,%r14,4)
-
-/* Process special inputs in loop */
-        jmp       L(SPECIAL_VALUES_LOOP)
-                                # LOE rbx rbp r15 r12d r13d
+	/* Use r12 as the index of the special value that is preserved across
+	   the calls to atanhf. We technically don't need a callee-saved
+	   register here as the offset from rsp is always in [0, 12], so rsp
+	   could be restored by realigning to 64. Essentially the tradeoff is
+	   one extra save/restore vs. two extra instructions in the loop.  */
+	xorl	%r12d, %r12d
+	/* The xor breaks bsf's output dependency: bsf leaves the destination
+	   unmodified when the source is zero (r13d is known nonzero here).  */
+	bsfl	%r13d, %r12d
+
+	/* Scalar math function call to process special input.  */
+	movss	(%rsp, %r12, 4), %xmm0
+	call	atanhf@PLT
+	/* No good way to avoid the store-forwarding stall this will cause on
+	   return. `lfence` avoids the SF stall but at greater cost as it
+	   serializes the stack/callee-save restoration.  */
+	movss	%xmm0, 16(%rsp, %r12, 4)
+
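+	/* Clear the lowest set bit: r13d &= r13d - 1, done with lea/and since
+	   BMI cannot be assumed on this SSE4 path; loop while any special
+	   lanes remain.  */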
+	leal	-1(%r13), %eax
+	andl	%eax, %r13d
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+	/* All results have been written to 16(%rsp).  */
+	movups	16(%rsp), %xmm0
+	movq	32(%rsp), %r12
+	movq	40(%rsp), %r13
+	addq	$56, %rsp
+	ret
 END(_ZGVbN4v_atanhf_sse4)
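
For readers new to this idiom: the special-values loop above is the standard
iterate-over-set-bits pattern. A minimal C sketch of the same logic
(illustrative only, not part of the patch; handle_special_lanes, src and dst
are made-up names):

    #include <math.h>

    /* Process the lanes flagged in `mask' with the scalar atanhf, mirroring
       L(SPECIAL_VALUES_LOOP).  `src' is the saved input vector and `dst' the
       saved fast-path results.  */
    static void
    handle_special_lanes (const float *src, float *dst, unsigned int mask)
    {
      while (mask != 0)
        {
          int i = __builtin_ctz (mask);	/* bsf/tzcnt in the asm.  */
          dst[i] = atanhf (src[i]);
          mask &= mask - 1;		/* lea/and or blsr in the asm.  */
        }
    }
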
 
-        .section .rodata, "a"
-        .align 16
+	.section .rodata, "a"
+	.align	16
 
 #ifdef __svml_satanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-        __declspec(align(16)) VUINT32 SgnMask[4][1];
-        __declspec(align(16)) VUINT32 sOne[4][1];
-        __declspec(align(16)) VUINT32 sPoly[8][4][1];
-        __declspec(align(16)) VUINT32 iBrkValue[4][1];
-        __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
-        __declspec(align(16)) VUINT32 sHalf[4][1];
-        __declspec(align(16)) VUINT32 sSign[4][1];
-        __declspec(align(16)) VUINT32 sTopMask12[4][1];
-        __declspec(align(16)) VUINT32 TinyRange[4][1];
-        __declspec(align(16)) VUINT32 sLn2[4][1];
-} __svml_satanh_data_internal;
+typedef unsigned int VUINT32;
+typedef struct {
+	__declspec(align(16)) VUINT32 sOne[4][1];
+	__declspec(align(16)) VUINT32 SgnMask[4][1];
+	__declspec(align(16)) VUINT32 sTopMask12[4][1];
+	__declspec(align(16)) VUINT32 iBrkValue[4][1];
+	__declspec(align(16)) VUINT32 iOffExpoMask[4][1];
+	__declspec(align(16)) VUINT32 sPoly[8][4][1];
+	__declspec(align(16)) VUINT32 sLn2[4][1];
+	__declspec(align(16)) VUINT32 TinyRange[4][1];
+} __svml_satanh_data_internal;
 #endif
+
 __svml_satanh_data_internal:
-        /*== SgnMask ==*/
-        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-        /*== sOne = SP 1.0 ==*/
-        .align 16
-        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-        /*== sPoly[] = SP polynomial ==*/
-        .align 16
-        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /*  3.3333265781402587890625000e-01 P1 */
-        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /*  2.0007920265197753906250000e-01 P3 */
-        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /*  1.4042308926582336425781250e-01 P5 */
-        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /*  1.3820238411426544189453125e-01 P7 */
-        /*== iBrkValue = SP 2/3 ==*/
-        .align 16
-        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-        /*== iOffExpoMask = SP significand mask ==*/
-        .align 16
-        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-        /*== sHalf ==*/
-        .align 16
-        .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
-        /*== sSign ==*/
-        .align 16
-        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
-        /*== sTopMask12 ==*/
-        .align 16
-        .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
-        /*== TinyRange ==*/
-        .align 16
-        .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
-        /*== sLn2 = SP ln(2) ==*/
-        .align 16
-        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
-        .align 16
-        .type	__svml_satanh_data_internal,@object
-        .size	__svml_satanh_data_internal,.-__svml_satanh_data_internal
+	/* sOne = SP 1.0.  */
+	.align	16
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* SgnMask.  */
+	.align	16
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	/* sTopMask12.  */
+	.align	16
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	/* iBrkValue = SP 2/3.  */
+	.align	16
+	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+	/* iOffExpoMask = SP significand mask.  */
+	.align	16
+	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+
+	/* sPoly[] = SP polynomial.  */
+	.align	16
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed		/* 1.3820238411426544189453125e-01 P7.  */
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3		/* -1.5122179687023162841796875e-01 P6.  */
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12		/* 1.4042308926582336425781250e-01 P5.  */
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37		/* -1.6472326219081878662109375e-01 P4.  */
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190		/* 2.0007920265197753906250000e-01 P3.  */
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e		/* -2.5004237890243530273437500e-01 P2.  */
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94		/* 3.3333265781402587890625000e-01 P1.  */
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000		/* -5.0000000000000000000000000e-01 P0.  */
+
+	/* sLn2 = SP ln(2).  */
+	.align	16
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	/* TinyRange.  */
+	.align	16
+	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+	.align	16
+	.type	__svml_satanh_data_internal, @object
+	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
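
As context for both files: the kernel computes atanh(x) = 0.5 * log1p(2|x| /
(1 - |x|)) with the sign reapplied at the end, takes the scalar callout for
|x| >= 1 (and NaN), and a near-identity path below TinyRange (0x0C000000,
i.e. 0x1p-103f). A scalar C sketch of that structure only; atanhf_sketch is
a made-up name and the asm implements the log1p polynomial inline rather
than calling log1pf:

    #include <math.h>

    static float
    atanhf_sketch (float x)
    {
      float ax = fabsf (x);
      if (!(ax < 1.0f))		/* Also catches NaN, as in the asm.  */
        return atanhf (x);	/* Scalar callout path.  */
      if (ax < 0x1p-103f)	/* TinyRange: the asm returns roughly
				   x + x*x with the sign or'ed back in.  */
        return x;
      return copysignf (0.5f * log1pf (2.0f * ax / (1.0f - ax)), x);
    }
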
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
index e67fb5dc92..982029e648 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
@@ -30,306 +30,267 @@ 
  *
  */
 
-/* Offsets for data table __svml_satanh_data_internal
- */
-#define SgnMask                       	0
-#define sOne                          	32
-#define sPoly                         	64
-#define iBrkValue                     	320
-#define iOffExpoMask                  	352
-#define sHalf                         	384
-#define sSign                         	416
-#define sTopMask12                    	448
-#define TinyRange                     	480
-#define sLn2                          	512
+
+	/* Offsets for data table __svml_satanh_data_internal.  */
+#define SgnMask	0
+#define sOne	32
+#define sTopMask12	64
+#define TinyRange	96
+#define iBrkValue	128
+#define iOffExpoMask	160
+#define sPoly	192
+#define sLn2	448
+#define sHalf	480
 
 #include <sysdep.h>
+#define TANHF_DATA(x)	((x) + __svml_satanh_data_internal)
 
-        .text
-	.section .text.avx2,"ax",@progbits
+	.text
+	.section .text.avx2, "ax", @progbits
 ENTRY(_ZGVdN8v_atanhf_avx2)
-        pushq     %rbp
-        cfi_def_cfa_offset(16)
-        movq      %rsp, %rbp
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-        andq      $-32, %rsp
-        subq      $96, %rsp
-
-/* Load constants including One = 1 */
-        vmovups   sOne+__svml_satanh_data_internal(%rip), %ymm5
-        vmovups   sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
-        vmovaps   %ymm0, %ymm6
-
-/* Strip off the sign, so treat X as positive until right at the end */
-        vandps    SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
-        vsubps    %ymm10, %ymm5, %ymm1
-
-/*
- * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
- * the upper part UHi being <= 12 bits long. Then we have
- * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
- */
-        vaddps    %ymm10, %ymm10, %ymm14
-
-/*
- * Check whether |X| < 1, in which case we use the main function.
- * Otherwise set the rangemask so that the callout will get used.
- * Note that this will also use the callout for NaNs since not(NaN < 1).
- */
-        vcmpnlt_uqps %ymm5, %ymm10, %ymm7
-        vsubps    %ymm1, %ymm5, %ymm9
-        vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
-        vrcpps    %ymm1, %ymm11
-        vsubps    %ymm10, %ymm9, %ymm12
-        vandps    %ymm13, %ymm11, %ymm0
-
-/* No need to split sU when FMA is available */
-        vfnmadd213ps %ymm5, %ymm0, %ymm1
-        vmovaps   %ymm6, %ymm8
-        vfmadd213ps %ymm6, %ymm6, %ymm8
-        vfnmadd231ps %ymm0, %ymm12, %ymm1
-
-/*
- * Split V as well into upper 12 bits and lower part, so that we can get
- * a preliminary quotient estimate without rounding error.
- */
-        vandps    %ymm13, %ymm14, %ymm15
-        vmovmskps %ymm7, %edx
-        vsubps    %ymm15, %ymm14, %ymm7
-
-/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
-        vmulps    %ymm15, %ymm0, %ymm10
-
-/* Compute D = E + E^2 */
-        vfmadd213ps %ymm1, %ymm1, %ymm1
-
-/* Record the sign for eventual reincorporation. */
-        vandps    sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
-
-/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
-        vorps     %ymm3, %ymm8, %ymm2
-        vmulps    %ymm7, %ymm0, %ymm8
-
-/*
- * Compute R * (VHi + VLo) * (1 + E + E^2)
- * = R *  (VHi + VLo) * (1 + D)
- * = QHi + (QHi * D + QLo + QLo * D)
- */
-        vmulps    %ymm1, %ymm10, %ymm9
-        vfmadd213ps %ymm8, %ymm8, %ymm1
-        vaddps    %ymm1, %ymm9, %ymm1
-
-/* reduction: compute r,n */
-        vmovups   iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
-
-/*
- * Now finally accumulate the high and low parts of the
- * argument to log1p, H + L, with a final compensated summation.
- */
-        vaddps    %ymm1, %ymm10, %ymm12
-        vsubps    %ymm12, %ymm10, %ymm11
-
-/*
- * Now we feed into the log1p code, using H in place of _VARG1 and
- * later incorporating L into the reduced argument.
- * compute 1+x as high, low parts
- */
-        vmaxps    %ymm12, %ymm5, %ymm13
-        vminps    %ymm12, %ymm5, %ymm14
-        vaddps    %ymm11, %ymm1, %ymm0
-        vaddps    %ymm14, %ymm13, %ymm1
-        vpsubd    %ymm9, %ymm1, %ymm7
-        vsubps    %ymm1, %ymm13, %ymm15
-        vpsrad    $23, %ymm7, %ymm10
-        vpand     iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
-        vaddps    %ymm15, %ymm14, %ymm13
-        vpslld    $23, %ymm10, %ymm11
-        vpaddd    %ymm9, %ymm8, %ymm15
-        vaddps    %ymm13, %ymm0, %ymm14
-        vcvtdq2ps %ymm10, %ymm0
-        vpsubd    %ymm11, %ymm5, %ymm12
-
-/* polynomial evaluation */
-        vsubps    %ymm5, %ymm15, %ymm5
-        vmulps    %ymm14, %ymm12, %ymm1
-        vaddps    %ymm5, %ymm1, %ymm5
-        vmovups   sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
-        vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-        vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-        vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-        vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-        vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-        vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-        vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
-        vmulps    %ymm1, %ymm5, %ymm7
-        vfmadd213ps %ymm5, %ymm5, %ymm7
-
-/* final reconstruction */
-        vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
-
-/* Finally, halve the result and reincorporate the sign */
-        vxorps    sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
-        vmulps    %ymm0, %ymm3, %ymm0
-        vblendvps %ymm4, %ymm2, %ymm0, %ymm0
-        testl     %edx, %edx
-
-/* Go to special inputs processing branch */
-        jne       L(SPECIAL_VALUES_BRANCH)
-                                # LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
-
-/* Restore registers
- * and exit the function
- */
-
-L(EXIT):
-        movq      %rbp, %rsp
-        popq      %rbp
-        cfi_def_cfa(7, 8)
-        cfi_restore(6)
-        ret
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-
-/* Branch to process
- * special inputs
- */
-
+	/* Strip off the sign, so treat X as positive until right at the end.
+	 */
+	vmovaps	TANHF_DATA(SgnMask)(%rip), %ymm2
+	vandps	%ymm2, %ymm0, %ymm3
+	/* Load constants including One = 1.  */
+	vmovups	TANHF_DATA(sOne)(%rip), %ymm5
+	vsubps	%ymm3, %ymm5, %ymm1
+	vmovups	TANHF_DATA(sTopMask12)(%rip), %ymm4
+
+	vrcpps	%ymm1, %ymm7
+	vsubps	%ymm1, %ymm5, %ymm9
+	vandps	%ymm4, %ymm7, %ymm6
+	vsubps	%ymm3, %ymm9, %ymm7
+
+	/* No need to split sU when FMA is available.  */
+	vfnmadd213ps %ymm5, %ymm6, %ymm1
+	vmovaps	%ymm0, %ymm8
+	vfmadd213ps %ymm0, %ymm0, %ymm0
+	vfnmadd231ps %ymm6, %ymm7, %ymm1
+
+	/* Check whether |X| < 1, in which case we use the main function.
+	   Otherwise set the rangemask so that the callout will get used. Note that
+	   this will also use the callout for NaNs since not(NaN < 1).  */
+	vcmpnlt_uqps %ymm5, %ymm3, %ymm14
+	vcmplt_oqps TANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
+
+	/* Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two pieces,
+	   the upper part UHi being <= 12 bits long. Then we have atanh(X) = 1/2 *
+	   log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).  */
+	vaddps	%ymm3, %ymm3, %ymm3
+
+	/* Split V as well into upper 12 bits and lower part, so that we can get
+	   a preliminary quotient estimate without rounding error.  */
+	vandps	%ymm4, %ymm3, %ymm4
+	vsubps	%ymm4, %ymm3, %ymm7
+
+	/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.
+	 */
+	vmulps	%ymm4, %ymm6, %ymm4
+
+	/* Compute D = E + E^2.  */
+	vfmadd213ps %ymm1, %ymm1, %ymm1
+
+	/* Record the sign for eventual reincorporation.  */
+	vandnps	%ymm8, %ymm2, %ymm3
+
+	/* Or the sign bit in with the tiny result to handle atanh(-0)
+	   correctly.  */
+	vorps	%ymm3, %ymm0, %ymm13
+	vmulps	%ymm7, %ymm6, %ymm2
+
+	/*
+	   Compute R * (VHi + VLo) * (1 + E + E^2)
+	   = R *  (VHi + VLo) * (1 + D)
+	   = QHi + (QHi * D + QLo + QLo * D)
+	 */
+	/* If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm6;
+	   vaddps %ymm1, %ymm6, %ymm1` pair below can be replaced with
+	   `vfmadd231ps %ymm1, %ymm4, %ymm4`.  */
+	vmulps	%ymm1, %ymm4, %ymm6
+	vfmadd213ps %ymm2, %ymm2, %ymm1
+	vaddps	%ymm1, %ymm6, %ymm1
+
+	/* Now finally accumulate the high and low parts of the argument to
+	   log1p, H + L, with a final compensated summation.  */
+	vaddps	%ymm1, %ymm4, %ymm2
+
+	/* reduction: compute r,n.  */
+	vmovups	TANHF_DATA(iBrkValue)(%rip), %ymm9
+
+	/* Now we feed into the log1p code, using H in place of _VARG1 and later
+	   incorporating L into the reduced argument. compute 1+x as high, low
+	   parts.  */
+	vmaxps	%ymm2, %ymm5, %ymm0
+	vminps	%ymm2, %ymm5, %ymm6
+
+	/* Recover the rounding error of the earlier `vaddps %ymm1, %ymm4,
+	   %ymm2` for the compensated summation.  */
+	vsubps	%ymm2, %ymm4, %ymm2
+	vaddps	%ymm6, %ymm0, %ymm4
+	vpsubd	%ymm9, %ymm4, %ymm7
+	vsubps	%ymm4, %ymm0, %ymm4
+	vaddps	%ymm2, %ymm1, %ymm2
+	vmovaps	TANHF_DATA(iOffExpoMask)(%rip), %ymm1
+
+	vandps	%ymm1, %ymm7, %ymm0
+	vaddps	%ymm4, %ymm6, %ymm4
+	vandnps	%ymm7, %ymm1, %ymm6
+	vmovups	TANHF_DATA(sPoly)(%rip), %ymm1
+	vpaddd	%ymm9, %ymm0, %ymm0
+	vaddps	%ymm4, %ymm2, %ymm4
+	vpsubd	%ymm6, %ymm5, %ymm6
+
+	/* polynomial evaluation.  */
+	vsubps	%ymm5, %ymm0, %ymm2
+	vfmadd231ps %ymm4, %ymm6, %ymm2
+	vfmadd213ps TANHF_DATA(sPoly + 32)(%rip), %ymm2, %ymm1
+	vfmadd213ps TANHF_DATA(sPoly + 64)(%rip), %ymm2, %ymm1
+	vfmadd213ps TANHF_DATA(sPoly + 96)(%rip), %ymm2, %ymm1
+	vfmadd213ps TANHF_DATA(sPoly + 128)(%rip), %ymm2, %ymm1
+	vfmadd213ps TANHF_DATA(sPoly + 160)(%rip), %ymm2, %ymm1
+	vfmadd213ps TANHF_DATA(sPoly + 192)(%rip), %ymm2, %ymm1
+	vfmadd213ps TANHF_DATA(sPoly + 224)(%rip), %ymm2, %ymm1
+
+	vmulps	%ymm1, %ymm2, %ymm1
+	vfmadd213ps %ymm2, %ymm2, %ymm1
+
+	/* final reconstruction.  */
+	vpsrad	$23, %ymm7, %ymm6
+	vcvtdq2ps %ymm6, %ymm2
+	vfmadd132ps TANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
+
+	/* Finally, halve the result and reincorporate the sign.  */
+	vxorps	TANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
+	vmulps	%ymm2, %ymm3, %ymm2
+	vmovmskps %ymm14, %edx
+	testl	%edx, %edx
+
+	vblendvps %ymm15, %ymm13, %ymm2, %ymm0
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+
+	/* No registers to restore on fast path.  */
+	ret
+
+
+	/* Branch to process special inputs.  */
 L(SPECIAL_VALUES_BRANCH):
-        vmovups   %ymm6, 32(%rsp)
-        vmovups   %ymm0, 64(%rsp)
-                                # LOE rbx r12 r13 r14 r15 edx ymm0
-
-        xorl      %eax, %eax
-                                # LOE rbx r12 r13 r14 r15 eax edx
-
-        vzeroupper
-        movq      %r12, 16(%rsp)
-        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-        movl      %eax, %r12d
-        movq      %r13, 8(%rsp)
-        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-        movl      %edx, %r13d
-        movq      %r14, (%rsp)
-        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-                                # LOE rbx r15 r12d r13d
-
-/* Range mask
- * bits check
- */
-
-L(RANGEMASK_CHECK):
-        btl       %r12d, %r13d
-
-/* Call scalar math function */
-        jc        L(SCALAR_MATH_CALL)
-                                # LOE rbx r15 r12d r13d
-
-/* Special inputs
- * processing loop
- */
-
+	pushq	%rbp
+	/* Callee-saved registers are needed to preserve state across the
+	   atanhf calls.  */
+	pushq	%r12
+	pushq	%r13
+	movq	%rsp, %rbp
+
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
+
+	/* Save the already computed results (ymm0).  */
+	vmovups	%ymm0, (%rsp)
+	/* Save the original input (ymm8 has been unchanged since entry).  */
+	vmovups	%ymm8, 32(%rsp)
+
+	vzeroupper
+
+	/* edx has a bit set for each lane with a special value that must be
+	   handled by a scalar atanhf call.  */
+	movl	%edx, %r13d
 L(SPECIAL_VALUES_LOOP):
-        incl      %r12d
-        cmpl      $8, %r12d
-
-/* Check bits in range mask */
-        jl        L(RANGEMASK_CHECK)
-                                # LOE rbx r15 r12d r13d
-
-        movq      16(%rsp), %r12
-        cfi_restore(12)
-        movq      8(%rsp), %r13
-        cfi_restore(13)
-        movq      (%rsp), %r14
-        cfi_restore(14)
-        vmovups   64(%rsp), %ymm0
-
-/* Go to exit */
-        jmp       L(EXIT)
-        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
-        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
-        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
-                                # LOE rbx r12 r13 r14 r15 ymm0
-
-/* Scalar math fucntion call
- * to process special input
- */
-
-L(SCALAR_MATH_CALL):
-        movl      %r12d, %r14d
-        movss     32(%rsp,%r14,4), %xmm0
-        call      atanhf@PLT
-                                # LOE rbx r14 r15 r12d r13d xmm0
-
-        movss     %xmm0, 64(%rsp,%r14,4)
-
-/* Process special inputs in loop */
-        jmp       L(SPECIAL_VALUES_LOOP)
-                                # LOE rbx r15 r12d r13d
+	/* Use r12 as the index of the special value that is preserved across
+	   the calls to atanhf. We technically don't need a callee-saved
+	   register here as the offset from rsp is always in [0, 28], so rsp
+	   could be restored by realigning to 64. Essentially the tradeoff is
+	   one extra save/restore vs. two extra instructions in the loop.  */
+	xorl	%r12d, %r12d
+	/* The xor breaks tzcnt's false dependency on the destination register
+	   on some microarchitectures.  */
+	tzcntl	%r13d, %r12d
+
+	/* Scalar math function call to process special input.  */
+	movss	32(%rsp, %r12, 4), %xmm0
+	call	atanhf@PLT
+	/* No good way to avoid the store-forwarding stall this will cause on
+	   return. `lfence` avoids the SF stall but at greater cost as it
+	   serializes the stack/callee-save restoration.  */
+	movss	%xmm0, (%rsp, %r12, 4)
+
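+	/* blsr clears the lowest set bit (r13d &= r13d - 1) and sets ZF, so
+	   loop while any special lanes remain.  */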
+	blsr	%r13d, %r13d
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+	/* All results have been written to (%rsp).  */
+	vmovups	(%rsp), %ymm0
+	movq	%rbp, %rsp
+	popq	%r13
+	popq	%r12
+	popq	%rbp
+	ret
 END(_ZGVdN8v_atanhf_avx2)
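
The iBrkValue/iOffExpoMask dance above is the usual log1p range reduction:
subtract the bit pattern of 2/3 in integer arithmetic to peel off an exponent
n, rebuild a mantissa m in [2/3, 4/3), and hand m - 1 to the polynomial so
that log1p(h) = poly(m - 1) + n * ln(2). A one-lane C sketch of that step
(log1p_reduce is a made-up name; the compensated low part the asm folds in
with the 2^-n scaling is omitted):

    #include <stdint.h>
    #include <string.h>

    static float
    log1p_reduce (float h, int *n)
    {
      const uint32_t brk = 0x3f2aaaab;	/* iBrkValue = 2/3.  */
      const uint32_t mant = 0x007fffff;	/* iOffExpoMask.  */
      uint32_t u, d;
      float m;

      memcpy (&u, &h, sizeof (u));	/* h = 1 + x from the max/min sum.  */
      d = u - brk;			/* vpsubd.  */
      *n = (int32_t) d >> 23;		/* vpsrad $23.  */
      u = (d & mant) + brk;		/* vpand; vpaddd: m in [2/3, 4/3).  */
      memcpy (&m, &u, sizeof (m));
      return m - 1.0f;			/* Reduced argument r.  */
    }
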
 
-        .section .rodata, "a"
-        .align 32
-
+	.section .rodata, "a"
+	.align	32
 #ifdef __svml_satanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-        __declspec(align(32)) VUINT32 SgnMask[8][1];
-        __declspec(align(32)) VUINT32 sOne[8][1];
-        __declspec(align(32)) VUINT32 sPoly[8][8][1];
-        __declspec(align(32)) VUINT32 iBrkValue[8][1];
-        __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
-        __declspec(align(32)) VUINT32 sHalf[8][1];
-        __declspec(align(32)) VUINT32 sSign[8][1];
-        __declspec(align(32)) VUINT32 sTopMask12[8][1];
-        __declspec(align(32)) VUINT32 TinyRange[8][1];
-        __declspec(align(32)) VUINT32 sLn2[8][1];
-} __svml_satanh_data_internal;
+typedef unsigned int VUINT32;
+typedef struct {
+	__declspec(align(32)) VUINT32 SgnMask[8][1];
+	__declspec(align(32)) VUINT32 sOne[8][1];
+	__declspec(align(32)) VUINT32 sTopMask12[8][1];
+	__declspec(align(32)) VUINT32 TinyRange[8][1];
+	__declspec(align(32)) VUINT32 iBrkValue[8][1];
+	__declspec(align(32)) VUINT32 iOffExpoMask[8][1];
+	__declspec(align(32)) VUINT32 sPoly[8][8][1];
+	__declspec(align(32)) VUINT32 sLn2[8][1];
+	__declspec(align(32)) VUINT32 sHalf[8][1];
+} __svml_satanh_data_internal;
 #endif
 __svml_satanh_data_internal:
-        /*== SgnMask ==*/
-        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-        /*== sOne = SP 1.0 ==*/
-        .align 32
-        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-        /*== sPoly[] = SP polynomial ==*/
-        .align 32
-        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-        .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /*  3.3333265781402587890625000e-01 P1 */
-        .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
-        .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /*  2.0007920265197753906250000e-01 P3 */
-        .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
-        .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /*  1.4042308926582336425781250e-01 P5 */
-        .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
-        .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /*  1.3820238411426544189453125e-01 P7 */
-        /*== iBrkValue = SP 2/3 ==*/
-        .align 32
-        .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
-        /*== iOffExpoMask = SP significand mask ==*/
-        .align 32
-        .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-        /*== sHalf ==*/
-        .align 32
-        .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
-        /*== sSign ==*/
-        .align 32
-        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
-        /*== sTopMask12 ==*/
-        .align 32
-        .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
-        /*== TinyRange ==*/
-        .align 32
-        .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
-        /*== sLn2 = SP ln(2) ==*/
-        .align 32
-        .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
-        .align 32
-        .type	__svml_satanh_data_internal,@object
-        .size	__svml_satanh_data_internal,.-__svml_satanh_data_internal
+	/* SgnMask.  */
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	/* sOne = SP 1.0.  */
+	.align	32
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* sTopMask12.  */
+	.align	32
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	.long	0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+	/* TinyRange.  */
+	.align	32
+	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+	.long	0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+	/* iBrkValue = SP 2/3.  */
+	.align	32
+	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+	.long	0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+	/* iOffExpoMask = SP significand mask.  */
+	.align	32
+	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+	.long	0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+	/* sPoly[] = SP polynomial.  */
+	.align	32
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
+	.long	0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed		/* 1.3820238411426544189453125e-01 P7.  */
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
+	.long	0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3		/* -1.5122179687023162841796875e-01 P6.  */
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
+	.long	0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12		/* 1.4042308926582336425781250e-01 P5.  */
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
+	.long	0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37		/* -1.6472326219081878662109375e-01 P4.  */
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
+	.long	0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190		/* 2.0007920265197753906250000e-01 P3.  */
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
+	.long	0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e		/* -2.5004237890243530273437500e-01 P2.  */
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
+	.long	0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94		/* 3.3333265781402587890625000e-01 P1.  */
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000		/* -5.0000000000000000000000000e-01 P0.  */
+	/* sLn2 = SP ln(2).  */
+	.align	32
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	.long	0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+	/* sHalf.  */
+	.align	32
+	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
+	.long	0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
+	.align	32
+	.type	__svml_satanh_data_internal, @object
+	.size	__svml_satanh_data_internal, .-__svml_satanh_data_internal
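
Finally, note that storing sPoly highest-degree-first lets the fmadd chain
walk the table forward from P7 down to P0. An equivalent scalar evaluation,
with the coefficients taken from the comments in the table above (log1p_poly
is a made-up name):

    /* Coefficients P7 ... P0, highest degree first as in sPoly.  */
    static const float poly[8] = {
      1.3820238411426544189453125e-01f,		/* P7 */
      -1.5122179687023162841796875e-01f,	/* P6 */
      1.4042308926582336425781250e-01f,		/* P5 */
      -1.6472326219081878662109375e-01f,	/* P4 */
      2.0007920265197753906250000e-01f,		/* P3 */
      -2.5004237890243530273437500e-01f,	/* P2 */
      3.3333265781402587890625000e-01f,		/* P1 */
      -5.0000000000000000000000000e-01f,	/* P0 */
    };

    /* log1p(r) ~= r + r*r*P(r) for the reduced argument r, matching the
       vfmadd213ps chain and the final `vmulps; vfmadd213ps` pair.  */
    static float
    log1p_poly (float r)
    {
      float p = poly[0];
      for (int i = 1; i < 8; i++)
        p = p * r + poly[i];
      return r + (r * r) * p;
    }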