[v1] x86: Optimize svml_s_atanhf_core_{sse4|avx2|avx512}.S
Checks
Context               | Check   | Description
dj/TryBot-apply_patch | success | Patch applied to master at the time it was sent
dj/TryBot-32bit       | success | Build for i686
Commit Message
No bug.
Optimizations are:
1. Reduce code size
avx512: -58 bytes
avx2: -53 bytes
sse4: -54 bytes
2. Reduce rodata size
avx512: -128 bytes
avx2: -32 bytes
sse4: -16 bytes
3. Remove register save/restores and stack adjustment from the
fast path.
4. Slightly improve instruction selection/scheduling where
possible.
5. Slightly improve register choices to remove redundant moves
and/or use registers that get smaller instruction
encodings (avx2/sse4 only).
The result is ~7% speedup for avx2/sse4 and ~17% speedup for avx512.
Results from geomean of 40 benchtest runs:
Function, New Time, Old Time, New / Old
_ZGVbN4v_atanhf, 22.492, 24.143, 0.932
_ZGVcN8v_atanhf, 23.606, 25.231, 0.936
_ZGVdN8v_atanhf, 15.768, 16.841, 0.936
_ZGVeN16v_atanhf, 11.434, 13.816, 0.828
All math and mathvec tests are passing.
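(Editorial aside, not part of the commit message: the New/Old column above reads as a ratio of geometric means over the per-run timings. How the benchtest harness actually aggregates the 40 runs is not shown here, so the sketch below uses made-up numbers and only illustrates the geomean arithmetic.)

#include <math.h>
#include <stdio.h>

/* Geometric mean of N positive timings.  */
static double
geomean (const double *t, int n)
{
  double log_sum = 0.0;
  for (int i = 0; i < n; i++)
    log_sum += log (t[i]);
  return exp (log_sum / n);
}

int
main (void)
{
  /* Made-up per-run timings; the real table comes from 40 benchtest
     runs per function.  */
  double new_runs[] = { 22.4, 22.5, 22.6 };
  double old_runs[] = { 24.1, 24.2, 24.1 };
  int n = sizeof (new_runs) / sizeof (new_runs[0]);
  double new_g = geomean (new_runs, n);
  double old_g = geomean (old_runs, n);
  printf ("New %.3f, Old %.3f, New/Old %.3f\n", new_g, old_g, new_g / old_g);
  return 0;
}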
---
.../multiarch/svml_s_atanhf16_core_avx512.S | 662 +++++++++---------
.../fpu/multiarch/svml_s_atanhf4_core_sse4.S | 576 +++++++--------
.../fpu/multiarch/svml_s_atanhf8_core_avx2.S | 545 +++++++-------
3 files changed, 834 insertions(+), 949 deletions(-)
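(Editorial aside, not from the submission: the biggest structural change to the slow paths is replacing the old per-lane bit-test loop with a walk over the set bits of the range mask using tzcnt/bsf and blsr. A C sketch of that dispatch pattern, purely for illustration; the helper name is made up:)

#include <stdio.h>

/* Stand-in for the scalar atanhf call on the saved input of LANE.  */
static float
fixup_lane (int lane)
{
  return (float) lane;
}

int
main (void)
{
  unsigned int mask = 0x15;	/* Lanes 0, 2 and 4 need the scalar call.  */
  while (mask != 0)
    {
      int lane = __builtin_ctz (mask);	/* tzcnt/bsf.  */
      printf ("fixing lane %d -> %f\n", lane, (double) fixup_lane (lane));
      mask &= mask - 1;			/* blsr.  */
    }
  return 0;
}

In the assembly versions, the mask lives in r13d and the lane index in r12d so both survive the atanhf calls.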
Comments
On Tue, Feb 1, 2022 at 1:10 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug.
>
> Optimizations are:
> 1. Reduce code size
> avx512: -58 bytes
> avx2: -53 bytes
> sse4: -54 bytes
> 2. Reduce rodata size
> avx512: -128 bytes
> avx2: -32 bytes
> sse4: -16 bytes
> 3. Remove register save/restores and stack adjustment from the
> fast path.
> 4. Slightly improve instruction selection/scheduling where
> possible.
> 5. Slightly improve register choices to remove redundant moves
> and/or use registers that get smaller instruction
> encodings (avx2/sse4 only).
>
> The result is ~7% speedup for avx2/sse4 and ~17% speedup for avx512.
>
> Results from geomean of 40 benchtest runs:
>
> Function, New Time, Old Time, New / Old
> _ZGVbN4v_atanhf, 22.492, 24.143, 0.932
> _ZGVcN8v_atanhf, 23.606, 25.231, 0.936
> _ZGVdN8v_atanhf, 15.768, 16.841, 0.936
> _ZGVeN16v_atanhf, 11.434, 13.816, 0.828
Note, the avx512 version increases ULP from 1.4 -> 2.4. That appears to be
within the documented range from:
commit 3e63b15d43ea6f61effcf92324e47e981bd7d0a8
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date: Tue Jan 18 07:07:44 2022 -0800
x86_64: Document libmvec vector functions accuracy [BZ #28766]
Document maximum 4 ulps accuracy for x86_64 libmvec functions.
This fixes BZ #28766.
and reducing the precision here accounts for about 10% of the 17% speedup.
Is this an acceptable tradeoff?
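(Editorial aside: a rough way to see what a 1.4 vs 2.4 ulp bound measures is to compare a single atanhf result against atanh computed in double precision. This is only a sketch; the actual bounds come from the glibc libm test machinery, not from code like this. Link with -lm.)

#include <math.h>
#include <stdio.h>

/* Rough ulp error of RES for atanhf (X), using double atanh as the
   reference.  Illustrative only; not the libm-test harness.  */
static double
ulp_error (float x, float res)
{
  double ref = atanh ((double) x);
  /* Spacing of floats around the reference value.  */
  double one_ulp = nextafterf ((float) ref, INFINITY) - (float) ref;
  return fabs ((double) res - ref) / one_ulp;
}

int
main (void)
{
  float x = 0.75f;
  printf ("ulp error at %g: %.2f\n", (double) x, ulp_error (x, atanhf (x)));
  return 0;
}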
>
> All math and mathvec tests are passing.
> ---
> .../multiarch/svml_s_atanhf16_core_avx512.S | 662 +++++++++---------
> .../fpu/multiarch/svml_s_atanhf4_core_sse4.S | 576 +++++++--------
> .../fpu/multiarch/svml_s_atanhf8_core_avx2.S | 545 +++++++-------
> 3 files changed, 834 insertions(+), 949 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> index f863f4f959..fbd84b2c8e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> @@ -31,363 +31,343 @@
> *
> */
>
> -/* Offsets for data table __svml_satanh_data_internal_avx512
> - */
> -#define Log_tbl_H 0
> -#define Log_tbl_L 128
> -#define One 256
> -#define AbsMask 320
> -#define AddB5 384
> -#define RcpBitMask 448
> -#define poly_coeff3 512
> -#define poly_coeff2 576
> -#define poly_coeff1 640
> -#define poly_coeff0 704
> -#define Half 768
> -#define L2H 832
> -#define L2L 896
> +
> + /* Offsets for data table __svml_satanh_data_internal_avx512. */
> +#define AbsMask 0
> +#define One 64
> +#define AddB5 128
> +#define RcpBitMask 192
> +#define Log_tbl_L_lo 256
> +#define Log_tbl_L_hi 320
> +#define Log_tbl_H_lo 384
> +#define Log_tbl_H_hi 448
> +#define L2H 512
> +#define L2L 576
> +#define poly_coeff3 640
> +#define poly_coeff2 704
> +#define poly_coeff1 768
>
> #include <sysdep.h>
> +#define TANHF_DATA(x) (x) + __svml_satanh_data_internal_avx512
>
> - .text
> - .section .text.exex512,"ax",@progbits
> + .text
> + .section .text.exex512, "ax", @progbits
> ENTRY(_ZGVeN16v_atanhf_skx)
> - pushq %rbp
> - cfi_def_cfa_offset(16)
> - movq %rsp, %rbp
> - cfi_def_cfa(6, 16)
> - cfi_offset(6, -16)
> - andq $-64, %rsp
> - subq $192, %rsp
> - vmovups One+__svml_satanh_data_internal_avx512(%rip), %zmm4
> -
> -/* round reciprocals to 1+5b mantissas */
> - vmovups AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
> - vmovups RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
> - vmovaps %zmm0, %zmm11
> - vandps AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> -
> -/* 1+y */
> - vaddps {rn-sae}, %zmm4, %zmm6, %zmm9
> -
> -/* 1-y */
> - vsubps {rn-sae}, %zmm6, %zmm4, %zmm8
> - vxorps %zmm6, %zmm11, %zmm10
> -
> -/* Yp_high */
> - vsubps {rn-sae}, %zmm4, %zmm9, %zmm2
> -
> -/* -Ym_high */
> - vsubps {rn-sae}, %zmm4, %zmm8, %zmm5
> -
> -/* RcpP ~ 1/Yp */
> - vrcp14ps %zmm9, %zmm12
> -
> -/* RcpM ~ 1/Ym */
> - vrcp14ps %zmm8, %zmm13
> -
> -/* input outside (-1, 1) ? */
> - vcmpps $21, {sae}, %zmm4, %zmm6, %k0
> - vpaddd %zmm14, %zmm12, %zmm15
> - vpaddd %zmm14, %zmm13, %zmm0
> -
> -/* Yp_low */
> - vsubps {rn-sae}, %zmm2, %zmm6, %zmm3
> - vandps %zmm1, %zmm15, %zmm7
> - vandps %zmm1, %zmm0, %zmm12
> -
> -/* Ym_low */
> - vaddps {rn-sae}, %zmm5, %zmm6, %zmm5
> -
> -/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
> - vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> -
> -/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
> - vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> - vmovups Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
> - vmovups Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
> -
> -/* exponents */
> - vgetexpps {sae}, %zmm7, %zmm15
> - vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> -
> -/* Table lookups */
> - vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
> - vgetexpps {sae}, %zmm12, %zmm14
> - vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> -
> -/* Prepare table index */
> - vpsrld $18, %zmm7, %zmm3
> - vpsrld $18, %zmm12, %zmm2
> - vmovups Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
> - vmovups poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
> -
> -/* Km-Kp */
> - vsubps {rn-sae}, %zmm15, %zmm14, %zmm1
> - kmovw %k0, %edx
> - vmovaps %zmm3, %zmm0
> - vpermi2ps %zmm13, %zmm8, %zmm3
> - vpermt2ps %zmm13, %zmm2, %zmm8
> - vpermi2ps %zmm7, %zmm6, %zmm0
> - vpermt2ps %zmm7, %zmm2, %zmm6
> - vsubps {rn-sae}, %zmm3, %zmm8, %zmm5
> -
> -/* K*L2H + Th */
> - vmovups L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
> -
> -/* K*L2L + Tl */
> - vmovups L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
> -
> -/* polynomials */
> - vmovups poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
> - vmovups poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
> -
> -/* table values */
> - vsubps {rn-sae}, %zmm0, %zmm6, %zmm0
> - vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> - vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> - vmovups poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
> - vmovaps %zmm3, %zmm2
> - vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> - vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> - vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> - vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> - vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> - vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> -
> -/* (K*L2L + Tl) + Rp*PolyP */
> - vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> - vorps Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> -
> -/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
> - vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> - vaddps {rn-sae}, %zmm3, %zmm0, %zmm4
> - vmulps {rn-sae}, %zmm9, %zmm4, %zmm0
> - testl %edx, %edx
> -
> -/* Go to special inputs processing branch */
> - jne L(SPECIAL_VALUES_BRANCH)
> - # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
> -
> -/* Restore registers
> - * and exit the function
> - */
> + vandps TANHF_DATA(AbsMask)(%rip), %zmm0, %zmm6
> + vmovups TANHF_DATA(One)(%rip), %zmm4
>
> -L(EXIT):
> - movq %rbp, %rsp
> - popq %rbp
> - cfi_def_cfa(7, 8)
> - cfi_restore(6)
> - ret
> - cfi_def_cfa(6, 16)
> - cfi_offset(6, -16)
> -
> -/* Branch to process
> - * special inputs
> - */
> + /* 1+y. */
> + vaddps {rn-sae}, %zmm4, %zmm6, %zmm9
>
> -L(SPECIAL_VALUES_BRANCH):
> - vmovups %zmm11, 64(%rsp)
> - vmovups %zmm0, 128(%rsp)
> - # LOE rbx r12 r13 r14 r15 edx zmm0
> -
> - xorl %eax, %eax
> - # LOE rbx r12 r13 r14 r15 eax edx
> -
> - vzeroupper
> - movq %r12, 16(%rsp)
> - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> - movl %eax, %r12d
> - movq %r13, 8(%rsp)
> - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> - movl %edx, %r13d
> - movq %r14, (%rsp)
> - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> - # LOE rbx r15 r12d r13d
> -
> -/* Range mask
> - * bits check
> - */
> + /* 1-y. */
> + vsubps {rn-sae}, %zmm6, %zmm4, %zmm8
>
> -L(RANGEMASK_CHECK):
> - btl %r12d, %r13d
> + /* round reciprocals to 1+5b mantissas. */
> + vmovups TANHF_DATA(AddB5)(%rip), %zmm14
> + vmovups TANHF_DATA(RcpBitMask)(%rip), %zmm1
>
> -/* Call scalar math function */
> - jc L(SCALAR_MATH_CALL)
> - # LOE rbx r15 r12d r13d
> + /* RcpP ~ 1/Yp. */
> + vrcp14ps %zmm9, %zmm12
>
> -/* Special inputs
> - * processing loop
> - */
> + /* RcpM ~ 1/Ym. */
> + vrcp14ps %zmm8, %zmm13
>
> -L(SPECIAL_VALUES_LOOP):
> - incl %r12d
> - cmpl $16, %r12d
> -
> -/* Check bits in range mask */
> - jl L(RANGEMASK_CHECK)
> - # LOE rbx r15 r12d r13d
> -
> - movq 16(%rsp), %r12
> - cfi_restore(12)
> - movq 8(%rsp), %r13
> - cfi_restore(13)
> - movq (%rsp), %r14
> - cfi_restore(14)
> - vmovups 128(%rsp), %zmm0
> -
> -/* Go to exit */
> - jmp L(EXIT)
> - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> - # LOE rbx r12 r13 r14 r15 zmm0
> -
> -/* Scalar math fucntion call
> - * to process special input
> - */
> + /* Yp_high. */
> + vsubps {rn-sae}, %zmm4, %zmm9, %zmm2
> +
> + /* -Ym_high. */
> + vsubps {rn-sae}, %zmm4, %zmm8, %zmm5
> +
> +
> + /* input outside (-1, 1) ?. */
> + vpaddd %zmm14, %zmm12, %zmm15
> + vpaddd %zmm14, %zmm13, %zmm12
> +
> + /* Yp_low. */
> + vsubps {rn-sae}, %zmm2, %zmm6, %zmm3
> + vandps %zmm1, %zmm15, %zmm7
> + vandps %zmm1, %zmm12, %zmm12
> +
> + /* Ym_low. */
> + vaddps {rn-sae}, %zmm5, %zmm6, %zmm5
> +
> + /* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low. */
> + vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> +
> + /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low. */
> + vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
>
> -L(SCALAR_MATH_CALL):
> - movl %r12d, %r14d
> - movss 64(%rsp,%r14,4), %xmm0
> - call atanhf@PLT
> - # LOE rbx r14 r15 r12d r13d xmm0
> + vmovups TANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
> + vmovups TANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
>
> - movss %xmm0, 128(%rsp,%r14,4)
> + /* exponents. */
> + vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> + vgetexpps {sae}, %zmm7, %zmm15
>
> -/* Process special inputs in loop */
> - jmp L(SPECIAL_VALUES_LOOP)
> - # LOE rbx r15 r12d r13d
> +
> + /* Table lookups. */
> + vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
> + vgetexpps {sae}, %zmm12, %zmm14
> +
> +
> + /* Prepare table index. */
> + vpsrld $18, %zmm7, %zmm3
> + vpsrld $18, %zmm12, %zmm2
> + vmovups TANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
> + vmovups TANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
> + /* Km-Kp. */
> +
> + vmovaps %zmm3, %zmm5
> + vpermi2ps %zmm13, %zmm10, %zmm3
> + vpermt2ps %zmm13, %zmm2, %zmm10
> + vpermi2ps %zmm7, %zmm11, %zmm5
> + vpermt2ps %zmm7, %zmm2, %zmm11
> + vsubps {rn-sae}, %zmm15, %zmm14, %zmm1
> + vsubps {rn-sae}, %zmm3, %zmm10, %zmm7
> +
> + /* K*L2H + Th. */
> + vmovups TANHF_DATA(L2H)(%rip), %zmm2
> +
> + /* K*L2L + Tl. */
> + vmovups TANHF_DATA(L2L)(%rip), %zmm3
> +
> + /* table values. */
> + vsubps {rn-sae}, %zmm5, %zmm11, %zmm5
> + vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
> + vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
> + /* polynomials. */
> + vmovups TANHF_DATA(poly_coeff3)(%rip), %zmm7
> + vmovups TANHF_DATA(poly_coeff2)(%rip), %zmm10
> + vmovaps %zmm10, %zmm14
> + // vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
> + vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
> + vmovups TANHF_DATA(poly_coeff1)(%rip), %zmm12
> + vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
> + vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
> + vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
> + vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
> +
> + /* (K*L2L + Tl) + Rp*PolyP. */
> + vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
> +
> + vandps %zmm12, %zmm4, %zmm12
> + vpternlogq $246, %zmm0, %zmm6, %zmm12
> +
> + /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM. */
> + vfnmadd213ps {rn-sae}, %zmm10, %zmm8, %zmm14
> + vaddps {rn-sae}, %zmm14, %zmm5, %zmm8
> +
> + vcmpps $21, {sae}, %zmm4, %zmm6, %k0
> + kmovw %k0, %edx
> + testl %edx, %edx
> +
> + /* Go to special inputs processing branch. */
> + jne L(SPECIAL_VALUES_BRANCH)
> + vmulps {rn-sae}, %zmm12, %zmm8, %zmm0
> +
> + ret
> +
> + /* Branch to process special inputs. */
> +L(SPECIAL_VALUES_BRANCH):
> + pushq %rbp
> +	/* Need to save callee-saved registers to preserve state across the
> +	   atanhf calls.  */
> + pushq %r13
> + pushq %r12
> + movq %rsp, %rbp
> +
> + /* Align stack and make room for 2x zmm vectors. */
> + andq $-64, %rsp
> + addq $-128, %rsp
> + vmulps {rn-sae}, %zmm12, %zmm8, %zmm1
> + vmovaps %zmm1, (%rsp)
> + vmovaps %zmm0, 64(%rsp)
> +
> + vzeroupper
> +
> +	/* edx has 1s where there was a special value that needs to be handled
> +	   by an atanhf call.  */
> + movl %edx, %r13d
> +L(SPECIAL_VALUES_LOOP):
> +	/* Use r12 as the index for the special value that is saved across
> +	   calls to atanhf.  We technically don't need a callee-saved register
> +	   here as the offset to rsp is always [0, 56] so we could restore rsp
> +	   by realigning to 64.  Essentially the tradeoff is 1 extra
> +	   save/restore vs 2 extra instructions in the loop.  */
> + xorl %r12d, %r12d
> + tzcntl %r13d, %r12d
> +
> +	/* Scalar math function call to process special input.  */
> + movss 64(%rsp, %r12, 4), %xmm0
> + call atanhf@PLT
> +
> +	/* No good way to avoid the store-forwarding fault this will cause on
> +	   return.  `lfence` avoids the SF fault but at greater cost as it
> +	   serializes the stack/callee-save restoration.  */
> + movss %xmm0, (%rsp, %r12, 4)
> +
> + blsr %r13d, %r13d
> + jnz L(SPECIAL_VALUES_LOOP)
> +
> +	/* All results have been written to (%rsp).  */
> + vmovaps (%rsp), %zmm0
> + /* Restore rsp. */
> + movq %rbp, %rsp
> + /* Restore callee save registers. */
> + popq %r12
> + popq %r13
> + popq %rbp
> + ret
> END(_ZGVeN16v_atanhf_skx)
>
> - .section .rodata, "a"
> - .align 64
> + .section .rodata, "a"
> + .align 64
>
> #ifdef __svml_satanh_data_internal_avx512_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> - __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
> - __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
> - __declspec(align(64)) VUINT32 One[16][1];
> - __declspec(align(64)) VUINT32 AbsMask[16][1];
> - __declspec(align(64)) VUINT32 AddB5[16][1];
> - __declspec(align(64)) VUINT32 RcpBitMask[16][1];
> - __declspec(align(64)) VUINT32 poly_coeff3[16][1];
> - __declspec(align(64)) VUINT32 poly_coeff2[16][1];
> - __declspec(align(64)) VUINT32 poly_coeff1[16][1];
> - __declspec(align(64)) VUINT32 poly_coeff0[16][1];
> - __declspec(align(64)) VUINT32 Half[16][1];
> - __declspec(align(64)) VUINT32 L2H[16][1];
> - __declspec(align(64)) VUINT32 L2L[16][1];
> - } __svml_satanh_data_internal_avx512;
> + typedef unsigned int VUINT32;
> + typedef struct{
> + __declspec (align(64))VUINT32 AbsMask[16][1];
> + __declspec (align(64))VUINT32 One[16][1];
> + __declspec (align(64))VUINT32 AddB5[16][1];
> + __declspec (align(64))VUINT32 RcpBitMask[16][1];
> + __declspec (align(64))VUINT32 Log_tbl_L_lo[16][1];
> + __declspec (align(64))VUINT32 Log_tbl_L_hi[16][1];
> + __declspec (align(64))VUINT32 Log_tbl_H_lo[16][1];
> + __declspec (align(64))VUINT32 Log_tbl_H_hi[16][1];
> + __declspec (align(64))VUINT32 L2H[16][1];
> + __declspec (align(64))VUINT32 L2L[16][1];
> + __declspec (align(64))VUINT32 poly_coeff3[16][1];
> + __declspec (align(64))VUINT32 poly_coeff2[16][1];
> + __declspec (align(64))VUINT32 poly_coeff1[16][1];
> + }__svml_satanh_data_internal_avx512;
> #endif
> __svml_satanh_data_internal_avx512:
> - /*== Log_tbl_H ==*/
> - .long 0x00000000
> - .long 0x3cfc0000
> - .long 0x3d780000
> - .long 0x3db78000
> - .long 0x3df10000
> - .long 0x3e14c000
> - .long 0x3e300000
> - .long 0x3e4a8000
> - .long 0x3e648000
> - .long 0x3e7dc000
> - .long 0x3e8b4000
> - .long 0x3e974000
> - .long 0x3ea30000
> - .long 0x3eae8000
> - .long 0x3eb9c000
> - .long 0x3ec4e000
> - .long 0x3ecfa000
> - .long 0x3eda2000
> - .long 0x3ee48000
> - .long 0x3eeea000
> - .long 0x3ef8a000
> - .long 0x3f013000
> - .long 0x3f05f000
> - .long 0x3f0aa000
> - .long 0x3f0f4000
> - .long 0x3f13d000
> - .long 0x3f184000
> - .long 0x3f1ca000
> - .long 0x3f20f000
> - .long 0x3f252000
> - .long 0x3f295000
> - .long 0x3f2d7000
> - /*== Log_tbl_L ==*/
> - .align 64
> - .long 0x00000000
> - .long 0x3726c39e
> - .long 0x38a30c01
> - .long 0x37528ae5
> - .long 0x38e0edc5
> - .long 0xb8ab41f8
> - .long 0xb7cf8f58
> - .long 0x3896a73d
> - .long 0xb5838656
> - .long 0x380c36af
> - .long 0xb8235454
> - .long 0x3862bae1
> - .long 0x38c5e10e
> - .long 0x38dedfac
> - .long 0x38ebfb5e
> - .long 0xb8e63c9f
> - .long 0xb85c1340
> - .long 0x38777bcd
> - .long 0xb6038656
> - .long 0x37d40984
> - .long 0xb8b85028
> - .long 0xb8ad5a5a
> - .long 0x3865c84a
> - .long 0x38c3d2f5
> - .long 0x383ebce1
> - .long 0xb8a1ed76
> - .long 0xb7a332c4
> - .long 0xb779654f
> - .long 0xb8602f73
> - .long 0x38f85db0
> - .long 0x37b4996f
> - .long 0xb8bfb3ca
> - /*== One ==*/
> - .align 64
> - .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> - /*== AbsMask ==*/
> - .align 64
> - .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> - /*== AddB5 ==*/
> - .align 64
> - .long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> - /*== RcpBitMask ==*/
> - .align 64
> - .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> - /*== poly_coeff3 ==*/
> - .align 64
> - .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> - /*== poly_coeff2 ==*/
> - .align 64
> - .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> - /*== poly_coeff1 ==*/
> - .align 64
> - .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> - /*== poly_coeff0 ==*/
> - .align 64
> - .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> - /*== Half ==*/
> - .align 64
> - .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> - /*== L2H = log(2)_high ==*/
> - .align 64
> - .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> - /*== L2L = log(2)_low ==*/
> - .align 64
> - .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> - .align 64
> - .type __svml_satanh_data_internal_avx512,@object
> - .size __svml_satanh_data_internal_avx512,.-__svml_satanh_data_internal_avx512
> + /* AbsMask. */
> + .align 64
> + .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> + .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> + .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> + .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> + /* One. */
> + .align 64
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> + /* AddB5. */
> + .align 64
> + .long 0x00020000, 0x00020000, 0x00020000, 0x00020000
> + .long 0x00020000, 0x00020000, 0x00020000, 0x00020000
> + .long 0x00020000, 0x00020000, 0x00020000, 0x00020000
> + .long 0x00020000, 0x00020000, 0x00020000, 0x00020000
> + /* RcpBitMask. */
> + .align 64
> + .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> + .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> + .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> + .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> + /* Log_tbl_L_lo. */
> + .align 64
> + .long 0x00000000
> + .long 0x3726c39e
> + .long 0x38a30c01
> + .long 0x37528ae5
> + .long 0x38e0edc5
> + .long 0xb8ab41f8
> + .long 0xb7cf8f58
> + .long 0x3896a73d
> + .long 0xb5838656
> + .long 0x380c36af
> + .long 0xb8235454
> + .long 0x3862bae1
> + .long 0x38c5e10e
> + .long 0x38dedfac
> + .long 0x38ebfb5e
> + .long 0xb8e63c9f
> + /* Log_tbl_L_hi. */
> + .align 64
> + .long 0xb85c1340
> + .long 0x38777bcd
> + .long 0xb6038656
> + .long 0x37d40984
> + .long 0xb8b85028
> + .long 0xb8ad5a5a
> + .long 0x3865c84a
> + .long 0x38c3d2f5
> + .long 0x383ebce1
> + .long 0xb8a1ed76
> + .long 0xb7a332c4
> + .long 0xb779654f
> + .long 0xb8602f73
> + .long 0x38f85db0
> + .long 0x37b4996f
> + .long 0xb8bfb3ca
> + /* Log_tbl_H_lo. */
> + .align 64
> + .long 0x00000000
> + .long 0x3cfc0000
> + .long 0x3d780000
> + .long 0x3db78000
> + .long 0x3df10000
> + .long 0x3e14c000
> + .long 0x3e300000
> + .long 0x3e4a8000
> + .long 0x3e648000
> + .long 0x3e7dc000
> + .long 0x3e8b4000
> + .long 0x3e974000
> + .long 0x3ea30000
> + .long 0x3eae8000
> + .long 0x3eb9c000
> + .long 0x3ec4e000
> + /* Log_tbl_H_hi. */
> + .align 64
> + .long 0x3ecfa000
> + .long 0x3eda2000
> + .long 0x3ee48000
> + .long 0x3eeea000
> + .long 0x3ef8a000
> + .long 0x3f013000
> + .long 0x3f05f000
> + .long 0x3f0aa000
> + .long 0x3f0f4000
> + .long 0x3f13d000
> + .long 0x3f184000
> + .long 0x3f1ca000
> + .long 0x3f20f000
> + .long 0x3f252000
> + .long 0x3f295000
> + .long 0x3f2d7000
> + /* L2H = log(2)_high. */
> + .align 64
> + .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> + .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> + .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> + .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> + /* L2L = log(2)_low. */
> + .align 64
> + .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> + .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> + .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> + .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> + /* poly_coeff3. */
> + .align 64
> + .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> + .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> + .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> + .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> + /* poly_coeff2. */
> + .align 64
> + .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> + .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> + .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> + .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> + /* poly_coeff1. */
> + .align 64
> + .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> + .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> + .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> + .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> + .align 64
> + .type __svml_satanh_data_internal_avx512, @object
> + .size __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> index 7927e01f0c..e1a8a28a3d 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S
> @@ -28,334 +28,278 @@
> * atanh(-1) = -INF
> * atanh(x) = NaN if |x| > 1, or if x is a NaN or INF
> *
> - */
> -
> -/* Offsets for data table __svml_satanh_data_internal
> - */
> -#define SgnMask 0
> -#define sOne 16
> -#define sPoly 32
> -#define iBrkValue 160
> -#define iOffExpoMask 176
> -#define sHalf 192
> -#define sSign 208
> -#define sTopMask12 224
> -#define TinyRange 240
> -#define sLn2 256
> +*/
>
> -#include <sysdep.h>
>
> - .text
> - .section .text.sse4,"ax",@progbits
> -ENTRY(_ZGVbN4v_atanhf_sse4)
> - subq $72, %rsp
> - cfi_def_cfa_offset(80)
> - movaps %xmm0, %xmm5
> -
> -/* Load constants including One = 1 */
> - movups sOne+__svml_satanh_data_internal(%rip), %xmm4
> - movaps %xmm5, %xmm3
> -
> -/* Strip off the sign, so treat X as positive until right at the end */
> - movups SgnMask+__svml_satanh_data_internal(%rip), %xmm7
> - movaps %xmm4, %xmm8
> - andps %xmm5, %xmm7
> - movaps %xmm4, %xmm10
> - movups sTopMask12+__svml_satanh_data_internal(%rip), %xmm11
> - movaps %xmm4, %xmm14
> - movaps %xmm11, %xmm9
> + /* Offsets for data table __svml_satanh_data_internal. */
> +#define sOne 0
> +#define SgnMask 16
> +#define sTopMask12 32
> +#define iBrkValue 48
> +#define iOffExpoMask 64
> +#define sPoly 80
> +#define sLn2 208
> +#define TinyRange 224
>
> -/*
> - * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> - * the upper part UHi being <= 12 bits long. Then we have
> - * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> - */
> - movaps %xmm7, %xmm12
> -
> -/*
> - * Check whether |X| < 1, in which case we use the main function.
> - * Otherwise set the rangemask so that the callout will get used.
> - * Note that this will also use the callout for NaNs since not(NaN < 1).
> - */
> - movaps %xmm7, %xmm6
> - movaps %xmm7, %xmm2
> - cmpnltps %xmm4, %xmm6
> - cmpltps TinyRange+__svml_satanh_data_internal(%rip), %xmm2
> - mulps %xmm5, %xmm3
> - subps %xmm7, %xmm8
> - addps %xmm7, %xmm12
> - movmskps %xmm6, %edx
> - subps %xmm8, %xmm10
> - addps %xmm5, %xmm3
> - subps %xmm7, %xmm10
> - andps %xmm8, %xmm9
> -
> -/*
> - * Now we feed into the log1p code, using H in place of _VARG1 and
> - * later incorporating L into the reduced argument.
> - * compute 1+x as high, low parts
> - */
> - movaps %xmm4, %xmm7
> -
> -/*
> - * Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E
> - * The first FMR is exact (we force R to 12 bits just in case it
> - * isn't already, to make absolutely sure), and since E is ~ 2^-12,
> - * the rounding error in the other one is acceptable.
> - */
> - rcpps %xmm9, %xmm15
> - subps %xmm9, %xmm8
> - andps %xmm11, %xmm15
>
> -/*
> - * Split V as well into upper 12 bits and lower part, so that we can get
> - * a preliminary quotient estimate without rounding error.
> - */
> - andps %xmm12, %xmm11
> - mulps %xmm15, %xmm9
> - addps %xmm8, %xmm10
> - subps %xmm11, %xmm12
> -
> -/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
> - mulps %xmm15, %xmm11
> - mulps %xmm15, %xmm10
> - subps %xmm9, %xmm14
> - mulps %xmm12, %xmm15
> - subps %xmm10, %xmm14
> -
> -/* Compute D = E + E^2 */
> - movaps %xmm14, %xmm13
> - movaps %xmm4, %xmm8
> - mulps %xmm14, %xmm13
> -
> -/* reduction: compute r,n */
> - movdqu iBrkValue+__svml_satanh_data_internal(%rip), %xmm9
> - addps %xmm13, %xmm14
> +#include <sysdep.h>
> +#define TANHF_DATA(x) (x) + __svml_satanh_data_internal
>
> -/*
> - * Compute R * (VHi + VLo) * (1 + E + E^2)
> - * = R * (VHi + VLo) * (1 + D)
> - * = QHi + (QHi * D + QLo + QLo * D)
> - */
> - movaps %xmm14, %xmm0
> - mulps %xmm15, %xmm14
> - mulps %xmm11, %xmm0
> - addps %xmm14, %xmm15
> - movdqu iOffExpoMask+__svml_satanh_data_internal(%rip), %xmm12
> - movaps %xmm4, %xmm14
> -
> -/* Record the sign for eventual reincorporation. */
> - movups sSign+__svml_satanh_data_internal(%rip), %xmm1
> - addps %xmm15, %xmm0
> + .text
> + .section .text.sse4, "ax", @progbits
> +ENTRY(_ZGVbN4v_atanhf_sse4)
> + movaps %xmm0, %xmm5
> +
> + /* Load constants including One = 1. */
> + movups TANHF_DATA(sOne)(%rip), %xmm4
> + movaps %xmm5, %xmm3
> +
> + /* Strip off the sign, so treat X as positive until right at the end.
> + */
> + movups TANHF_DATA(SgnMask)(%rip), %xmm1
> + movaps %xmm4, %xmm2
> + andps %xmm1, %xmm0
> + movaps %xmm4, %xmm10
> + movups TANHF_DATA(sTopMask12)(%rip), %xmm11
> + movaps %xmm4, %xmm14
> + movaps %xmm11, %xmm9
> +
> +
> + /* Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> + the upper part UHi being <= 12 bits long. Then we have atanh(X) = 1/2 *
> + log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)). */
> + movaps %xmm0, %xmm6
> + mulps %xmm5, %xmm3
> + subps %xmm0, %xmm2
> + addps %xmm0, %xmm6
> + subps %xmm2, %xmm10
> + addps %xmm5, %xmm3
> + subps %xmm0, %xmm10
> + andps %xmm2, %xmm9
> +
> +
> + /* Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E The first
> + FMR is exact (we force R to 12 bits just in case it isn't already, to
> + make absolutely sure), and since E is ~ 2^-12, the rounding error in the
> + other one is acceptable. */
> + rcpps %xmm9, %xmm7
> + subps %xmm9, %xmm2
> + andps %xmm11, %xmm7
> +
> +
> + /* Split V as well into upper 12 bits and lower part, so that we can get
> + a preliminary quotient estimate without rounding error. */
> +
> + andps %xmm6, %xmm11
> + mulps %xmm7, %xmm9
> + addps %xmm2, %xmm10
> + subps %xmm11, %xmm6
> +
> + /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.
> + */
> + mulps %xmm7, %xmm11
> + mulps %xmm7, %xmm10
> + subps %xmm9, %xmm14
> + mulps %xmm6, %xmm7
> + subps %xmm10, %xmm14
> +
> + /* Compute D = E + E^2. */
> + movaps %xmm14, %xmm13
> + movaps %xmm4, %xmm8
> + mulps %xmm14, %xmm13
> +
> + /* reduction: compute r,n. */
> + movdqu TANHF_DATA(iBrkValue)(%rip), %xmm9
> + addps %xmm13, %xmm14
> +
> + /*
> + * Compute R * (VHi + VLo) * (1 + E + E^2)
> + * = R * (VHi + VLo) * (1 + D)
> + * = QHi + (QHi * D + QLo + QLo * D)
> + */
> + movaps %xmm14, %xmm2
> + mulps %xmm7, %xmm14
> + mulps %xmm11, %xmm2
> + addps %xmm14, %xmm7
> + movdqu TANHF_DATA(iOffExpoMask)(%rip), %xmm12
> + movaps %xmm4, %xmm14
> +
> + /* Record the sign for eventual reincorporation. */
> + addps %xmm7, %xmm2
> +
> +
> + /* Now finally accumulate the high and low parts of the argument to
> + log1p, H + L, with a final compensated summation. */
> + movaps %xmm2, %xmm6
> + andnps %xmm5, %xmm1
> + movaps %xmm4, %xmm7
> + /* Or the sign bit in with the tiny result to handle atanh(-0)
> + correctly. */
> + addps %xmm11, %xmm6
> + maxps %xmm6, %xmm7
> + minps %xmm6, %xmm8
> + subps %xmm6, %xmm11
> + movaps %xmm7, %xmm10
> + addps %xmm8, %xmm10
> + addps %xmm11, %xmm2
> + subps %xmm10, %xmm7
> + psubd %xmm9, %xmm10
> + addps %xmm8, %xmm7
> + pand %xmm10, %xmm12
> + psrad $23, %xmm10
> + cvtdq2ps %xmm10, %xmm13
> + addps %xmm7, %xmm2
> +
> + /* final reconstruction. */
> + pslld $23, %xmm10
> + paddd %xmm9, %xmm12
> + psubd %xmm10, %xmm14
> +
> + /* polynomial evaluation. */
> + subps %xmm4, %xmm12
> + mulps %xmm14, %xmm2
> + movups TANHF_DATA(sPoly)(%rip), %xmm7
> + addps %xmm12, %xmm2
> + mulps %xmm2, %xmm7
> +
> +
> + /* Finally, halve the result and reincorporate the sign. */
> + addps TANHF_DATA(sPoly + 16)(%rip), %xmm7
> + mulps %xmm2, %xmm7
> + addps TANHF_DATA(sPoly + 32)(%rip), %xmm7
> + mulps %xmm2, %xmm7
> + addps TANHF_DATA(sPoly + 48)(%rip), %xmm7
> + mulps %xmm2, %xmm7
> + addps TANHF_DATA(sPoly + 64)(%rip), %xmm7
> + mulps %xmm2, %xmm7
> + addps TANHF_DATA(sPoly + 80)(%rip), %xmm7
> + mulps %xmm2, %xmm7
> + addps TANHF_DATA(sPoly + 96)(%rip), %xmm7
> + mulps %xmm2, %xmm7
> + movaps TANHF_DATA(sPoly + 112)(%rip), %xmm6
> + addps %xmm6, %xmm7
> + mulps %xmm2, %xmm7
> + mulps %xmm2, %xmm7
> + mulps TANHF_DATA(sLn2)(%rip), %xmm13
> + /* We can build `sHalf` with `sPoly & sOne`. */
> + andps %xmm4, %xmm6
> + orps %xmm1, %xmm3
> + xorps %xmm6, %xmm1
> +
> + addps %xmm2, %xmm7
> + addps %xmm13, %xmm7
> + mulps %xmm7, %xmm1
> +
> + /* Check whether |X| < 1, in which case we use the main function.
> + Otherwise set the rangemask so that the callout will get used. Note that
> + this will also use the callout for NaNs since not(NaN < 1). */
> + cmpleps %xmm0, %xmm4
> + movmskps %xmm4, %edx
> + cmpltps TANHF_DATA(TinyRange)(%rip), %xmm0
> +
> + andps %xmm0, %xmm3
> + andnps %xmm1, %xmm0
> + orps %xmm3, %xmm0
> +
> + testl %edx, %edx
> + /* Go to special inputs processing branch. */
> + jne L(SPECIAL_VALUES_BRANCH)
> +
> + /* No registers to restore on fast path. */
> + ret
> +
> + /* Branch to process special inputs. */
> +L(SPECIAL_VALUES_BRANCH):
> + subq $56, %rsp
>
> -/*
> - * Now finally accumulate the high and low parts of the
> - * argument to log1p, H + L, with a final compensated summation.
> - */
> - movaps %xmm0, %xmm6
> - andps %xmm5, %xmm1
> -
> -/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
> - orps %xmm1, %xmm3
> - addps %xmm11, %xmm6
> - maxps %xmm6, %xmm7
> - minps %xmm6, %xmm8
> - subps %xmm6, %xmm11
> - movaps %xmm7, %xmm10
> - andps %xmm2, %xmm3
> - addps %xmm8, %xmm10
> - addps %xmm11, %xmm0
> - subps %xmm10, %xmm7
> - psubd %xmm9, %xmm10
> - addps %xmm7, %xmm8
> - pand %xmm10, %xmm12
> - psrad $23, %xmm10
> - cvtdq2ps %xmm10, %xmm13
> - addps %xmm8, %xmm0
> -
> -/* final reconstruction */
> - mulps sLn2+__svml_satanh_data_internal(%rip), %xmm13
> - pslld $23, %xmm10
> - paddd %xmm9, %xmm12
> - psubd %xmm10, %xmm14
> -
> -/* polynomial evaluation */
> - subps %xmm4, %xmm12
> - mulps %xmm0, %xmm14
> - movups sPoly+112+__svml_satanh_data_internal(%rip), %xmm0
> - addps %xmm12, %xmm14
> - mulps %xmm14, %xmm0
> -
> -/* Finally, halve the result and reincorporate the sign */
> - movups sHalf+__svml_satanh_data_internal(%rip), %xmm4
> - pxor %xmm1, %xmm4
> - addps sPoly+96+__svml_satanh_data_internal(%rip), %xmm0
> - mulps %xmm14, %xmm0
> - addps sPoly+80+__svml_satanh_data_internal(%rip), %xmm0
> - mulps %xmm14, %xmm0
> - addps sPoly+64+__svml_satanh_data_internal(%rip), %xmm0
> - mulps %xmm14, %xmm0
> - addps sPoly+48+__svml_satanh_data_internal(%rip), %xmm0
> - mulps %xmm14, %xmm0
> - addps sPoly+32+__svml_satanh_data_internal(%rip), %xmm0
> - mulps %xmm14, %xmm0
> - addps sPoly+16+__svml_satanh_data_internal(%rip), %xmm0
> - mulps %xmm14, %xmm0
> - addps sPoly+__svml_satanh_data_internal(%rip), %xmm0
> - mulps %xmm14, %xmm0
> - mulps %xmm14, %xmm0
> - addps %xmm0, %xmm14
> - movaps %xmm2, %xmm0
> - addps %xmm13, %xmm14
> - mulps %xmm14, %xmm4
> - andnps %xmm4, %xmm0
> - orps %xmm3, %xmm0
> - testl %edx, %edx
> -
> -/* Go to special inputs processing branch */
> - jne L(SPECIAL_VALUES_BRANCH)
> - # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
> -
> -/* Restore registers
> - * and exit the function
> - */
> -
> -L(EXIT):
> - addq $72, %rsp
> - cfi_def_cfa_offset(8)
> - ret
> - cfi_def_cfa_offset(80)
> -
> -/* Branch to process
> - * special inputs
> - */
> + movups %xmm5, (%rsp)
> + movups %xmm0, 16(%rsp)
>
> -L(SPECIAL_VALUES_BRANCH):
> - movups %xmm5, 32(%rsp)
> - movups %xmm0, 48(%rsp)
> - # LOE rbx rbp r12 r13 r14 r15 edx
> -
> - xorl %eax, %eax
> - movq %r12, 16(%rsp)
> - cfi_offset(12, -64)
> - movl %eax, %r12d
> - movq %r13, 8(%rsp)
> - cfi_offset(13, -72)
> - movl %edx, %r13d
> - movq %r14, (%rsp)
> - cfi_offset(14, -80)
> - # LOE rbx rbp r15 r12d r13d
> -
> -/* Range mask
> - * bits check
> - */
> -
> -L(RANGEMASK_CHECK):
> - btl %r12d, %r13d
> -
> -/* Call scalar math function */
> - jc L(SCALAR_MATH_CALL)
> - # LOE rbx rbp r15 r12d r13d
> -
> -/* Special inputs
> - * processing loop
> - */
> + movq %r12, 32(%rsp)
> + movq %r13, 40(%rsp)
>
> +	/* edx has 1s where there was a special value that needs to be handled
> +	   by an atanhf call.  */
> + movl %edx, %r13d
> L(SPECIAL_VALUES_LOOP):
> - incl %r12d
> - cmpl $4, %r12d
> -
> -/* Check bits in range mask */
> - jl L(RANGEMASK_CHECK)
> - # LOE rbx rbp r15 r12d r13d
> -
> - movq 16(%rsp), %r12
> - cfi_restore(12)
> - movq 8(%rsp), %r13
> - cfi_restore(13)
> - movq (%rsp), %r14
> - cfi_restore(14)
> - movups 48(%rsp), %xmm0
> -
> -/* Go to exit */
> - jmp L(EXIT)
> - cfi_offset(12, -64)
> - cfi_offset(13, -72)
> - cfi_offset(14, -80)
> - # LOE rbx rbp r12 r13 r14 r15 xmm0
> -
> -/* Scalar math fucntion call
> - * to process special input
> - */
> -
> -L(SCALAR_MATH_CALL):
> - movl %r12d, %r14d
> - movss 32(%rsp,%r14,4), %xmm0
> - call atanhf@PLT
> - # LOE rbx rbp r14 r15 r12d r13d xmm0
> -
> - movss %xmm0, 48(%rsp,%r14,4)
> -
> -/* Process special inputs in loop */
> - jmp L(SPECIAL_VALUES_LOOP)
> - # LOE rbx rbp r15 r12d r13d
> +	/* Use r12 as the index for the special value that is saved across
> +	   calls to atanhf.  We technically don't need a callee-saved register
> +	   here as the offset to rsp is always [0, 12] so we could restore rsp
> +	   by realigning to 64.  Essentially the tradeoff is 1 extra
> +	   save/restore vs 2 extra instructions in the loop.  */
> + xorl %r12d, %r12d
> + bsfl %r13d, %r12d
> +
> +	/* Scalar math function call to process special input.  */
> + movss (%rsp, %r12, 4), %xmm0
> + call atanhf@PLT
> +	/* No good way to avoid the store-forwarding fault this will cause on
> +	   return.  `lfence` avoids the SF fault but at greater cost as it
> +	   serializes the stack/callee-save restoration.  */
> + movss %xmm0, 16(%rsp, %r12, 4)
> +
> + leal -1(%r13), %eax
> + andl %eax, %r13d
> + jnz L(SPECIAL_VALUES_LOOP)
> +
> + /* All results have been written to 16(%rsp). */
> +	movups	16(%rsp), %xmm0
> + movq 32(%rsp), %r12
> + movq 40(%rsp), %r13
> + addq $56, %rsp
> + ret
> END(_ZGVbN4v_atanhf_sse4)
>
> - .section .rodata, "a"
> - .align 16
> + .section .rodata, "a"
> + .align 16
>
> #ifdef __svml_satanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> - __declspec(align(16)) VUINT32 SgnMask[4][1];
> - __declspec(align(16)) VUINT32 sOne[4][1];
> - __declspec(align(16)) VUINT32 sPoly[8][4][1];
> - __declspec(align(16)) VUINT32 iBrkValue[4][1];
> - __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
> - __declspec(align(16)) VUINT32 sHalf[4][1];
> - __declspec(align(16)) VUINT32 sSign[4][1];
> - __declspec(align(16)) VUINT32 sTopMask12[4][1];
> - __declspec(align(16)) VUINT32 TinyRange[4][1];
> - __declspec(align(16)) VUINT32 sLn2[4][1];
> -} __svml_satanh_data_internal;
> + typedef unsigned int VUINT32;
> + typedef struct{
> + __declspec (align(16))VUINT32 sOne[4][1];
> + __declspec (align(16))VUINT32 SgnMask[4][1];
> + __declspec (align(16))VUINT32 sTopMask12[4][1];
> + __declspec (align(16))VUINT32 iBrkValue[4][1];
> + __declspec (align(16))VUINT32 iOffExpoMask[4][1];
> + __declspec (align(16))VUINT32 sPoly[8][4][1];
> + __declspec (align(16))VUINT32 sLn2[4][1];
> + __declspec (align(16))VUINT32 TinyRange[4][1];
> + }__svml_satanh_data_internal;
> #endif
> +
> __svml_satanh_data_internal:
> - /*== SgnMask ==*/
> - .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> - /*== sOne = SP 1.0 ==*/
> - .align 16
> - .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> - /*== sPoly[] = SP polynomial ==*/
> - .align 16
> - .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> - .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> - .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> - .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> - .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> - .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> - .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> - .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> - /*== iBrkValue = SP 2/3 ==*/
> - .align 16
> - .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> - /*== iOffExpoMask = SP significand mask ==*/
> - .align 16
> - .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> - /*== sHalf ==*/
> - .align 16
> - .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> - /*== sSign ==*/
> - .align 16
> - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
> - /*== sTopMask12 ==*/
> - .align 16
> - .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> - /*== TinyRange ==*/
> - .align 16
> - .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> - /*== sLn2 = SP ln(2) ==*/
> - .align 16
> - .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> - .align 16
> - .type __svml_satanh_data_internal,@object
> - .size __svml_satanh_data_internal,.-__svml_satanh_data_internal
> + /* sOne = SP 1.0. */
> + .align 16
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> + /* SgnMask. */
> + .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> + /* sTopMask12. */
> + .align 16
> + .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> + /* iBrkValue = SP 2/3. */
> + .align 16
> + .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> + /* iOffExpoMask = SP significand mask. */
> + .align 16
> + .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> +
> + /* sPoly[] = SP polynomial. */
> + .align 16
> + .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7. */
> + .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6. */
> + .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5. */
> + .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4. */
> + .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3. */
> + .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2. */
> + .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1. */
> + .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0. */
> +
> + /* sLn2 = SP ln(2). */
> + .align 16
> + .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> + /* TinyRange. */
> + .align 16
> + .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> + .align 16
> + .type __svml_satanh_data_internal, @object
> + .size __svml_satanh_data_internal, .-__svml_satanh_data_internal
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> index e67fb5dc92..982029e648 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> @@ -30,306 +30,267 @@
> *
> */
>
> -/* Offsets for data table __svml_satanh_data_internal
> - */
> -#define SgnMask 0
> -#define sOne 32
> -#define sPoly 64
> -#define iBrkValue 320
> -#define iOffExpoMask 352
> -#define sHalf 384
> -#define sSign 416
> -#define sTopMask12 448
> -#define TinyRange 480
> -#define sLn2 512
> +
> + /* Offsets for data table __svml_satanh_data_internal. */
> +#define SgnMask 0
> +#define sOne 32
> +#define sTopMask12 64
> +#define TinyRange 96
> +#define iBrkValue 128
> +#define iOffExpoMask 160
> +#define sPoly 192
> +#define sLn2 448
> +#define sHalf 480
>
> #include <sysdep.h>
> +#define TANHF_DATA(x) (x) + __svml_satanh_data_internal
>
> - .text
> - .section .text.avx2,"ax",@progbits
> + .text
> + .section .text.avx2, "ax", @progbits
> ENTRY(_ZGVdN8v_atanhf_avx2)
> - pushq %rbp
> - cfi_def_cfa_offset(16)
> - movq %rsp, %rbp
> - cfi_def_cfa(6, 16)
> - cfi_offset(6, -16)
> - andq $-32, %rsp
> - subq $96, %rsp
> -
> -/* Load constants including One = 1 */
> - vmovups sOne+__svml_satanh_data_internal(%rip), %ymm5
> - vmovups sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
> - vmovaps %ymm0, %ymm6
> -
> -/* Strip off the sign, so treat X as positive until right at the end */
> - vandps SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
> - vsubps %ymm10, %ymm5, %ymm1
> -
> -/*
> - * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> - * the upper part UHi being <= 12 bits long. Then we have
> - * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
> - */
> - vaddps %ymm10, %ymm10, %ymm14
> -
> -/*
> - * Check whether |X| < 1, in which case we use the main function.
> - * Otherwise set the rangemask so that the callout will get used.
> - * Note that this will also use the callout for NaNs since not(NaN < 1).
> - */
> - vcmpnlt_uqps %ymm5, %ymm10, %ymm7
> - vsubps %ymm1, %ymm5, %ymm9
> - vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
> - vrcpps %ymm1, %ymm11
> - vsubps %ymm10, %ymm9, %ymm12
> - vandps %ymm13, %ymm11, %ymm0
> -
> -/* No need to split sU when FMA is available */
> - vfnmadd213ps %ymm5, %ymm0, %ymm1
> - vmovaps %ymm6, %ymm8
> - vfmadd213ps %ymm6, %ymm6, %ymm8
> - vfnmadd231ps %ymm0, %ymm12, %ymm1
> -
> -/*
> - * Split V as well into upper 12 bits and lower part, so that we can get
> - * a preliminary quotient estimate without rounding error.
> - */
> - vandps %ymm13, %ymm14, %ymm15
> - vmovmskps %ymm7, %edx
> - vsubps %ymm15, %ymm14, %ymm7
> -
> -/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
> - vmulps %ymm15, %ymm0, %ymm10
> -
> -/* Compute D = E + E^2 */
> - vfmadd213ps %ymm1, %ymm1, %ymm1
> -
> -/* Record the sign for eventual reincorporation. */
> - vandps sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
> -
> -/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
> - vorps %ymm3, %ymm8, %ymm2
> - vmulps %ymm7, %ymm0, %ymm8
> -
> -/*
> - * Compute R * (VHi + VLo) * (1 + E + E^2)
> - * = R * (VHi + VLo) * (1 + D)
> - * = QHi + (QHi * D + QLo + QLo * D)
> - */
> - vmulps %ymm1, %ymm10, %ymm9
> - vfmadd213ps %ymm8, %ymm8, %ymm1
> - vaddps %ymm1, %ymm9, %ymm1
> -
> -/* reduction: compute r,n */
> - vmovups iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
> -
> -/*
> - * Now finally accumulate the high and low parts of the
> - * argument to log1p, H + L, with a final compensated summation.
> - */
> - vaddps %ymm1, %ymm10, %ymm12
> - vsubps %ymm12, %ymm10, %ymm11
> -
> -/*
> - * Now we feed into the log1p code, using H in place of _VARG1 and
> - * later incorporating L into the reduced argument.
> - * compute 1+x as high, low parts
> - */
> - vmaxps %ymm12, %ymm5, %ymm13
> - vminps %ymm12, %ymm5, %ymm14
> - vaddps %ymm11, %ymm1, %ymm0
> - vaddps %ymm14, %ymm13, %ymm1
> - vpsubd %ymm9, %ymm1, %ymm7
> - vsubps %ymm1, %ymm13, %ymm15
> - vpsrad $23, %ymm7, %ymm10
> - vpand iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
> - vaddps %ymm15, %ymm14, %ymm13
> - vpslld $23, %ymm10, %ymm11
> - vpaddd %ymm9, %ymm8, %ymm15
> - vaddps %ymm13, %ymm0, %ymm14
> - vcvtdq2ps %ymm10, %ymm0
> - vpsubd %ymm11, %ymm5, %ymm12
> -
> -/* polynomial evaluation */
> - vsubps %ymm5, %ymm15, %ymm5
> - vmulps %ymm14, %ymm12, %ymm1
> - vaddps %ymm5, %ymm1, %ymm5
> - vmovups sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
> - vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> - vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> - vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> - vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> - vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> - vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> - vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
> - vmulps %ymm1, %ymm5, %ymm7
> - vfmadd213ps %ymm5, %ymm5, %ymm7
> -
> -/* final reconstruction */
> - vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
> -
> -/* Finally, halve the result and reincorporate the sign */
> - vxorps sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
> - vmulps %ymm0, %ymm3, %ymm0
> - vblendvps %ymm4, %ymm2, %ymm0, %ymm0
> - testl %edx, %edx
> -
> -/* Go to special inputs processing branch */
> - jne L(SPECIAL_VALUES_BRANCH)
> - # LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
> -
> -/* Restore registers
> - * and exit the function
> - */
> -
> -L(EXIT):
> - movq %rbp, %rsp
> - popq %rbp
> - cfi_def_cfa(7, 8)
> - cfi_restore(6)
> - ret
> - cfi_def_cfa(6, 16)
> - cfi_offset(6, -16)
> -
> -/* Branch to process
> - * special inputs
> - */
> -
> + /* Strip off the sign, so treat X as positive until right at the end.
> + */
> + vmovaps TANHF_DATA(SgnMask)(%rip), %ymm2
> + vandps %ymm2, %ymm0, %ymm3
> + /* Load constants including One = 1. */
> + vmovups TANHF_DATA(sOne)(%rip), %ymm5
> + vsubps %ymm3, %ymm5, %ymm1
> + vmovups TANHF_DATA(sTopMask12)(%rip), %ymm4
> +
> + vrcpps %ymm1, %ymm7
> + vsubps %ymm1, %ymm5, %ymm9
> + vandps %ymm4, %ymm7, %ymm6
> + vsubps %ymm3, %ymm9, %ymm7
> +
> + /* No need to split sU when FMA is available. */
> + vfnmadd213ps %ymm5, %ymm6, %ymm1
> + vmovaps %ymm0, %ymm8
> + vfmadd213ps %ymm0, %ymm0, %ymm0
> + vfnmadd231ps %ymm6, %ymm7, %ymm1
> +
> + /* Check whether |X| < 1, in which case we use the main function.
> + Otherwise set the rangemask so that the callout will get used. Note that
> + this will also use the callout for NaNs since not(NaN < 1). */
> + vcmpnlt_uqps %ymm5, %ymm3, %ymm14
> + vcmplt_oqps TANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
> +
> + /* Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
> + the upper part UHi being <= 12 bits long. Then we have atanh(X) = 1/2 *
> + log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)). */
> + vaddps %ymm3, %ymm3, %ymm3
> +
> + /* Split V as well into upper 12 bits and lower part, so that we can get
> + a preliminary quotient estimate without rounding error. */
> + vandps %ymm4, %ymm3, %ymm4
> + vsubps %ymm4, %ymm3, %ymm7
> +
> + /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.
> + */
> + vmulps %ymm4, %ymm6, %ymm4
> +
> + /* Compute D = E + E^2. */
> + vfmadd213ps %ymm1, %ymm1, %ymm1
> +
> + /* Record the sign for eventual reincorporation. */
> + vandnps %ymm8, %ymm2, %ymm3
> +
> + /* Or the sign bit in with the tiny result to handle atanh(-0)
> + correctly. */
> + vorps %ymm3, %ymm0, %ymm13
> + vmulps %ymm7, %ymm6, %ymm2
> +
> + /*
> + Compute R * (VHi + VLo) * (1 + E + E^2)
> + = R * (VHi + VLo) * (1 + D)
> + = QHi + (QHi * D + QLo + QLo * D)
> + */
> +	/* If less precision is acceptable, the `vmulps %ymm1, %ymm4, %ymm6;
> +	   vaddps %ymm1, %ymm6, %ymm1` below can be replaced with `vfmadd231ps
> +	   %ymm1, %ymm4, %ymm4`.  */
> + vmulps %ymm1, %ymm4, %ymm6
> + vfmadd213ps %ymm2, %ymm2, %ymm1
> + vaddps %ymm1, %ymm6, %ymm1
> +
> + /* Now finally accumulate the high and low parts of the argument to
> + log1p, H + L, with a final compensated summation. */
> + vaddps %ymm1, %ymm4, %ymm2
> +
> + /* reduction: compute r,n. */
> + vmovups TANHF_DATA(iBrkValue)(%rip), %ymm9
> +
> + /* Now we feed into the log1p code, using H in place of _VARG1 and later
> + incorporating L into the reduced argument. compute 1+x as high, low
> + parts. */
> + vmaxps %ymm2, %ymm5, %ymm0
> + vminps %ymm2, %ymm5, %ymm6
> +
> + /* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`). */
> + vsubps %ymm2, %ymm4, %ymm2
> + vaddps %ymm6, %ymm0, %ymm4
> + vpsubd %ymm9, %ymm4, %ymm7
> + vsubps %ymm4, %ymm0, %ymm4
> + vaddps %ymm2, %ymm1, %ymm2
> + vmovaps TANHF_DATA(iOffExpoMask)(%rip), %ymm1
> +
> + vandps %ymm1, %ymm7, %ymm0
> + vaddps %ymm4, %ymm6, %ymm4
> + vandnps %ymm7, %ymm1, %ymm6
> + vmovups TANHF_DATA(sPoly)(%rip), %ymm1
> + vpaddd %ymm9, %ymm0, %ymm0
> + vaddps %ymm4, %ymm2, %ymm4
> + vpsubd %ymm6, %ymm5, %ymm6
> +
> + /* polynomial evaluation. */
> + vsubps %ymm5, %ymm0, %ymm2
> + vfmadd231ps %ymm4, %ymm6, %ymm2
> + vfmadd213ps TANHF_DATA(sPoly + 32)(%rip), %ymm2, %ymm1
> + vfmadd213ps TANHF_DATA(sPoly + 64)(%rip), %ymm2, %ymm1
> + vfmadd213ps TANHF_DATA(sPoly + 96)(%rip), %ymm2, %ymm1
> + vfmadd213ps TANHF_DATA(sPoly + 128)(%rip), %ymm2, %ymm1
> + vfmadd213ps TANHF_DATA(sPoly + 160)(%rip), %ymm2, %ymm1
> + vfmadd213ps TANHF_DATA(sPoly + 192)(%rip), %ymm2, %ymm1
> + vfmadd213ps TANHF_DATA(sPoly + 224)(%rip), %ymm2, %ymm1
> +
> + vmulps %ymm1, %ymm2, %ymm1
> + vfmadd213ps %ymm2, %ymm2, %ymm1
> +
> + /* final reconstruction. */
> + vpsrad $23, %ymm7, %ymm6
> + vcvtdq2ps %ymm6, %ymm2
> + vfmadd132ps TANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
> +
> + /* Finally, halve the result and reincorporate the sign. */
> + vxorps TANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
> + vmulps %ymm2, %ymm3, %ymm2
> + vmovmskps %ymm14, %edx
> + testl %edx, %edx
> +
> + vblendvps %ymm15, %ymm13, %ymm2, %ymm0
> + /* Go to special inputs processing branch. */
> + jne L(SPECIAL_VALUES_BRANCH)
> +
> + /* No registers to restore on fast path. */
> + ret
> +
> +
> + /* Branch to process special inputs. */
> L(SPECIAL_VALUES_BRANCH):
> - vmovups %ymm6, 32(%rsp)
> - vmovups %ymm0, 64(%rsp)
> - # LOE rbx r12 r13 r14 r15 edx ymm0
> -
> - xorl %eax, %eax
> - # LOE rbx r12 r13 r14 r15 eax edx
> -
> - vzeroupper
> - movq %r12, 16(%rsp)
> - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> - movl %eax, %r12d
> - movq %r13, 8(%rsp)
> - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> - movl %edx, %r13d
> - movq %r14, (%rsp)
> - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> - # LOE rbx r15 r12d r13d
> -
> -/* Range mask
> - * bits check
> - */
> -
> -L(RANGEMASK_CHECK):
> - btl %r12d, %r13d
> -
> -/* Call scalar math function */
> - jc L(SCALAR_MATH_CALL)
> - # LOE rbx r15 r12d r13d
> -
> -/* Special inputs
> - * processing loop
> - */
> -
> + pushq %rbp
> +	/* Need to save callee-saved registers to preserve state across the
> +	   atanhf calls.  */
> + pushq %r12
> + pushq %r13
> + movq %rsp, %rbp
> +
> + /* Align stack and make room for 2x ymm vectors. */
> + andq $-32, %rsp
> + addq $-64, %rsp
> +
> +	/* ymm0 holds the already computed results and ymm8 the original
> +	   inputs at this point; save both for the scalar fixup loop below.  */
> +	vmovups	%ymm0, (%rsp)
> +	vmovups	%ymm8, 32(%rsp)
> +
> + vzeroupper
> +
> +	/* edx has 1s where there was a special value that needs to be handled
> +	   by an atanhf call.  */
> + movl %edx, %r13d
> L(SPECIAL_VALUES_LOOP):
> - incl %r12d
> - cmpl $8, %r12d
> -
> -/* Check bits in range mask */
> - jl L(RANGEMASK_CHECK)
> - # LOE rbx r15 r12d r13d
> -
> - movq 16(%rsp), %r12
> - cfi_restore(12)
> - movq 8(%rsp), %r13
> - cfi_restore(13)
> - movq (%rsp), %r14
> - cfi_restore(14)
> - vmovups 64(%rsp), %ymm0
> -
> -/* Go to exit */
> - jmp L(EXIT)
> - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
> - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> - # LOE rbx r12 r13 r14 r15 ymm0
> -
> -/* Scalar math fucntion call
> - * to process special input
> - */
> -
> -L(SCALAR_MATH_CALL):
> - movl %r12d, %r14d
> - movss 32(%rsp,%r14,4), %xmm0
> - call atanhf@PLT
> - # LOE rbx r14 r15 r12d r13d xmm0
> -
> - movss %xmm0, 64(%rsp,%r14,4)
> -
> -/* Process special inputs in loop */
> - jmp L(SPECIAL_VALUES_LOOP)
> - # LOE rbx r15 r12d r13d
> +	/* Use r12 as the index of the special value; it is preserved across
> +	   the calls to atanhf.  We technically don't need a callee-saved
> +	   register here, as the offset from rsp is always in [0, 28], so rsp
> +	   could be recovered by realigning to 64.  Essentially the tradeoff
> +	   is one extra save/restore vs. two extra instructions in the loop. */
> + xorl %r12d, %r12d
> + tzcntl %r13d, %r12d
> +
> +	/* Scalar math function call to process special input. */
> + movss 32(%rsp, %r12, 4), %xmm0
> + call atanhf@PLT
> + /* No good way to avoid the store-forwarding fault this will cause on
> + return. `lfence` avoids the SF fault but at greater cost as it
> +	   serializes stack/callee save restoration. */
> + movss %xmm0, (%rsp, %r12, 4)
> +
> + blsr %r13d, %r13d
> + jnz L(SPECIAL_VALUES_LOOP)
> +
> +	/* All results have been written to (%rsp). */
> + vmovups (%rsp), %ymm0
> + movq %rbp, %rsp
> + popq %r13
> + popq %r12
> + popq %rbp
> + ret
> END(_ZGVdN8v_atanhf_avx2)
>
> - .section .rodata, "a"
> - .align 32
> -
> + .section .rodata, "a"
> + .align 32
> #ifdef __svml_satanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> - __declspec(align(32)) VUINT32 SgnMask[8][1];
> - __declspec(align(32)) VUINT32 sOne[8][1];
> - __declspec(align(32)) VUINT32 sPoly[8][8][1];
> - __declspec(align(32)) VUINT32 iBrkValue[8][1];
> - __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
> - __declspec(align(32)) VUINT32 sHalf[8][1];
> - __declspec(align(32)) VUINT32 sSign[8][1];
> - __declspec(align(32)) VUINT32 sTopMask12[8][1];
> - __declspec(align(32)) VUINT32 TinyRange[8][1];
> - __declspec(align(32)) VUINT32 sLn2[8][1];
> -} __svml_satanh_data_internal;
> + typedef unsigned int VUINT32;
> + typedef struct{
> + __declspec (align(32))VUINT32 SgnMask[8][1];
> + __declspec (align(32))VUINT32 sOne[8][1];
> + __declspec (align(32))VUINT32 sTopMask12[8][1];
> + __declspec (align(32))VUINT32 TinyRange[8][1];
> + __declspec (align(32))VUINT32 iBrkValue[8][1];
> + __declspec (align(32))VUINT32 iOffExpoMask[8][1];
> + __declspec (align(32))VUINT32 sPoly[8][8][1];
> + __declspec (align(32))VUINT32 sLn2[8][1];
> + __declspec (align(32))VUINT32 sHalf[8][1];
> + }__svml_satanh_data_internal;
> #endif
> __svml_satanh_data_internal:
> - /*== SgnMask ==*/
> - .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> - /*== sOne = SP 1.0 ==*/
> - .align 32
> - .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> - /*== sPoly[] = SP polynomial ==*/
> - .align 32
> - .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
> - .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
> - .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
> - .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
> - .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
> - .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
> - .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
> - .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
> - /*== iBrkValue = SP 2/3 ==*/
> - .align 32
> - .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> - /*== iOffExpoMask = SP significand mask ==*/
> - .align 32
> - .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> - /*== sHalf ==*/
> - .align 32
> - .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> - /*== sSign ==*/
> - .align 32
> - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> - /*== sTopMask12 ==*/
> - .align 32
> - .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> - /*== TinyRange ==*/
> - .align 32
> - .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> - /*== sLn2 = SP ln(2) ==*/
> - .align 32
> - .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> - .align 32
> - .type __svml_satanh_data_internal,@object
> - .size __svml_satanh_data_internal,.-__svml_satanh_data_internal
> + /* SgnMask. */
> + .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> + .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> + /* sOne = SP 1.0. */
> + .align 32
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> + .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> + /* sTopMask12. */
> + .align 32
> + .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> + .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
> + /* TinyRange. */
> + .align 32
> + .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> + .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
> + /* iBrkValue = SP 2/3. */
> + .align 32
> + .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> + .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
> + /* iOffExpoMask = SP significand mask. */
> + .align 32
> + .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> + .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
> + /* sPoly[] = SP polynomial. */
> + .align 32
> + .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
> + .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7. */
> + .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
> + .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6. */
> + .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
> + .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5. */
> + .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
> + .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4. */
> + .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
> + .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3. */
> + .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
> + .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2. */
> + .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
> + .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1. */
> + .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> + .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0. */
> + /* sLn2 = SP ln(2). */
> + .align 32
> + .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> + .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
> + /* sHalf. */
> + .align 32
> + .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> + .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
> + .align 32
> + .type __svml_satanh_data_internal, @object
> + .size __svml_satanh_data_internal, .-__svml_satanh_data_internal
> --
> 2.25.1
>
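
For reference, the reworked special-values path shared by all three variants replaces the old bit-test loop with a find-first-set/clear-lowest-bit walk over the lane mask in edx (tzcnt/blsr in the AVX2/AVX512 variants, bsf plus lea/and in the SSE4 one). A rough C equivalent of that bit iteration, assuming a GCC-style __builtin_ctz (illustrative only, not part of the patch; the helper name is made up):

    #include <math.h>
    #include <stdint.h>

    /* Walk the set bits of the lane mask the same way the new assembly
       does: tzcnt finds the next special lane, blsr clears it.  The
       scalar atanhf result overwrites the saved vector result for that
       lane.  */
    static void
    fixup_special_lanes (float *results, const float *inputs, uint32_t mask)
    {
      while (mask != 0)
        {
          unsigned int lane = __builtin_ctz (mask); /* tzcntl %r13d, %r12d */
          results[lane] = atanhf (inputs[lane]);    /* call atanhf@PLT */
          mask &= mask - 1;                         /* blsr  %r13d, %r13d */
        }
    }

The callee-save pushes and the stack setup now live entirely inside this branch, which is what removes them from the fast path.
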
@@ -31,363 +31,343 @@
*
*/
-/* Offsets for data table __svml_satanh_data_internal_avx512
- */
-#define Log_tbl_H 0
-#define Log_tbl_L 128
-#define One 256
-#define AbsMask 320
-#define AddB5 384
-#define RcpBitMask 448
-#define poly_coeff3 512
-#define poly_coeff2 576
-#define poly_coeff1 640
-#define poly_coeff0 704
-#define Half 768
-#define L2H 832
-#define L2L 896
+
+ /* Offsets for data table __svml_satanh_data_internal_avx512. */
+#define AbsMask 0
+#define One 64
+#define AddB5 128
+#define RcpBitMask 192
+#define Log_tbl_L_lo 256
+#define Log_tbl_L_hi 320
+#define Log_tbl_H_lo 384
+#define Log_tbl_H_hi 448
+#define L2H 512
+#define L2L 576
+#define poly_coeff3 640
+#define poly_coeff2 704
+#define poly_coeff1 768
#include <sysdep.h>
+#define TANHF_DATA(x) (x) + __svml_satanh_data_internal_avx512
- .text
- .section .text.exex512,"ax",@progbits
+ .text
+ .section .text.exex512, "ax", @progbits
ENTRY(_ZGVeN16v_atanhf_skx)
- pushq %rbp
- cfi_def_cfa_offset(16)
- movq %rsp, %rbp
- cfi_def_cfa(6, 16)
- cfi_offset(6, -16)
- andq $-64, %rsp
- subq $192, %rsp
- vmovups One+__svml_satanh_data_internal_avx512(%rip), %zmm4
-
-/* round reciprocals to 1+5b mantissas */
- vmovups AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
- vmovups RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
- vmovaps %zmm0, %zmm11
- vandps AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
-
-/* 1+y */
- vaddps {rn-sae}, %zmm4, %zmm6, %zmm9
-
-/* 1-y */
- vsubps {rn-sae}, %zmm6, %zmm4, %zmm8
- vxorps %zmm6, %zmm11, %zmm10
-
-/* Yp_high */
- vsubps {rn-sae}, %zmm4, %zmm9, %zmm2
-
-/* -Ym_high */
- vsubps {rn-sae}, %zmm4, %zmm8, %zmm5
-
-/* RcpP ~ 1/Yp */
- vrcp14ps %zmm9, %zmm12
-
-/* RcpM ~ 1/Ym */
- vrcp14ps %zmm8, %zmm13
-
-/* input outside (-1, 1) ? */
- vcmpps $21, {sae}, %zmm4, %zmm6, %k0
- vpaddd %zmm14, %zmm12, %zmm15
- vpaddd %zmm14, %zmm13, %zmm0
-
-/* Yp_low */
- vsubps {rn-sae}, %zmm2, %zmm6, %zmm3
- vandps %zmm1, %zmm15, %zmm7
- vandps %zmm1, %zmm0, %zmm12
-
-/* Ym_low */
- vaddps {rn-sae}, %zmm5, %zmm6, %zmm5
-
-/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
- vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
-
-/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
- vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
- vmovups Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
- vmovups Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
-
-/* exponents */
- vgetexpps {sae}, %zmm7, %zmm15
- vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
-
-/* Table lookups */
- vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
- vgetexpps {sae}, %zmm12, %zmm14
- vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
-
-/* Prepare table index */
- vpsrld $18, %zmm7, %zmm3
- vpsrld $18, %zmm12, %zmm2
- vmovups Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
- vmovups poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
-
-/* Km-Kp */
- vsubps {rn-sae}, %zmm15, %zmm14, %zmm1
- kmovw %k0, %edx
- vmovaps %zmm3, %zmm0
- vpermi2ps %zmm13, %zmm8, %zmm3
- vpermt2ps %zmm13, %zmm2, %zmm8
- vpermi2ps %zmm7, %zmm6, %zmm0
- vpermt2ps %zmm7, %zmm2, %zmm6
- vsubps {rn-sae}, %zmm3, %zmm8, %zmm5
-
-/* K*L2H + Th */
- vmovups L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
-
-/* K*L2L + Tl */
- vmovups L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
-
-/* polynomials */
- vmovups poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
- vmovups poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
-
-/* table values */
- vsubps {rn-sae}, %zmm0, %zmm6, %zmm0
- vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
- vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
- vmovups poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
- vmovaps %zmm3, %zmm2
- vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
- vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
- vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
- vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
- vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
- vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
-
-/* (K*L2L + Tl) + Rp*PolyP */
- vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
- vorps Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
-
-/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
- vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
- vaddps {rn-sae}, %zmm3, %zmm0, %zmm4
- vmulps {rn-sae}, %zmm9, %zmm4, %zmm0
- testl %edx, %edx
-
-/* Go to special inputs processing branch */
- jne L(SPECIAL_VALUES_BRANCH)
- # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
-
-/* Restore registers
- * and exit the function
- */
+ vandps TANHF_DATA(AbsMask)(%rip), %zmm0, %zmm6
+ vmovups TANHF_DATA(One)(%rip), %zmm4
-L(EXIT):
- movq %rbp, %rsp
- popq %rbp
- cfi_def_cfa(7, 8)
- cfi_restore(6)
- ret
- cfi_def_cfa(6, 16)
- cfi_offset(6, -16)
-
-/* Branch to process
- * special inputs
- */
+ /* 1+y. */
+ vaddps {rn-sae}, %zmm4, %zmm6, %zmm9
-L(SPECIAL_VALUES_BRANCH):
- vmovups %zmm11, 64(%rsp)
- vmovups %zmm0, 128(%rsp)
- # LOE rbx r12 r13 r14 r15 edx zmm0
-
- xorl %eax, %eax
- # LOE rbx r12 r13 r14 r15 eax edx
-
- vzeroupper
- movq %r12, 16(%rsp)
- /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
- .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
- movl %eax, %r12d
- movq %r13, 8(%rsp)
- /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
- .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
- movl %edx, %r13d
- movq %r14, (%rsp)
- /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
- .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
- # LOE rbx r15 r12d r13d
-
-/* Range mask
- * bits check
- */
+ /* 1-y. */
+ vsubps {rn-sae}, %zmm6, %zmm4, %zmm8
-L(RANGEMASK_CHECK):
- btl %r12d, %r13d
+ /* round reciprocals to 1+5b mantissas. */
+ vmovups TANHF_DATA(AddB5)(%rip), %zmm14
+ vmovups TANHF_DATA(RcpBitMask)(%rip), %zmm1
-/* Call scalar math function */
- jc L(SCALAR_MATH_CALL)
- # LOE rbx r15 r12d r13d
+ /* RcpP ~ 1/Yp. */
+ vrcp14ps %zmm9, %zmm12
-/* Special inputs
- * processing loop
- */
+ /* RcpM ~ 1/Ym. */
+ vrcp14ps %zmm8, %zmm13
-L(SPECIAL_VALUES_LOOP):
- incl %r12d
- cmpl $16, %r12d
-
-/* Check bits in range mask */
- jl L(RANGEMASK_CHECK)
- # LOE rbx r15 r12d r13d
-
- movq 16(%rsp), %r12
- cfi_restore(12)
- movq 8(%rsp), %r13
- cfi_restore(13)
- movq (%rsp), %r14
- cfi_restore(14)
- vmovups 128(%rsp), %zmm0
-
-/* Go to exit */
- jmp L(EXIT)
- /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
- .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
- /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
- .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
- /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
- .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
- # LOE rbx r12 r13 r14 r15 zmm0
-
-/* Scalar math fucntion call
- * to process special input
- */
+ /* Yp_high. */
+ vsubps {rn-sae}, %zmm4, %zmm9, %zmm2
+
+ /* -Ym_high. */
+ vsubps {rn-sae}, %zmm4, %zmm8, %zmm5
+
+
+	/* Continue rounding the reciprocals to 1+5b mantissas. */
+ vpaddd %zmm14, %zmm12, %zmm15
+ vpaddd %zmm14, %zmm13, %zmm12
+
+ /* Yp_low. */
+ vsubps {rn-sae}, %zmm2, %zmm6, %zmm3
+ vandps %zmm1, %zmm15, %zmm7
+ vandps %zmm1, %zmm12, %zmm12
+
+ /* Ym_low. */
+ vaddps {rn-sae}, %zmm5, %zmm6, %zmm5
+
+ /* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low. */
+ vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
+
+ /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low. */
+ vfmsub213ps {rn-sae}, %zmm4, %zmm12, %zmm8
-L(SCALAR_MATH_CALL):
- movl %r12d, %r14d
- movss 64(%rsp,%r14,4), %xmm0
- call atanhf@PLT
- # LOE rbx r14 r15 r12d r13d xmm0
+ vmovups TANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
+ vmovups TANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13
- movss %xmm0, 128(%rsp,%r14,4)
+ /* exponents. */
+ vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
+ vgetexpps {sae}, %zmm7, %zmm15
-/* Process special inputs in loop */
- jmp L(SPECIAL_VALUES_LOOP)
- # LOE rbx r15 r12d r13d
+
+ /* Table lookups. */
+ vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm8
+ vgetexpps {sae}, %zmm12, %zmm14
+
+
+ /* Prepare table index. */
+ vpsrld $18, %zmm7, %zmm3
+ vpsrld $18, %zmm12, %zmm2
+ vmovups TANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
+ vmovups TANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7
+ /* Km-Kp. */
+
+ vmovaps %zmm3, %zmm5
+ vpermi2ps %zmm13, %zmm10, %zmm3
+ vpermt2ps %zmm13, %zmm2, %zmm10
+ vpermi2ps %zmm7, %zmm11, %zmm5
+ vpermt2ps %zmm7, %zmm2, %zmm11
+ vsubps {rn-sae}, %zmm15, %zmm14, %zmm1
+ vsubps {rn-sae}, %zmm3, %zmm10, %zmm7
+
+ /* K*L2H + Th. */
+ vmovups TANHF_DATA(L2H)(%rip), %zmm2
+
+ /* K*L2L + Tl. */
+ vmovups TANHF_DATA(L2L)(%rip), %zmm3
+
+ /* table values. */
+ vsubps {rn-sae}, %zmm5, %zmm11, %zmm5
+ vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm5
+ vfmadd213ps {rn-sae}, %zmm7, %zmm3, %zmm1
+ /* polynomials. */
+ vmovups TANHF_DATA(poly_coeff3)(%rip), %zmm7
+ vmovups TANHF_DATA(poly_coeff2)(%rip), %zmm10
+ vmovaps %zmm10, %zmm14
+ // vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm10
+ vfmadd231ps {rn-sae}, %zmm8, %zmm7, %zmm14
+ vmovups TANHF_DATA(poly_coeff1)(%rip), %zmm12
+ vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm10
+ vfmadd213ps {rn-sae}, %zmm12, %zmm8, %zmm14
+ vfmadd213ps {rn-sae}, %zmm4, %zmm9, %zmm10
+ vfmadd213ps {rn-sae}, %zmm4, %zmm8, %zmm14
+
+ /* (K*L2L + Tl) + Rp*PolyP. */
+ vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm10
+
+ vandps %zmm12, %zmm4, %zmm12
+ vpternlogq $246, %zmm0, %zmm6, %zmm12
+
+ /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM. */
+ vfnmadd213ps {rn-sae}, %zmm10, %zmm8, %zmm14
+ vaddps {rn-sae}, %zmm14, %zmm5, %zmm8
+
+ vcmpps $21, {sae}, %zmm4, %zmm6, %k0
+ kmovw %k0, %edx
+ testl %edx, %edx
+
+ /* Go to special inputs processing branch. */
+ jne L(SPECIAL_VALUES_BRANCH)
+ vmulps {rn-sae}, %zmm12, %zmm8, %zmm0
+
+ ret
+
+ /* Branch to process special inputs. */
+L(SPECIAL_VALUES_BRANCH):
+ pushq %rbp
+	/* Need to save callee-saved registers to preserve state across the
+	   atanhf calls. */
+ pushq %r13
+ pushq %r12
+ movq %rsp, %rbp
+
+ /* Align stack and make room for 2x zmm vectors. */
+ andq $-64, %rsp
+ addq $-128, %rsp
+ vmulps {rn-sae}, %zmm12, %zmm8, %zmm1
+ vmovaps %zmm1, (%rsp)
+ vmovaps %zmm0, 64(%rsp)
+
+ vzeroupper
+
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call. */
+ movl %edx, %r13d
+L(SPECIAL_VALUES_LOOP):
+	/* Use r12 as the index of the special value; it is preserved across
+	   the calls to atanhf.  We technically don't need a callee-saved
+	   register here, as the offset from rsp is always in [0, 60], so rsp
+	   could be recovered by realigning to 64.  Essentially the tradeoff
+	   is one extra save/restore vs. two extra instructions in the loop. */
+ xorl %r12d, %r12d
+ tzcntl %r13d, %r12d
+
+	/* Scalar math function call to process special input. */
+ movss 64(%rsp, %r12, 4), %xmm0
+ call atanhf@PLT
+
+ /* No good way to avoid the store-forwarding fault this will cause on
+ return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration. */
+ movss %xmm0, (%rsp, %r12, 4)
+
+ blsr %r13d, %r13d
+ jnz L(SPECIAL_VALUES_LOOP)
+
+	/* All results have been written to (%rsp). */
+ vmovaps (%rsp), %zmm0
+ /* Restore rsp. */
+ movq %rbp, %rsp
+ /* Restore callee save registers. */
+ popq %r12
+ popq %r13
+ popq %rbp
+ ret
END(_ZGVeN16v_atanhf_skx)
- .section .rodata, "a"
- .align 64
+ .section .rodata, "a"
+ .align 64
#ifdef __svml_satanh_data_internal_avx512_typedef
-typedef unsigned int VUINT32;
-typedef struct {
- __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
- __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
- __declspec(align(64)) VUINT32 One[16][1];
- __declspec(align(64)) VUINT32 AbsMask[16][1];
- __declspec(align(64)) VUINT32 AddB5[16][1];
- __declspec(align(64)) VUINT32 RcpBitMask[16][1];
- __declspec(align(64)) VUINT32 poly_coeff3[16][1];
- __declspec(align(64)) VUINT32 poly_coeff2[16][1];
- __declspec(align(64)) VUINT32 poly_coeff1[16][1];
- __declspec(align(64)) VUINT32 poly_coeff0[16][1];
- __declspec(align(64)) VUINT32 Half[16][1];
- __declspec(align(64)) VUINT32 L2H[16][1];
- __declspec(align(64)) VUINT32 L2L[16][1];
- } __svml_satanh_data_internal_avx512;
+ typedef unsigned int VUINT32;
+ typedef struct{
+ __declspec (align(64))VUINT32 AbsMask[16][1];
+ __declspec (align(64))VUINT32 One[16][1];
+ __declspec (align(64))VUINT32 AddB5[16][1];
+ __declspec (align(64))VUINT32 RcpBitMask[16][1];
+ __declspec (align(64))VUINT32 Log_tbl_L_lo[16][1];
+ __declspec (align(64))VUINT32 Log_tbl_L_hi[16][1];
+ __declspec (align(64))VUINT32 Log_tbl_H_lo[16][1];
+ __declspec (align(64))VUINT32 Log_tbl_H_hi[16][1];
+ __declspec (align(64))VUINT32 L2H[16][1];
+ __declspec (align(64))VUINT32 L2L[16][1];
+ __declspec (align(64))VUINT32 poly_coeff3[16][1];
+ __declspec (align(64))VUINT32 poly_coeff2[16][1];
+ __declspec (align(64))VUINT32 poly_coeff1[16][1];
+ }__svml_satanh_data_internal_avx512;
#endif
__svml_satanh_data_internal_avx512:
- /*== Log_tbl_H ==*/
- .long 0x00000000
- .long 0x3cfc0000
- .long 0x3d780000
- .long 0x3db78000
- .long 0x3df10000
- .long 0x3e14c000
- .long 0x3e300000
- .long 0x3e4a8000
- .long 0x3e648000
- .long 0x3e7dc000
- .long 0x3e8b4000
- .long 0x3e974000
- .long 0x3ea30000
- .long 0x3eae8000
- .long 0x3eb9c000
- .long 0x3ec4e000
- .long 0x3ecfa000
- .long 0x3eda2000
- .long 0x3ee48000
- .long 0x3eeea000
- .long 0x3ef8a000
- .long 0x3f013000
- .long 0x3f05f000
- .long 0x3f0aa000
- .long 0x3f0f4000
- .long 0x3f13d000
- .long 0x3f184000
- .long 0x3f1ca000
- .long 0x3f20f000
- .long 0x3f252000
- .long 0x3f295000
- .long 0x3f2d7000
- /*== Log_tbl_L ==*/
- .align 64
- .long 0x00000000
- .long 0x3726c39e
- .long 0x38a30c01
- .long 0x37528ae5
- .long 0x38e0edc5
- .long 0xb8ab41f8
- .long 0xb7cf8f58
- .long 0x3896a73d
- .long 0xb5838656
- .long 0x380c36af
- .long 0xb8235454
- .long 0x3862bae1
- .long 0x38c5e10e
- .long 0x38dedfac
- .long 0x38ebfb5e
- .long 0xb8e63c9f
- .long 0xb85c1340
- .long 0x38777bcd
- .long 0xb6038656
- .long 0x37d40984
- .long 0xb8b85028
- .long 0xb8ad5a5a
- .long 0x3865c84a
- .long 0x38c3d2f5
- .long 0x383ebce1
- .long 0xb8a1ed76
- .long 0xb7a332c4
- .long 0xb779654f
- .long 0xb8602f73
- .long 0x38f85db0
- .long 0x37b4996f
- .long 0xb8bfb3ca
- /*== One ==*/
- .align 64
- .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
- /*== AbsMask ==*/
- .align 64
- .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
- /*== AddB5 ==*/
- .align 64
- .long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
- /*== RcpBitMask ==*/
- .align 64
- .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
- /*== poly_coeff3 ==*/
- .align 64
- .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
- /*== poly_coeff2 ==*/
- .align 64
- .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
- /*== poly_coeff1 ==*/
- .align 64
- .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
- /*== poly_coeff0 ==*/
- .align 64
- .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
- /*== Half ==*/
- .align 64
- .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
- /*== L2H = log(2)_high ==*/
- .align 64
- .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
- /*== L2L = log(2)_low ==*/
- .align 64
- .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
- .align 64
- .type __svml_satanh_data_internal_avx512,@object
- .size __svml_satanh_data_internal_avx512,.-__svml_satanh_data_internal_avx512
+ /* AbsMask. */
+ .align 64
+ .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+ .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+ .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+ .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+ /* One. */
+ .align 64
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+ /* AddB5. */
+ .align 64
+ .long 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ .long 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ .long 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ .long 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ /* RcpBitMask. */
+ .align 64
+ .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+ .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+ .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+ .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+ /* Log_tbl_L_lo. */
+ .align 64
+ .long 0x00000000
+ .long 0x3726c39e
+ .long 0x38a30c01
+ .long 0x37528ae5
+ .long 0x38e0edc5
+ .long 0xb8ab41f8
+ .long 0xb7cf8f58
+ .long 0x3896a73d
+ .long 0xb5838656
+ .long 0x380c36af
+ .long 0xb8235454
+ .long 0x3862bae1
+ .long 0x38c5e10e
+ .long 0x38dedfac
+ .long 0x38ebfb5e
+ .long 0xb8e63c9f
+ /* Log_tbl_L_hi. */
+ .align 64
+ .long 0xb85c1340
+ .long 0x38777bcd
+ .long 0xb6038656
+ .long 0x37d40984
+ .long 0xb8b85028
+ .long 0xb8ad5a5a
+ .long 0x3865c84a
+ .long 0x38c3d2f5
+ .long 0x383ebce1
+ .long 0xb8a1ed76
+ .long 0xb7a332c4
+ .long 0xb779654f
+ .long 0xb8602f73
+ .long 0x38f85db0
+ .long 0x37b4996f
+ .long 0xb8bfb3ca
+ /* Log_tbl_H_lo. */
+ .align 64
+ .long 0x00000000
+ .long 0x3cfc0000
+ .long 0x3d780000
+ .long 0x3db78000
+ .long 0x3df10000
+ .long 0x3e14c000
+ .long 0x3e300000
+ .long 0x3e4a8000
+ .long 0x3e648000
+ .long 0x3e7dc000
+ .long 0x3e8b4000
+ .long 0x3e974000
+ .long 0x3ea30000
+ .long 0x3eae8000
+ .long 0x3eb9c000
+ .long 0x3ec4e000
+ /* Log_tbl_H_hi. */
+ .align 64
+ .long 0x3ecfa000
+ .long 0x3eda2000
+ .long 0x3ee48000
+ .long 0x3eeea000
+ .long 0x3ef8a000
+ .long 0x3f013000
+ .long 0x3f05f000
+ .long 0x3f0aa000
+ .long 0x3f0f4000
+ .long 0x3f13d000
+ .long 0x3f184000
+ .long 0x3f1ca000
+ .long 0x3f20f000
+ .long 0x3f252000
+ .long 0x3f295000
+ .long 0x3f2d7000
+ /* L2H = log(2)_high. */
+ .align 64
+ .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+ .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+ .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+ .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+ /* L2L = log(2)_low. */
+ .align 64
+ .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+ .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+ .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+ .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+ /* poly_coeff3. */
+ .align 64
+ .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+ .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+ .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+ .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+ /* poly_coeff2. */
+ .align 64
+ .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+ .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+ .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+ .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+ /* poly_coeff1. */
+ .align 64
+ .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+ .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+ .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+ .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+ .align 64
+ .type __svml_satanh_data_internal_avx512, @object
+ .size __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
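
To put the AVX512 hunk above in context: it computes atanh(x) as sign(x) * 0.5 * (log(1+|x|) - log(1-|x|)), with each log built from a vrcp14ps-based reduction, the 32-entry Log_tbl_{H,L} tables (laid out here as 16-entry _lo/_hi halves for the two-register vpermi2ps/vpermt2ps lookups), and the poly_coeff* polynomial. A scalar sketch of just that identity, for orientation (not the table-driven reduction itself):

    #include <math.h>

    /* Orientation only: the identity the AVX512 kernel vectorizes.
       Inputs with |x| >= 1 (and NaN) take the scalar callout in the
       real code.  */
    static float
    atanhf_via_two_logs (float x)
    {
      float ax = fabsf (x);
      return copysignf (0.5f * (logf (1.0f + ax) - logf (1.0f - ax)), x);
    }
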
@@ -28,334 +28,278 @@
* atanh(-1) = -INF
* atanh(x) = NaN if |x| > 1, or if x is a NaN or INF
*
- */
-
-/* Offsets for data table __svml_satanh_data_internal
- */
-#define SgnMask 0
-#define sOne 16
-#define sPoly 32
-#define iBrkValue 160
-#define iOffExpoMask 176
-#define sHalf 192
-#define sSign 208
-#define sTopMask12 224
-#define TinyRange 240
-#define sLn2 256
+*/
-#include <sysdep.h>
- .text
- .section .text.sse4,"ax",@progbits
-ENTRY(_ZGVbN4v_atanhf_sse4)
- subq $72, %rsp
- cfi_def_cfa_offset(80)
- movaps %xmm0, %xmm5
-
-/* Load constants including One = 1 */
- movups sOne+__svml_satanh_data_internal(%rip), %xmm4
- movaps %xmm5, %xmm3
-
-/* Strip off the sign, so treat X as positive until right at the end */
- movups SgnMask+__svml_satanh_data_internal(%rip), %xmm7
- movaps %xmm4, %xmm8
- andps %xmm5, %xmm7
- movaps %xmm4, %xmm10
- movups sTopMask12+__svml_satanh_data_internal(%rip), %xmm11
- movaps %xmm4, %xmm14
- movaps %xmm11, %xmm9
+ /* Offsets for data table __svml_satanh_data_internal. */
+#define sOne 0
+#define SgnMask 16
+#define sTopMask12 32
+#define iBrkValue 48
+#define iOffExpoMask 64
+#define sPoly 80
+#define sLn2 208
+#define TinyRange 224
-/*
- * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
- * the upper part UHi being <= 12 bits long. Then we have
- * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
- */
- movaps %xmm7, %xmm12
-
-/*
- * Check whether |X| < 1, in which case we use the main function.
- * Otherwise set the rangemask so that the callout will get used.
- * Note that this will also use the callout for NaNs since not(NaN < 1).
- */
- movaps %xmm7, %xmm6
- movaps %xmm7, %xmm2
- cmpnltps %xmm4, %xmm6
- cmpltps TinyRange+__svml_satanh_data_internal(%rip), %xmm2
- mulps %xmm5, %xmm3
- subps %xmm7, %xmm8
- addps %xmm7, %xmm12
- movmskps %xmm6, %edx
- subps %xmm8, %xmm10
- addps %xmm5, %xmm3
- subps %xmm7, %xmm10
- andps %xmm8, %xmm9
-
-/*
- * Now we feed into the log1p code, using H in place of _VARG1 and
- * later incorporating L into the reduced argument.
- * compute 1+x as high, low parts
- */
- movaps %xmm4, %xmm7
-
-/*
- * Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E
- * The first FMR is exact (we force R to 12 bits just in case it
- * isn't already, to make absolutely sure), and since E is ~ 2^-12,
- * the rounding error in the other one is acceptable.
- */
- rcpps %xmm9, %xmm15
- subps %xmm9, %xmm8
- andps %xmm11, %xmm15
-/*
- * Split V as well into upper 12 bits and lower part, so that we can get
- * a preliminary quotient estimate without rounding error.
- */
- andps %xmm12, %xmm11
- mulps %xmm15, %xmm9
- addps %xmm8, %xmm10
- subps %xmm11, %xmm12
-
-/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
- mulps %xmm15, %xmm11
- mulps %xmm15, %xmm10
- subps %xmm9, %xmm14
- mulps %xmm12, %xmm15
- subps %xmm10, %xmm14
-
-/* Compute D = E + E^2 */
- movaps %xmm14, %xmm13
- movaps %xmm4, %xmm8
- mulps %xmm14, %xmm13
-
-/* reduction: compute r,n */
- movdqu iBrkValue+__svml_satanh_data_internal(%rip), %xmm9
- addps %xmm13, %xmm14
+#include <sysdep.h>
+#define TANHF_DATA(x) (x) + __svml_satanh_data_internal
-/*
- * Compute R * (VHi + VLo) * (1 + E + E^2)
- * = R * (VHi + VLo) * (1 + D)
- * = QHi + (QHi * D + QLo + QLo * D)
- */
- movaps %xmm14, %xmm0
- mulps %xmm15, %xmm14
- mulps %xmm11, %xmm0
- addps %xmm14, %xmm15
- movdqu iOffExpoMask+__svml_satanh_data_internal(%rip), %xmm12
- movaps %xmm4, %xmm14
-
-/* Record the sign for eventual reincorporation. */
- movups sSign+__svml_satanh_data_internal(%rip), %xmm1
- addps %xmm15, %xmm0
+ .text
+ .section .text.sse4, "ax", @progbits
+ENTRY(_ZGVbN4v_atanhf_sse4)
+ movaps %xmm0, %xmm5
+
+ /* Load constants including One = 1. */
+ movups TANHF_DATA(sOne)(%rip), %xmm4
+ movaps %xmm5, %xmm3
+
+ /* Strip off the sign, so treat X as positive until right at the end.
+ */
+ movups TANHF_DATA(SgnMask)(%rip), %xmm1
+ movaps %xmm4, %xmm2
+ andps %xmm1, %xmm0
+ movaps %xmm4, %xmm10
+ movups TANHF_DATA(sTopMask12)(%rip), %xmm11
+ movaps %xmm4, %xmm14
+ movaps %xmm11, %xmm9
+
+
+ /* Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
+ the upper part UHi being <= 12 bits long. Then we have atanh(X) = 1/2 *
+ log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)). */
+ movaps %xmm0, %xmm6
+ mulps %xmm5, %xmm3
+ subps %xmm0, %xmm2
+ addps %xmm0, %xmm6
+ subps %xmm2, %xmm10
+ addps %xmm5, %xmm3
+ subps %xmm0, %xmm10
+ andps %xmm2, %xmm9
+
+
+ /* Now compute R = 1/(UHi+ULo) * (1 - E) and the error term E The first
+ FMR is exact (we force R to 12 bits just in case it isn't already, to
+ make absolutely sure), and since E is ~ 2^-12, the rounding error in the
+ other one is acceptable. */
+ rcpps %xmm9, %xmm7
+ subps %xmm9, %xmm2
+ andps %xmm11, %xmm7
+
+
+ /* Split V as well into upper 12 bits and lower part, so that we can get
+ a preliminary quotient estimate without rounding error. */
+
+ andps %xmm6, %xmm11
+ mulps %xmm7, %xmm9
+ addps %xmm2, %xmm10
+ subps %xmm11, %xmm6
+
+ /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.
+ */
+ mulps %xmm7, %xmm11
+ mulps %xmm7, %xmm10
+ subps %xmm9, %xmm14
+ mulps %xmm6, %xmm7
+ subps %xmm10, %xmm14
+
+ /* Compute D = E + E^2. */
+ movaps %xmm14, %xmm13
+ movaps %xmm4, %xmm8
+ mulps %xmm14, %xmm13
+
+ /* reduction: compute r,n. */
+ movdqu TANHF_DATA(iBrkValue)(%rip), %xmm9
+ addps %xmm13, %xmm14
+
+ /*
+ * Compute R * (VHi + VLo) * (1 + E + E^2)
+ * = R * (VHi + VLo) * (1 + D)
+ * = QHi + (QHi * D + QLo + QLo * D)
+ */
+ movaps %xmm14, %xmm2
+ mulps %xmm7, %xmm14
+ mulps %xmm11, %xmm2
+ addps %xmm14, %xmm7
+ movdqu TANHF_DATA(iOffExpoMask)(%rip), %xmm12
+ movaps %xmm4, %xmm14
+
+	/* xmm2 = QHi * D + QLo + QLo * D. */
+	addps %xmm7, %xmm2
+
+
+ /* Now finally accumulate the high and low parts of the argument to
+ log1p, H + L, with a final compensated summation. */
+ movaps %xmm2, %xmm6
+ andnps %xmm5, %xmm1
+ movaps %xmm4, %xmm7
+	/* The sign bit of the input was extracted into xmm1 above; it is
+	   OR'ed into the tiny-range result below to handle atanh(-0)
+	   correctly. */
+ addps %xmm11, %xmm6
+ maxps %xmm6, %xmm7
+ minps %xmm6, %xmm8
+ subps %xmm6, %xmm11
+ movaps %xmm7, %xmm10
+ addps %xmm8, %xmm10
+ addps %xmm11, %xmm2
+ subps %xmm10, %xmm7
+ psubd %xmm9, %xmm10
+ addps %xmm8, %xmm7
+ pand %xmm10, %xmm12
+ psrad $23, %xmm10
+ cvtdq2ps %xmm10, %xmm13
+ addps %xmm7, %xmm2
+
+ /* final reconstruction. */
+ pslld $23, %xmm10
+ paddd %xmm9, %xmm12
+ psubd %xmm10, %xmm14
+
+ /* polynomial evaluation. */
+ subps %xmm4, %xmm12
+ mulps %xmm14, %xmm2
+ movups TANHF_DATA(sPoly)(%rip), %xmm7
+ addps %xmm12, %xmm2
+ mulps %xmm2, %xmm7
+
+
+	/* Continue the polynomial (Horner) evaluation. */
+ addps TANHF_DATA(sPoly + 16)(%rip), %xmm7
+ mulps %xmm2, %xmm7
+ addps TANHF_DATA(sPoly + 32)(%rip), %xmm7
+ mulps %xmm2, %xmm7
+ addps TANHF_DATA(sPoly + 48)(%rip), %xmm7
+ mulps %xmm2, %xmm7
+ addps TANHF_DATA(sPoly + 64)(%rip), %xmm7
+ mulps %xmm2, %xmm7
+ addps TANHF_DATA(sPoly + 80)(%rip), %xmm7
+ mulps %xmm2, %xmm7
+ addps TANHF_DATA(sPoly + 96)(%rip), %xmm7
+ mulps %xmm2, %xmm7
+ movaps TANHF_DATA(sPoly + 112)(%rip), %xmm6
+ addps %xmm6, %xmm7
+ mulps %xmm2, %xmm7
+ mulps %xmm2, %xmm7
+ mulps TANHF_DATA(sLn2)(%rip), %xmm13
+	/* Finally, halve the result and reincorporate the sign.  sHalf (0.5f)
+	   is rebuilt as (P0 & sOne): 0xbf000000 & 0x3f800000 == 0x3f000000. */
+ andps %xmm4, %xmm6
+ orps %xmm1, %xmm3
+ xorps %xmm6, %xmm1
+
+ addps %xmm2, %xmm7
+ addps %xmm13, %xmm7
+ mulps %xmm7, %xmm1
+
+ /* Check whether |X| < 1, in which case we use the main function.
+ Otherwise set the rangemask so that the callout will get used. Note that
+ this will also use the callout for NaNs since not(NaN < 1). */
+ cmpleps %xmm0, %xmm4
+ movmskps %xmm4, %edx
+ cmpltps TANHF_DATA(TinyRange)(%rip), %xmm0
+
+ andps %xmm0, %xmm3
+ andnps %xmm1, %xmm0
+ orps %xmm3, %xmm0
+
+ testl %edx, %edx
+ /* Go to special inputs processing branch. */
+ jne L(SPECIAL_VALUES_BRANCH)
+
+ /* No registers to restore on fast path. */
+ ret
+
+ /* Branch to process special inputs. */
+L(SPECIAL_VALUES_BRANCH):
+ subq $56, %rsp
-/*
- * Now finally accumulate the high and low parts of the
- * argument to log1p, H + L, with a final compensated summation.
- */
- movaps %xmm0, %xmm6
- andps %xmm5, %xmm1
-
-/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
- orps %xmm1, %xmm3
- addps %xmm11, %xmm6
- maxps %xmm6, %xmm7
- minps %xmm6, %xmm8
- subps %xmm6, %xmm11
- movaps %xmm7, %xmm10
- andps %xmm2, %xmm3
- addps %xmm8, %xmm10
- addps %xmm11, %xmm0
- subps %xmm10, %xmm7
- psubd %xmm9, %xmm10
- addps %xmm7, %xmm8
- pand %xmm10, %xmm12
- psrad $23, %xmm10
- cvtdq2ps %xmm10, %xmm13
- addps %xmm8, %xmm0
-
-/* final reconstruction */
- mulps sLn2+__svml_satanh_data_internal(%rip), %xmm13
- pslld $23, %xmm10
- paddd %xmm9, %xmm12
- psubd %xmm10, %xmm14
-
-/* polynomial evaluation */
- subps %xmm4, %xmm12
- mulps %xmm0, %xmm14
- movups sPoly+112+__svml_satanh_data_internal(%rip), %xmm0
- addps %xmm12, %xmm14
- mulps %xmm14, %xmm0
-
-/* Finally, halve the result and reincorporate the sign */
- movups sHalf+__svml_satanh_data_internal(%rip), %xmm4
- pxor %xmm1, %xmm4
- addps sPoly+96+__svml_satanh_data_internal(%rip), %xmm0
- mulps %xmm14, %xmm0
- addps sPoly+80+__svml_satanh_data_internal(%rip), %xmm0
- mulps %xmm14, %xmm0
- addps sPoly+64+__svml_satanh_data_internal(%rip), %xmm0
- mulps %xmm14, %xmm0
- addps sPoly+48+__svml_satanh_data_internal(%rip), %xmm0
- mulps %xmm14, %xmm0
- addps sPoly+32+__svml_satanh_data_internal(%rip), %xmm0
- mulps %xmm14, %xmm0
- addps sPoly+16+__svml_satanh_data_internal(%rip), %xmm0
- mulps %xmm14, %xmm0
- addps sPoly+__svml_satanh_data_internal(%rip), %xmm0
- mulps %xmm14, %xmm0
- mulps %xmm14, %xmm0
- addps %xmm0, %xmm14
- movaps %xmm2, %xmm0
- addps %xmm13, %xmm14
- mulps %xmm14, %xmm4
- andnps %xmm4, %xmm0
- orps %xmm3, %xmm0
- testl %edx, %edx
-
-/* Go to special inputs processing branch */
- jne L(SPECIAL_VALUES_BRANCH)
- # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm5
-
-/* Restore registers
- * and exit the function
- */
-
-L(EXIT):
- addq $72, %rsp
- cfi_def_cfa_offset(8)
- ret
- cfi_def_cfa_offset(80)
-
-/* Branch to process
- * special inputs
- */
+ movups %xmm5, (%rsp)
+ movups %xmm0, 16(%rsp)
-L(SPECIAL_VALUES_BRANCH):
- movups %xmm5, 32(%rsp)
- movups %xmm0, 48(%rsp)
- # LOE rbx rbp r12 r13 r14 r15 edx
-
- xorl %eax, %eax
- movq %r12, 16(%rsp)
- cfi_offset(12, -64)
- movl %eax, %r12d
- movq %r13, 8(%rsp)
- cfi_offset(13, -72)
- movl %edx, %r13d
- movq %r14, (%rsp)
- cfi_offset(14, -80)
- # LOE rbx rbp r15 r12d r13d
-
-/* Range mask
- * bits check
- */
-
-L(RANGEMASK_CHECK):
- btl %r12d, %r13d
-
-/* Call scalar math function */
- jc L(SCALAR_MATH_CALL)
- # LOE rbx rbp r15 r12d r13d
-
-/* Special inputs
- * processing loop
- */
+ movq %r12, 32(%rsp)
+ movq %r13, 40(%rsp)
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call. */
+ movl %edx, %r13d
L(SPECIAL_VALUES_LOOP):
- incl %r12d
- cmpl $4, %r12d
-
-/* Check bits in range mask */
- jl L(RANGEMASK_CHECK)
- # LOE rbx rbp r15 r12d r13d
-
- movq 16(%rsp), %r12
- cfi_restore(12)
- movq 8(%rsp), %r13
- cfi_restore(13)
- movq (%rsp), %r14
- cfi_restore(14)
- movups 48(%rsp), %xmm0
-
-/* Go to exit */
- jmp L(EXIT)
- cfi_offset(12, -64)
- cfi_offset(13, -72)
- cfi_offset(14, -80)
- # LOE rbx rbp r12 r13 r14 r15 xmm0
-
-/* Scalar math fucntion call
- * to process special input
- */
-
-L(SCALAR_MATH_CALL):
- movl %r12d, %r14d
- movss 32(%rsp,%r14,4), %xmm0
- call atanhf@PLT
- # LOE rbx rbp r14 r15 r12d r13d xmm0
-
- movss %xmm0, 48(%rsp,%r14,4)
-
-/* Process special inputs in loop */
- jmp L(SPECIAL_VALUES_LOOP)
- # LOE rbx rbp r15 r12d r13d
+	/* Use r12 as the index of the special value; it is preserved across
+	   the calls to atanhf.  We technically don't need a callee-saved
+	   register here, as the offset from rsp is always in [0, 12], so rsp
+	   could be recovered by realigning to 64.  Essentially the tradeoff
+	   is one extra save/restore vs. two extra instructions in the loop. */
+ xorl %r12d, %r12d
+ bsfl %r13d, %r12d
+
+	/* Scalar math function call to process special input. */
+ movss (%rsp, %r12, 4), %xmm0
+ call atanhf@PLT
+ /* No good way to avoid the store-forwarding fault this will cause on
+ return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration. */
+ movss %xmm0, 16(%rsp, %r12, 4)
+
+ leal -1(%r13), %eax
+ andl %eax, %r13d
+ jnz L(SPECIAL_VALUES_LOOP)
+
+ /* All results have been written to 16(%rsp). */
+	movups 16(%rsp), %xmm0
+ movq 32(%rsp), %r12
+ movq 40(%rsp), %r13
+ addq $56, %rsp
+ ret
END(_ZGVbN4v_atanhf_sse4)
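
The "reduction: compute r,n" step that the SSE4 body above (and the AVX2 one below) performs with psubd/pand/psrad/paddd is the usual biased-by-2/3 log1p range reduction. A minimal C sketch of that bit manipulation, with the compensation terms left out and the usual two's-complement arithmetic-shift behaviour assumed (names are mine):

    #include <stdint.h>
    #include <string.h>

    /* Subtracting the bits of iBrkValue (2/3) from the bits of t = 1 + z
       splits off an exponent n and leaves a mantissa m in [2/3, 4/3), so
       log (1 + z) = n * ln2 + log (m), with r = m - 1 small enough for
       the sPoly polynomial.  */
    static void
    log1p_reduce (float t, int32_t *n, float *r)
    {
      const uint32_t brk = 0x3f2aaaab;   /* iBrkValue = 2/3 */
      const uint32_t mant = 0x007fffff;  /* iOffExpoMask */
      uint32_t ti, u, mi;
      float m;

      memcpy (&ti, &t, sizeof ti);
      u = ti - brk;                      /* psubd iBrkValue */
      *n = (int32_t) u >> 23;            /* psrad $23 (then cvtdq2ps) */
      mi = (u & mant) + brk;             /* pand iOffExpoMask; paddd */
      memcpy (&m, &mi, sizeof m);
      *r = m - 1.0f;                     /* subps sOne */
    }

The kernels then evaluate sPoly on r (coefficients now stored highest-degree first, so the Horner chain can start from a plain load) and add n * sLn2.
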
- .section .rodata, "a"
- .align 16
+ .section .rodata, "a"
+ .align 16
#ifdef __svml_satanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
- __declspec(align(16)) VUINT32 SgnMask[4][1];
- __declspec(align(16)) VUINT32 sOne[4][1];
- __declspec(align(16)) VUINT32 sPoly[8][4][1];
- __declspec(align(16)) VUINT32 iBrkValue[4][1];
- __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
- __declspec(align(16)) VUINT32 sHalf[4][1];
- __declspec(align(16)) VUINT32 sSign[4][1];
- __declspec(align(16)) VUINT32 sTopMask12[4][1];
- __declspec(align(16)) VUINT32 TinyRange[4][1];
- __declspec(align(16)) VUINT32 sLn2[4][1];
-} __svml_satanh_data_internal;
+ typedef unsigned int VUINT32;
+ typedef struct{
+ __declspec (align(16))VUINT32 sOne[4][1];
+ __declspec (align(16))VUINT32 SgnMask[4][1];
+ __declspec (align(16))VUINT32 sTopMask12[4][1];
+ __declspec (align(16))VUINT32 iBrkValue[4][1];
+ __declspec (align(16))VUINT32 iOffExpoMask[4][1];
+ __declspec (align(16))VUINT32 sPoly[8][4][1];
+ __declspec (align(16))VUINT32 sLn2[4][1];
+ __declspec (align(16))VUINT32 TinyRange[4][1];
+ }__svml_satanh_data_internal;
#endif
+
__svml_satanh_data_internal:
- /*== SgnMask ==*/
- .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
- /*== sOne = SP 1.0 ==*/
- .align 16
- .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
- /*== sPoly[] = SP polynomial ==*/
- .align 16
- .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
- .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
- .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
- .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
- .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
- .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
- .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
- .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
- /*== iBrkValue = SP 2/3 ==*/
- .align 16
- .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
- /*== iOffExpoMask = SP significand mask ==*/
- .align 16
- .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
- /*== sHalf ==*/
- .align 16
- .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
- /*== sSign ==*/
- .align 16
- .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
- /*== sTopMask12 ==*/
- .align 16
- .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
- /*== TinyRange ==*/
- .align 16
- .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
- /*== sLn2 = SP ln(2) ==*/
- .align 16
- .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
- .align 16
- .type __svml_satanh_data_internal,@object
- .size __svml_satanh_data_internal,.-__svml_satanh_data_internal
+ /* sOne = SP 1.0. */
+ .align 16
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+ /* SgnMask. */
+ .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+ /* sTopMask12. */
+ .align 16
+ .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+ /* iBrkValue = SP 2/3. */
+ .align 16
+ .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+ /* iOffExpoMask = SP significand mask. */
+ .align 16
+ .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+
+ /* sPoly[] = SP polynomial. */
+ .align 16
+ .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7. */
+ .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6. */
+ .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5. */
+ .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4. */
+ .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3. */
+ .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2. */
+ .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1. */
+ .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0. */
+
+ /* sLn2 = SP ln(2). */
+ .align 16
+ .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+ /* TinyRange. */
+ .align 16
+ .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+ .align 16
+ .type __svml_satanh_data_internal, @object
+ .size __svml_satanh_data_internal, .-__svml_satanh_data_internal
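
The SSE4 rodata above also shrinks by dropping the sHalf and sSign rows: as the comment in the function body notes, 0.5f is rebuilt from data already in registers, because the bit patterns of P0 = -0.5f (0xbf000000) and sOne = 1.0f (0x3f800000) AND together to 0x3f000000 = 0.5f, and the sign bit is recovered from SgnMask with andps/andnps. A stand-alone check of that bit identity (not patch code):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int
    main (void)
    {
      /* Bit patterns from the data table: P0 = -0.5f, sOne = 1.0f.  */
      uint32_t p0_bits = 0xbf000000u;
      uint32_t one_bits = 0x3f800000u;
      uint32_t half_bits = p0_bits & one_bits;  /* the andps in the patch */
      float half;

      memcpy (&half, &half_bits, sizeof half);
      assert (half_bits == 0x3f000000u && half == 0.5f);
      return 0;
    }
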
@@ -30,306 +30,267 @@
*
*/
-/* Offsets for data table __svml_satanh_data_internal
- */
-#define SgnMask 0
-#define sOne 32
-#define sPoly 64
-#define iBrkValue 320
-#define iOffExpoMask 352
-#define sHalf 384
-#define sSign 416
-#define sTopMask12 448
-#define TinyRange 480
-#define sLn2 512
+
+ /* Offsets for data table __svml_satanh_data_internal. */
+#define SgnMask 0
+#define sOne 32
+#define sTopMask12 64
+#define TinyRange 96
+#define iBrkValue 128
+#define iOffExpoMask 160
+#define sPoly 192
+#define sLn2 448
+#define sHalf 480
#include <sysdep.h>
+#define TANHF_DATA(x) (x) + __svml_satanh_data_internal
- .text
- .section .text.avx2,"ax",@progbits
+ .text
+ .section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_atanhf_avx2)
- pushq %rbp
- cfi_def_cfa_offset(16)
- movq %rsp, %rbp
- cfi_def_cfa(6, 16)
- cfi_offset(6, -16)
- andq $-32, %rsp
- subq $96, %rsp
-
-/* Load constants including One = 1 */
- vmovups sOne+__svml_satanh_data_internal(%rip), %ymm5
- vmovups sTopMask12+__svml_satanh_data_internal(%rip), %ymm13
- vmovaps %ymm0, %ymm6
-
-/* Strip off the sign, so treat X as positive until right at the end */
- vandps SgnMask+__svml_satanh_data_internal(%rip), %ymm6, %ymm10
- vsubps %ymm10, %ymm5, %ymm1
-
-/*
- * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
- * the upper part UHi being <= 12 bits long. Then we have
- * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
- */
- vaddps %ymm10, %ymm10, %ymm14
-
-/*
- * Check whether |X| < 1, in which case we use the main function.
- * Otherwise set the rangemask so that the callout will get used.
- * Note that this will also use the callout for NaNs since not(NaN < 1).
- */
- vcmpnlt_uqps %ymm5, %ymm10, %ymm7
- vsubps %ymm1, %ymm5, %ymm9
- vcmplt_oqps TinyRange+__svml_satanh_data_internal(%rip), %ymm10, %ymm4
- vrcpps %ymm1, %ymm11
- vsubps %ymm10, %ymm9, %ymm12
- vandps %ymm13, %ymm11, %ymm0
-
-/* No need to split sU when FMA is available */
- vfnmadd213ps %ymm5, %ymm0, %ymm1
- vmovaps %ymm6, %ymm8
- vfmadd213ps %ymm6, %ymm6, %ymm8
- vfnmadd231ps %ymm0, %ymm12, %ymm1
-
-/*
- * Split V as well into upper 12 bits and lower part, so that we can get
- * a preliminary quotient estimate without rounding error.
- */
- vandps %ymm13, %ymm14, %ymm15
- vmovmskps %ymm7, %edx
- vsubps %ymm15, %ymm14, %ymm7
-
-/* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
- vmulps %ymm15, %ymm0, %ymm10
-
-/* Compute D = E + E^2 */
- vfmadd213ps %ymm1, %ymm1, %ymm1
-
-/* Record the sign for eventual reincorporation. */
- vandps sSign+__svml_satanh_data_internal(%rip), %ymm6, %ymm3
-
-/* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
- vorps %ymm3, %ymm8, %ymm2
- vmulps %ymm7, %ymm0, %ymm8
-
-/*
- * Compute R * (VHi + VLo) * (1 + E + E^2)
- * = R * (VHi + VLo) * (1 + D)
- * = QHi + (QHi * D + QLo + QLo * D)
- */
- vmulps %ymm1, %ymm10, %ymm9
- vfmadd213ps %ymm8, %ymm8, %ymm1
- vaddps %ymm1, %ymm9, %ymm1
-
-/* reduction: compute r,n */
- vmovups iBrkValue+__svml_satanh_data_internal(%rip), %ymm9
-
-/*
- * Now finally accumulate the high and low parts of the
- * argument to log1p, H + L, with a final compensated summation.
- */
- vaddps %ymm1, %ymm10, %ymm12
- vsubps %ymm12, %ymm10, %ymm11
-
-/*
- * Now we feed into the log1p code, using H in place of _VARG1 and
- * later incorporating L into the reduced argument.
- * compute 1+x as high, low parts
- */
- vmaxps %ymm12, %ymm5, %ymm13
- vminps %ymm12, %ymm5, %ymm14
- vaddps %ymm11, %ymm1, %ymm0
- vaddps %ymm14, %ymm13, %ymm1
- vpsubd %ymm9, %ymm1, %ymm7
- vsubps %ymm1, %ymm13, %ymm15
- vpsrad $23, %ymm7, %ymm10
- vpand iOffExpoMask+__svml_satanh_data_internal(%rip), %ymm7, %ymm8
- vaddps %ymm15, %ymm14, %ymm13
- vpslld $23, %ymm10, %ymm11
- vpaddd %ymm9, %ymm8, %ymm15
- vaddps %ymm13, %ymm0, %ymm14
- vcvtdq2ps %ymm10, %ymm0
- vpsubd %ymm11, %ymm5, %ymm12
-
-/* polynomial evaluation */
- vsubps %ymm5, %ymm15, %ymm5
- vmulps %ymm14, %ymm12, %ymm1
- vaddps %ymm5, %ymm1, %ymm5
- vmovups sPoly+224+__svml_satanh_data_internal(%rip), %ymm1
- vfmadd213ps sPoly+192+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
- vfmadd213ps sPoly+160+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
- vfmadd213ps sPoly+128+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
- vfmadd213ps sPoly+96+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
- vfmadd213ps sPoly+64+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
- vfmadd213ps sPoly+32+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
- vfmadd213ps sPoly+__svml_satanh_data_internal(%rip), %ymm5, %ymm1
- vmulps %ymm1, %ymm5, %ymm7
- vfmadd213ps %ymm5, %ymm5, %ymm7
-
-/* final reconstruction */
- vfmadd132ps sLn2+__svml_satanh_data_internal(%rip), %ymm7, %ymm0
-
-/* Finally, halve the result and reincorporate the sign */
- vxorps sHalf+__svml_satanh_data_internal(%rip), %ymm3, %ymm3
- vmulps %ymm0, %ymm3, %ymm0
- vblendvps %ymm4, %ymm2, %ymm0, %ymm0
- testl %edx, %edx
-
-/* Go to special inputs processing branch */
- jne L(SPECIAL_VALUES_BRANCH)
- # LOE rbx r12 r13 r14 r15 edx ymm0 ymm6
-
-/* Restore registers
- * and exit the function
- */
-
-L(EXIT):
- movq %rbp, %rsp
- popq %rbp
- cfi_def_cfa(7, 8)
- cfi_restore(6)
- ret
- cfi_def_cfa(6, 16)
- cfi_offset(6, -16)
-
-/* Branch to process
- * special inputs
- */
-
+ /* Strip off the sign, so treat X as positive until right at the end.
+ */
+ vmovaps TANHF_DATA(SgnMask)(%rip), %ymm2
+ vandps %ymm2, %ymm0, %ymm3
+ /* Load constants including One = 1. */
+ vmovups TANHF_DATA(sOne)(%rip), %ymm5
+ vsubps %ymm3, %ymm5, %ymm1
+ vmovups TANHF_DATA(sTopMask12)(%rip), %ymm4
+
+ vrcpps %ymm1, %ymm7
+ vsubps %ymm1, %ymm5, %ymm9
+ vandps %ymm4, %ymm7, %ymm6
+ vsubps %ymm3, %ymm9, %ymm7
+
+ /* No need to split sU when FMA is available. */
+ vfnmadd213ps %ymm5, %ymm6, %ymm1
+ vmovaps %ymm0, %ymm8
+ vfmadd213ps %ymm0, %ymm0, %ymm0
+ vfnmadd231ps %ymm6, %ymm7, %ymm1
+
+ /* Check whether |X| < 1, in which case we use the main function.
+ Otherwise set the rangemask so that the callout will get used. Note that
+ this will also use the callout for NaNs since not(NaN < 1). */
+ vcmpnlt_uqps %ymm5, %ymm3, %ymm14
+ vcmplt_oqps TANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
+
+ /* Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
+ the upper part UHi being <= 12 bits long. Then we have atanh(X) = 1/2 *
+ log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)). */
+ vaddps %ymm3, %ymm3, %ymm3
+
+ /* Split V as well into upper 12 bits and lower part, so that we can get
+ a preliminary quotient estimate without rounding error. */
+ vandps %ymm4, %ymm3, %ymm4
+ vsubps %ymm4, %ymm3, %ymm7
+
+ /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo.
+ */
+ vmulps %ymm4, %ymm6, %ymm4
+
+ /* Compute D = E + E^2. */
+ vfmadd213ps %ymm1, %ymm1, %ymm1
+
+ /* Record the sign for eventual reincorporation. */
+ vandnps %ymm8, %ymm2, %ymm3
+
+ /* Or the sign bit in with the tiny result to handle atanh(-0)
+ correctly. */
+ vorps %ymm3, %ymm0, %ymm13
+ vmulps %ymm7, %ymm6, %ymm2
+
+ /*
+ Compute R * (VHi + VLo) * (1 + E + E^2)
+ = R * (VHi + VLo) * (1 + D)
+ = QHi + (QHi * D + QLo + QLo * D)
+ */
+	/* If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm6;
+	   vaddps %ymm1, %ymm6, %ymm1` pair below can be replaced with
+	   `vfmadd231ps %ymm1, %ymm4, %ymm4`. */
+ vmulps %ymm1, %ymm4, %ymm6
+ vfmadd213ps %ymm2, %ymm2, %ymm1
+ vaddps %ymm1, %ymm6, %ymm1
+
+ /* Now finally accumulate the high and low parts of the argument to
+ log1p, H + L, with a final compensated summation. */
+ vaddps %ymm1, %ymm4, %ymm2
+
+ /* reduction: compute r,n. */
+ vmovups TANHF_DATA(iBrkValue)(%rip), %ymm9
+
+ /* Now we feed into the log1p code, using H in place of _VARG1 and later
+ incorporating L into the reduced argument. compute 1+x as high, low
+ parts. */
+ vmaxps %ymm2, %ymm5, %ymm0
+ vminps %ymm2, %ymm5, %ymm6
+
+ /* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`). */
+ vsubps %ymm2, %ymm4, %ymm2
+ vaddps %ymm6, %ymm0, %ymm4
+ vpsubd %ymm9, %ymm4, %ymm7
+ vsubps %ymm4, %ymm0, %ymm4
+ vaddps %ymm2, %ymm1, %ymm2
+ vmovaps TANHF_DATA(iOffExpoMask)(%rip), %ymm1
+
+ vandps %ymm1, %ymm7, %ymm0
+ vaddps %ymm4, %ymm6, %ymm4
+ vandnps %ymm7, %ymm1, %ymm6
+ vmovups TANHF_DATA(sPoly)(%rip), %ymm1
+ vpaddd %ymm9, %ymm0, %ymm0
+ vaddps %ymm4, %ymm2, %ymm4
+ vpsubd %ymm6, %ymm5, %ymm6
+
+ /* polynomial evaluation. */
+ vsubps %ymm5, %ymm0, %ymm2
+ vfmadd231ps %ymm4, %ymm6, %ymm2
+ vfmadd213ps TANHF_DATA(sPoly + 32)(%rip), %ymm2, %ymm1
+ vfmadd213ps TANHF_DATA(sPoly + 64)(%rip), %ymm2, %ymm1
+ vfmadd213ps TANHF_DATA(sPoly + 96)(%rip), %ymm2, %ymm1
+ vfmadd213ps TANHF_DATA(sPoly + 128)(%rip), %ymm2, %ymm1
+ vfmadd213ps TANHF_DATA(sPoly + 160)(%rip), %ymm2, %ymm1
+ vfmadd213ps TANHF_DATA(sPoly + 192)(%rip), %ymm2, %ymm1
+ vfmadd213ps TANHF_DATA(sPoly + 224)(%rip), %ymm2, %ymm1
+
+ vmulps %ymm1, %ymm2, %ymm1
+ vfmadd213ps %ymm2, %ymm2, %ymm1
+
+ /* final reconstruction. */
+ vpsrad $23, %ymm7, %ymm6
+ vcvtdq2ps %ymm6, %ymm2
+ vfmadd132ps TANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
+
+ /* Finally, halve the result and reincorporate the sign. */
+ vxorps TANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
+ vmulps %ymm2, %ymm3, %ymm2
+ vmovmskps %ymm14, %edx
+ testl %edx, %edx
+
+ vblendvps %ymm15, %ymm13, %ymm2, %ymm0
+ /* Go to special inputs processing branch. */
+ jne L(SPECIAL_VALUES_BRANCH)
+
+ /* No registers to restore on fast path. */
+ ret
+
+
+ /* Branch to process special inputs. */
L(SPECIAL_VALUES_BRANCH):
- vmovups %ymm6, 32(%rsp)
- vmovups %ymm0, 64(%rsp)
- # LOE rbx r12 r13 r14 r15 edx ymm0
-
- xorl %eax, %eax
- # LOE rbx r12 r13 r14 r15 eax edx
-
- vzeroupper
- movq %r12, 16(%rsp)
- /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
- .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
- movl %eax, %r12d
- movq %r13, 8(%rsp)
- /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
- .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
- movl %edx, %r13d
- movq %r14, (%rsp)
- /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
- .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
- # LOE rbx r15 r12d r13d
-
-/* Range mask
- * bits check
- */
-
-L(RANGEMASK_CHECK):
- btl %r12d, %r13d
-
-/* Call scalar math function */
- jc L(SCALAR_MATH_CALL)
- # LOE rbx r15 r12d r13d
-
-/* Special inputs
- * processing loop
- */
-
+ pushq %rbp
+	/* Need callee-saved registers to preserve state across the atanhf
+	   calls. */
+ pushq %r12
+ pushq %r13
+ movq %rsp, %rbp
+
+ /* Align stack and make room for 2x ymm vectors. */
+ andq $-32, %rsp
+ addq $-64, %rsp
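+	/* rbp still holds the pre-alignment stack pointer, so it can be
+	   restored exactly before the pops on exit. */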
+
+	/* Save the already computed results (ymm0 holds the blended fast-path
+	   results at this point). */
+	vmovups	%ymm0, (%rsp)
+	/* Save the original input (ymm8 unchanged up to this point). */
+	vmovups	%ymm8, 32(%rsp)
+
+ vzeroupper
+
+	/* edx has 1s where there was a special value that needs to be handled
+	   by an atanhf call. */
+ movl %edx, %r13d
L(SPECIAL_VALUES_LOOP):
- incl %r12d
- cmpl $8, %r12d
-
-/* Check bits in range mask */
- jl L(RANGEMASK_CHECK)
- # LOE rbx r15 r12d r13d
-
- movq 16(%rsp), %r12
- cfi_restore(12)
- movq 8(%rsp), %r13
- cfi_restore(13)
- movq (%rsp), %r14
- cfi_restore(14)
- vmovups 64(%rsp), %ymm0
-
-/* Go to exit */
- jmp L(EXIT)
- /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
- .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
- /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
- .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
- /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
- .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
- # LOE rbx r12 r13 r14 r15 ymm0
-
-/* Scalar math fucntion call
- * to process special input
- */
-
-L(SCALAR_MATH_CALL):
- movl %r12d, %r14d
- movss 32(%rsp,%r14,4), %xmm0
- call atanhf@PLT
- # LOE rbx r14 r15 r12d r13d xmm0
-
- movss %xmm0, 64(%rsp,%r14,4)
-
-/* Process special inputs in loop */
- jmp L(SPECIAL_VALUES_LOOP)
- # LOE rbx r15 r12d r13d
+	/* Use r12 as the index of the special value; it is preserved across
+	   the calls to atanhf.  Technically we don't need a callee-saved
+	   register here, as the offset from rsp is always in [0, 28], so rsp
+	   could be recovered by re-aligning to 64.  Essentially the tradeoff
+	   is one extra save/restore versus two extra instructions in the
+	   loop. */
+ xorl %r12d, %r12d
+ tzcntl %r13d, %r12d
+
+	/* Scalar math function call to process special input. */
+ movss 32(%rsp, %r12, 4), %xmm0
+ call atanhf@PLT
+	/* No good way to avoid the store-forwarding stall this will cause when
+	   the vector result is reloaded below.  An `lfence` would avoid the
+	   stall but at greater cost, as it serializes the stack and
+	   callee-saved register restoration. */
+ movss %xmm0, (%rsp, %r12, 4)
+
+ blsr %r13d, %r13d
+ jnz L(SPECIAL_VALUES_LOOP)
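+	/* In effect the loop above is:
+	   while (mask) { i = tzcnt (mask); out[i] = atanhf (in[i]);
+			  mask = blsr (mask); }
+	   with the inputs at 32(%rsp) and the results at (%rsp). */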
+
+	/* All results have been written to (%rsp). */
+ vmovups (%rsp), %ymm0
+ movq %rbp, %rsp
+ popq %r13
+ popq %r12
+ popq %rbp
+ ret
END(_ZGVdN8v_atanhf_avx2)
- .section .rodata, "a"
- .align 32
-
+ .section .rodata, "a"
+ .align 32
#ifdef __svml_satanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct {
- __declspec(align(32)) VUINT32 SgnMask[8][1];
- __declspec(align(32)) VUINT32 sOne[8][1];
- __declspec(align(32)) VUINT32 sPoly[8][8][1];
- __declspec(align(32)) VUINT32 iBrkValue[8][1];
- __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
- __declspec(align(32)) VUINT32 sHalf[8][1];
- __declspec(align(32)) VUINT32 sSign[8][1];
- __declspec(align(32)) VUINT32 sTopMask12[8][1];
- __declspec(align(32)) VUINT32 TinyRange[8][1];
- __declspec(align(32)) VUINT32 sLn2[8][1];
-} __svml_satanh_data_internal;
+typedef unsigned int VUINT32;
+typedef struct {
+	__declspec (align(32)) VUINT32 SgnMask[8][1];
+	__declspec (align(32)) VUINT32 sOne[8][1];
+	__declspec (align(32)) VUINT32 sTopMask12[8][1];
+	__declspec (align(32)) VUINT32 TinyRange[8][1];
+	__declspec (align(32)) VUINT32 iBrkValue[8][1];
+	__declspec (align(32)) VUINT32 iOffExpoMask[8][1];
+	__declspec (align(32)) VUINT32 sPoly[8][8][1];
+	__declspec (align(32)) VUINT32 sLn2[8][1];
+	__declspec (align(32)) VUINT32 sHalf[8][1];
+} __svml_satanh_data_internal;
#endif
__svml_satanh_data_internal:
- /*== SgnMask ==*/
- .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
- /*== sOne = SP 1.0 ==*/
- .align 32
- .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
- /*== sPoly[] = SP polynomial ==*/
- .align 32
- .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
- .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
- .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
- .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
- .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
- .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
- .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
- .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
- /*== iBrkValue = SP 2/3 ==*/
- .align 32
- .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
- /*== iOffExpoMask = SP significand mask ==*/
- .align 32
- .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
- /*== sHalf ==*/
- .align 32
- .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
- /*== sSign ==*/
- .align 32
- .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
- /*== sTopMask12 ==*/
- .align 32
- .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
- /*== TinyRange ==*/
- .align 32
- .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
- /*== sLn2 = SP ln(2) ==*/
- .align 32
- .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
- .align 32
- .type __svml_satanh_data_internal,@object
- .size __svml_satanh_data_internal,.-__svml_satanh_data_internal
+ /* SgnMask. */
+ .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+ .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+ /* sOne = SP 1.0. */
+ .align 32
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+ .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+ /* sTopMask12. */
+ .align 32
+ .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+ .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
+ /* TinyRange. */
+ .align 32
+ .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+ .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
+ /* iBrkValue = SP 2/3. */
+ .align 32
+ .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+ .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
+ /* iOffExpoMask = SP significand mask. */
+ .align 32
+ .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+ .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
+ /* sPoly[] = SP polynomial. */
+ .align 32
+ .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
+ .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7. */
+ .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
+ .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6. */
+ .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
+ .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5. */
+ .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
+ .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4. */
+ .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
+ .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3. */
+ .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
+ .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2. */
+ .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
+ .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1. */
+ .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+ .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0. */
+ /* sLn2 = SP ln(2). */
+ .align 32
+ .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+ .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
+ /* sHalf. */
+ .align 32
+ .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
+ .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
+ .align 32
+ .type __svml_satanh_data_internal, @object
+ .size __svml_satanh_data_internal, .-__svml_satanh_data_internal