x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
dj/TryBot-32bit |
success
|
Build for i686
|
Commit Message
This patch fixes the SSE4.2 libmvec atan2 function accuracy for the
following inputs, reducing the maximum error to less than 4 ulps.
{0x1.bcab29da0e947p-54,0x1.bc41f4d2294b8p-54} 4.19888 ulps
{0x1.b836ed678be29p-588,0x1.b7be6f5a03a8cp-588} 4.09889 ulps
This fixes BZ #28765.
---
.../fpu/multiarch/svml_d_atan22_core_sse4.S | 321 ++++++++++--------
1 file changed, 173 insertions(+), 148 deletions(-)
Comments
On Wed, Jan 12, 2022 at 12:43 PM Sunil K Pandey <skpgkp2@gmail.com> wrote:
>
> This patch fixes SSE4.2 libmvec atan2 function accuracy for following
> inputs to less than 4 ulps.
>
> {0x1.bcab29da0e947p-54,0x1.bc41f4d2294b8p-54} 4.19888 ulps
> {0x1.b836ed678be29p-588,0x1.b7be6f5a03a8cp-588} 4.09889 ulps
>
> This fixes BZ #28765.
> ---
> .../fpu/multiarch/svml_d_atan22_core_sse4.S | 321 ++++++++++--------
> 1 file changed, 173 insertions(+), 148 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
> index 4983051323..138ff2ffa0 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
> @@ -65,7 +65,7 @@
> ENTRY(_ZGVbN2vv_atan2_sse4)
> subq $88, %rsp
> cfi_def_cfa_offset(96)
> - movaps %xmm0, %xmm8
> + movaps %xmm1, %xmm11
>
> /*
> * #define NO_VECTOR_ZERO_ATAN2_ARGS
> @@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
> * Cannot be replaced by VQRCP(D, dR0, dB);
> * Argument Absolute values
> */
> - movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
> + movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
> + movaps %xmm0, %xmm10
> movaps %xmm1, %xmm9
> - movaps %xmm4, %xmm1
> - andps %xmm8, %xmm4
> - andps %xmm9, %xmm1
> - movaps %xmm4, %xmm2
> - cmpnltpd %xmm1, %xmm2
> + andps %xmm10, %xmm1
> + andps %xmm11, %xmm9
> + movaps %xmm1, %xmm4
> + cmpnltpd %xmm9, %xmm4
>
> /* Argument signs */
> - movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
> - movaps %xmm2, %xmm0
> - movups dPIO2+__svml_datan2_data_internal(%rip), %xmm5
> - movaps %xmm3, %xmm7
> - movaps %xmm3, %xmm6
> + movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
> + movaps %xmm4, %xmm0
> + movaps %xmm5, %xmm8
> + movaps %xmm5, %xmm7
>
> /*
> * 1) If y<x then a= y, b=x, PIO2=0
> * 2) If y>x then a=-x, b=y, PIO2=Pi/2
> */
> - orps %xmm1, %xmm3
> - movaps %xmm2, %xmm10
> - andps %xmm2, %xmm5
> - andnps %xmm4, %xmm0
> - andps %xmm2, %xmm3
> - andnps %xmm1, %xmm10
> - andps %xmm4, %xmm2
> - orps %xmm3, %xmm0
> - orps %xmm2, %xmm10
> - divpd %xmm10, %xmm0
> - movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11
> -
> -/* if x<0, dPI = Pi, else dPI =0 */
> - movaps %xmm9, %xmm3
> + orps %xmm9, %xmm5
> + andnps %xmm1, %xmm0
> + andps %xmm4, %xmm5
> + andps %xmm11, %xmm8
> + movups dPIO2+__svml_datan2_data_internal(%rip), %xmm6
> + orps %xmm5, %xmm0
> + movaps %xmm4, %xmm5
> + andps %xmm4, %xmm6
> + andnps %xmm9, %xmm5
> + andps %xmm1, %xmm4
> + orps %xmm4, %xmm5
> + andps %xmm10, %xmm7
> + divpd %xmm5, %xmm0
> + movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
> + xorl %edx, %edx
>
> /* Check if y and x are on main path. */
> - pshufd $221, %xmm1, %xmm12
> - andps %xmm9, %xmm7
> - psubd %xmm11, %xmm12
> - andps %xmm8, %xmm6
> - movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
> - xorl %edx, %edx
> - movups %xmm4, 16(%rsp)
> + pshufd $221, %xmm9, %xmm3
> xorl %eax, %eax
> - pshufd $221, %xmm4, %xmm14
> - movdqa %xmm12, %xmm4
> - pcmpgtd %xmm13, %xmm4
> - pcmpeqd %xmm13, %xmm12
> - por %xmm12, %xmm4
> + pshufd $221, %xmm1, %xmm13
> + psubd %xmm2, %xmm3
> + psubd %xmm2, %xmm13
> + movdqa %xmm3, %xmm4
> + movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
> + movdqa %xmm13, %xmm14
> + pcmpgtd %xmm12, %xmm4
> + pcmpeqd %xmm12, %xmm3
> + pcmpgtd %xmm12, %xmm14
> + pcmpeqd %xmm12, %xmm13
>
> /* Polynomial. */
> movaps %xmm0, %xmm12
> + por %xmm3, %xmm4
> mulpd %xmm0, %xmm12
> - cmplepd dZERO+__svml_datan2_data_internal(%rip), %xmm3
> - psubd %xmm11, %xmm14
> - movdqa %xmm14, %xmm15
> - pcmpeqd %xmm13, %xmm14
> - pcmpgtd %xmm13, %xmm15
> - por %xmm14, %xmm15
> - movaps %xmm12, %xmm14
> - mulpd %xmm12, %xmm14
> - por %xmm15, %xmm4
> - movaps %xmm14, %xmm15
> - mulpd %xmm14, %xmm15
> - movmskps %xmm4, %ecx
> - movups %xmm10, (%rsp)
> - movups dA19+__svml_datan2_data_internal(%rip), %xmm10
> - mulpd %xmm15, %xmm10
> - movups dA18+__svml_datan2_data_internal(%rip), %xmm13
> - movups dA17+__svml_datan2_data_internal(%rip), %xmm11
> - addpd dA15+__svml_datan2_data_internal(%rip), %xmm10
> - mulpd %xmm15, %xmm13
> - mulpd %xmm15, %xmm11
> - mulpd %xmm15, %xmm10
> - addpd dA14+__svml_datan2_data_internal(%rip), %xmm13
> - addpd dA13+__svml_datan2_data_internal(%rip), %xmm11
> - addpd dA11+__svml_datan2_data_internal(%rip), %xmm10
> - mulpd %xmm15, %xmm13
> - mulpd %xmm15, %xmm11
> - mulpd %xmm15, %xmm10
> - addpd dA10+__svml_datan2_data_internal(%rip), %xmm13
> - addpd dA09+__svml_datan2_data_internal(%rip), %xmm11
> - addpd dA07+__svml_datan2_data_internal(%rip), %xmm10
> - mulpd %xmm15, %xmm13
> - mulpd %xmm15, %xmm11
> - mulpd %xmm15, %xmm10
> - addpd dA06+__svml_datan2_data_internal(%rip), %xmm13
> - addpd dA05+__svml_datan2_data_internal(%rip), %xmm11
> - addpd dA03+__svml_datan2_data_internal(%rip), %xmm10
> - mulpd %xmm15, %xmm13
> - mulpd %xmm15, %xmm11
> - mulpd %xmm12, %xmm10
> - addpd dA02+__svml_datan2_data_internal(%rip), %xmm13
> - addpd dA01+__svml_datan2_data_internal(%rip), %xmm11
> - addpd %xmm10, %xmm13
> - mulpd %xmm11, %xmm12
> - mulpd %xmm13, %xmm14
> - movups dA16+__svml_datan2_data_internal(%rip), %xmm2
> - mulpd %xmm15, %xmm2
> - addpd dA12+__svml_datan2_data_internal(%rip), %xmm2
> - mulpd %xmm15, %xmm2
> - addpd dA08+__svml_datan2_data_internal(%rip), %xmm2
> - mulpd %xmm15, %xmm2
> - addpd dA04+__svml_datan2_data_internal(%rip), %xmm2
> -
> -/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
> - mulpd %xmm2, %xmm15
> - addpd %xmm12, %xmm15
> - addpd %xmm14, %xmm15
> +
> +/* P = A19*R2 + A18 */
> + movups dA19+__svml_datan2_data_internal(%rip), %xmm15
> + movaps %xmm11, %xmm2
> + mulpd %xmm12, %xmm15
> + addpd dA18+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A17 */
> + mulpd %xmm12, %xmm15
> + addpd dA17+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A16 */
> + mulpd %xmm12, %xmm15
> + addpd dA16+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A15 */
> + mulpd %xmm12, %xmm15
> + addpd dA15+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A14 */
> + mulpd %xmm12, %xmm15
> + addpd dA14+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A13 */
> + mulpd %xmm12, %xmm15
> + addpd dA13+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A12 */
> + mulpd %xmm12, %xmm15
> + addpd dA12+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A11 */
> + mulpd %xmm12, %xmm15
> + addpd dA11+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A10 */
> + mulpd %xmm12, %xmm15
> + addpd dA10+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A09 */
> + mulpd %xmm12, %xmm15
> + addpd dA09+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A08 */
> + mulpd %xmm12, %xmm15
> + addpd dA08+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A07 */
> + mulpd %xmm12, %xmm15
> + addpd dA07+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A06 */
> + mulpd %xmm12, %xmm15
> + addpd dA06+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A05 */
> + mulpd %xmm12, %xmm15
> + addpd dA05+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A04 */
> + mulpd %xmm12, %xmm15
> + addpd dA04+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A03 */
> + mulpd %xmm12, %xmm15
> + addpd dA03+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A02 */
> + mulpd %xmm12, %xmm15
> + addpd dA02+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 + A01 */
> + mulpd %xmm12, %xmm15
> + addpd dA01+__svml_datan2_data_internal(%rip), %xmm15
> +
> +/* P = P*R2 */
> + mulpd %xmm15, %xmm12
>
> /*
> * Reconstruction.
> * dP=(R+R*dP) + dPIO2
> */
> - mulpd %xmm0, %xmm15
> - addpd %xmm15, %xmm0
> - addpd %xmm5, %xmm0
> - andps __svml_datan2_data_internal(%rip), %xmm3
> + mulpd %xmm0, %xmm12
> + addpd %xmm12, %xmm0
> +
> +/* if x<0, dPI = Pi, else dPI =0 */
> + movups dZERO+__svml_datan2_data_internal(%rip), %xmm3
> + por %xmm13, %xmm14
> + cmplepd %xmm3, %xmm2
> + addpd %xmm6, %xmm0
> + andps __svml_datan2_data_internal(%rip), %xmm2
> + orps %xmm8, %xmm0
> + addpd %xmm2, %xmm0
> + por %xmm14, %xmm4
> orps %xmm7, %xmm0
> - addpd %xmm3, %xmm0
> + movmskps %xmm4, %ecx
>
> /* Special branch for fast (vector) processing of zero arguments */
> - movups 16(%rsp), %xmm11
> - orps %xmm6, %xmm0
> testb $3, %cl
>
> /* Go to auxilary branch */
> jne L(AUX_BRANCH)
> - # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
> + # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
>
> /* Return from auxilary branch
> * for out of main path inputs
> @@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):
>
> /* Go to special inputs processing branch */
> jne L(SPECIAL_VALUES_BRANCH)
> - # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
> + # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
>
> /* Restore registers
> * and exit the function
> @@ -237,8 +264,8 @@ L(EXIT):
> */
>
> L(SPECIAL_VALUES_BRANCH):
> - movups %xmm8, 32(%rsp)
> - movups %xmm9, 48(%rsp)
> + movups %xmm10, 32(%rsp)
> + movups %xmm11, 48(%rsp)
> movups %xmm0, 64(%rsp)
> # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
>
> @@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
> */
>
> L(AUX_BRANCH):
> -/* Check if at least on of Y or Y is zero: iAXAYZERO */
> - movups dZERO+__svml_datan2_data_internal(%rip), %xmm2
> -
> /* Check if both X & Y are not NaNs: iXYnotNAN */
> - movaps %xmm9, %xmm12
> - movaps %xmm8, %xmm10
> - cmpordpd %xmm9, %xmm12
> - cmpordpd %xmm8, %xmm10
> - cmpeqpd %xmm2, %xmm1
> - cmpeqpd %xmm2, %xmm11
> - andps %xmm10, %xmm12
> - orps %xmm11, %xmm1
> - pshufd $221, %xmm1, %xmm1
> - pshufd $221, %xmm12, %xmm11
> + movaps %xmm11, %xmm13
> + movaps %xmm10, %xmm12
> + cmpordpd %xmm11, %xmm13
> + cmpordpd %xmm10, %xmm12
>
> -/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
> - pand %xmm11, %xmm1
> -
> -/* Exclude from previous callout mask zero (and not NaN) arguments */
> - movdqa %xmm1, %xmm13
> - pandn %xmm4, %xmm13
> +/* Check if at least on of Y or Y is zero: iAXAYZERO */
> + cmpeqpd %xmm3, %xmm9
> + cmpeqpd %xmm3, %xmm1
>
> /*
> * Path for zero arguments (at least one of both)
> * Check if both args are zeros (den. is zero)
> */
> - movups (%rsp), %xmm4
> - cmpeqpd %xmm2, %xmm4
> + cmpeqpd %xmm3, %xmm5
> + andps %xmm12, %xmm13
> + orps %xmm1, %xmm9
> + pshufd $221, %xmm9, %xmm1
> + pshufd $221, %xmm13, %xmm9
>
> -/* Go to callout */
> - movmskps %xmm13, %edx
> +/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
> + pand %xmm9, %xmm1
> +
> +/* Exclude from previous callout mask zero (and not NaN) arguments */
> + movdqa %xmm1, %xmm14
> + pandn %xmm4, %xmm14
>
> /* Set sPIO2 to zero if den. is zero */
> - movaps %xmm4, %xmm15
> - andps %xmm2, %xmm4
> - andnps %xmm5, %xmm15
> - andl $3, %edx
> - orps %xmm4, %xmm15
> - pshufd $221, %xmm9, %xmm5
> - orps %xmm7, %xmm15
> + movaps %xmm5, %xmm4
> + andnps %xmm6, %xmm4
> + andps %xmm3, %xmm5
>
> /* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
> - pshufd $221, %xmm2, %xmm7
> - pcmpgtd %xmm5, %xmm7
> - pshufd $80, %xmm7, %xmm14
> - andps %xmm3, %xmm14
> - addpd %xmm14, %xmm15
> + pshufd $221, %xmm3, %xmm3
> + orps %xmm5, %xmm4
> + pshufd $221, %xmm11, %xmm5
> + orps %xmm8, %xmm4
> + pcmpgtd %xmm5, %xmm3
> + pshufd $80, %xmm3, %xmm6
> + andps %xmm2, %xmm6
> + addpd %xmm6, %xmm4
> +
> +/* Go to callout */
> + movmskps %xmm14, %edx
>
> /* Merge results from main and spec path */
> - pshufd $80, %xmm1, %xmm3
> - orps %xmm6, %xmm15
> - movdqa %xmm3, %xmm6
> - andps %xmm3, %xmm15
> - andnps %xmm0, %xmm6
> - movaps %xmm6, %xmm0
> - orps %xmm15, %xmm0
> + pshufd $80, %xmm1, %xmm2
> + orps %xmm7, %xmm4
> + movdqa %xmm2, %xmm7
> + andps %xmm2, %xmm4
> + andnps %xmm0, %xmm7
> + andl $3, %edx
> + movaps %xmm7, %xmm0
> + orps %xmm4, %xmm0
>
> /* Return to main vector processing path */
> jmp L(AUX_BRANCH_RETURN)
> - # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
> + # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
> END(_ZGVbN2vv_atan2_sse4)
>
> .section .rodata, "a"
> --
> 2.34.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
@@ -65,7 +65,7 @@
ENTRY(_ZGVbN2vv_atan2_sse4)
subq $88, %rsp
cfi_def_cfa_offset(96)
- movaps %xmm0, %xmm8
+ movaps %xmm1, %xmm11
/*
* #define NO_VECTOR_ZERO_ATAN2_ARGS
@@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
* Cannot be replaced by VQRCP(D, dR0, dB);
* Argument Absolute values
*/
- movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
+ movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
+ movaps %xmm0, %xmm10
movaps %xmm1, %xmm9
- movaps %xmm4, %xmm1
- andps %xmm8, %xmm4
- andps %xmm9, %xmm1
- movaps %xmm4, %xmm2
- cmpnltpd %xmm1, %xmm2
+ andps %xmm10, %xmm1
+ andps %xmm11, %xmm9
+ movaps %xmm1, %xmm4
+ cmpnltpd %xmm9, %xmm4
/* Argument signs */
- movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
- movaps %xmm2, %xmm0
- movups dPIO2+__svml_datan2_data_internal(%rip), %xmm5
- movaps %xmm3, %xmm7
- movaps %xmm3, %xmm6
+ movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
+ movaps %xmm4, %xmm0
+ movaps %xmm5, %xmm8
+ movaps %xmm5, %xmm7
/*
* 1) If y<x then a= y, b=x, PIO2=0
* 2) If y>x then a=-x, b=y, PIO2=Pi/2
*/
- orps %xmm1, %xmm3
- movaps %xmm2, %xmm10
- andps %xmm2, %xmm5
- andnps %xmm4, %xmm0
- andps %xmm2, %xmm3
- andnps %xmm1, %xmm10
- andps %xmm4, %xmm2
- orps %xmm3, %xmm0
- orps %xmm2, %xmm10
- divpd %xmm10, %xmm0
- movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11
-
-/* if x<0, dPI = Pi, else dPI =0 */
- movaps %xmm9, %xmm3
+ orps %xmm9, %xmm5
+ andnps %xmm1, %xmm0
+ andps %xmm4, %xmm5
+ andps %xmm11, %xmm8
+ movups dPIO2+__svml_datan2_data_internal(%rip), %xmm6
+ orps %xmm5, %xmm0
+ movaps %xmm4, %xmm5
+ andps %xmm4, %xmm6
+ andnps %xmm9, %xmm5
+ andps %xmm1, %xmm4
+ orps %xmm4, %xmm5
+ andps %xmm10, %xmm7
+ divpd %xmm5, %xmm0
+ movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
+ xorl %edx, %edx
/* Check if y and x are on main path. */
- pshufd $221, %xmm1, %xmm12
- andps %xmm9, %xmm7
- psubd %xmm11, %xmm12
- andps %xmm8, %xmm6
- movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
- xorl %edx, %edx
- movups %xmm4, 16(%rsp)
+ pshufd $221, %xmm9, %xmm3
xorl %eax, %eax
- pshufd $221, %xmm4, %xmm14
- movdqa %xmm12, %xmm4
- pcmpgtd %xmm13, %xmm4
- pcmpeqd %xmm13, %xmm12
- por %xmm12, %xmm4
+ pshufd $221, %xmm1, %xmm13
+ psubd %xmm2, %xmm3
+ psubd %xmm2, %xmm13
+ movdqa %xmm3, %xmm4
+ movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
+ movdqa %xmm13, %xmm14
+ pcmpgtd %xmm12, %xmm4
+ pcmpeqd %xmm12, %xmm3
+ pcmpgtd %xmm12, %xmm14
+ pcmpeqd %xmm12, %xmm13
/* Polynomial. */
movaps %xmm0, %xmm12
+ por %xmm3, %xmm4
mulpd %xmm0, %xmm12
- cmplepd dZERO+__svml_datan2_data_internal(%rip), %xmm3
- psubd %xmm11, %xmm14
- movdqa %xmm14, %xmm15
- pcmpeqd %xmm13, %xmm14
- pcmpgtd %xmm13, %xmm15
- por %xmm14, %xmm15
- movaps %xmm12, %xmm14
- mulpd %xmm12, %xmm14
- por %xmm15, %xmm4
- movaps %xmm14, %xmm15
- mulpd %xmm14, %xmm15
- movmskps %xmm4, %ecx
- movups %xmm10, (%rsp)
- movups dA19+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm10
- movups dA18+__svml_datan2_data_internal(%rip), %xmm13
- movups dA17+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA15+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm15, %xmm10
- addpd dA14+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA13+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA11+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm15, %xmm10
- addpd dA10+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA09+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA07+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm15, %xmm10
- addpd dA06+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA05+__svml_datan2_data_internal(%rip), %xmm11
- addpd dA03+__svml_datan2_data_internal(%rip), %xmm10
- mulpd %xmm15, %xmm13
- mulpd %xmm15, %xmm11
- mulpd %xmm12, %xmm10
- addpd dA02+__svml_datan2_data_internal(%rip), %xmm13
- addpd dA01+__svml_datan2_data_internal(%rip), %xmm11
- addpd %xmm10, %xmm13
- mulpd %xmm11, %xmm12
- mulpd %xmm13, %xmm14
- movups dA16+__svml_datan2_data_internal(%rip), %xmm2
- mulpd %xmm15, %xmm2
- addpd dA12+__svml_datan2_data_internal(%rip), %xmm2
- mulpd %xmm15, %xmm2
- addpd dA08+__svml_datan2_data_internal(%rip), %xmm2
- mulpd %xmm15, %xmm2
- addpd dA04+__svml_datan2_data_internal(%rip), %xmm2
-
-/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
- mulpd %xmm2, %xmm15
- addpd %xmm12, %xmm15
- addpd %xmm14, %xmm15
+
+/* P = A19*R2 + A18 */
+ movups dA19+__svml_datan2_data_internal(%rip), %xmm15
+ movaps %xmm11, %xmm2
+ mulpd %xmm12, %xmm15
+ addpd dA18+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A17 */
+ mulpd %xmm12, %xmm15
+ addpd dA17+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A16 */
+ mulpd %xmm12, %xmm15
+ addpd dA16+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A15 */
+ mulpd %xmm12, %xmm15
+ addpd dA15+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A14 */
+ mulpd %xmm12, %xmm15
+ addpd dA14+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A13 */
+ mulpd %xmm12, %xmm15
+ addpd dA13+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A12 */
+ mulpd %xmm12, %xmm15
+ addpd dA12+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A11 */
+ mulpd %xmm12, %xmm15
+ addpd dA11+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A10 */
+ mulpd %xmm12, %xmm15
+ addpd dA10+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A09 */
+ mulpd %xmm12, %xmm15
+ addpd dA09+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A08 */
+ mulpd %xmm12, %xmm15
+ addpd dA08+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A07 */
+ mulpd %xmm12, %xmm15
+ addpd dA07+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A06 */
+ mulpd %xmm12, %xmm15
+ addpd dA06+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A05 */
+ mulpd %xmm12, %xmm15
+ addpd dA05+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A04 */
+ mulpd %xmm12, %xmm15
+ addpd dA04+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A03 */
+ mulpd %xmm12, %xmm15
+ addpd dA03+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A02 */
+ mulpd %xmm12, %xmm15
+ addpd dA02+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A01 */
+ mulpd %xmm12, %xmm15
+ addpd dA01+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 */
+ mulpd %xmm15, %xmm12
/*
* Reconstruction.
* dP=(R+R*dP) + dPIO2
*/
- mulpd %xmm0, %xmm15
- addpd %xmm15, %xmm0
- addpd %xmm5, %xmm0
- andps __svml_datan2_data_internal(%rip), %xmm3
+ mulpd %xmm0, %xmm12
+ addpd %xmm12, %xmm0
+
+/* if x<0, dPI = Pi, else dPI =0 */
+ movups dZERO+__svml_datan2_data_internal(%rip), %xmm3
+ por %xmm13, %xmm14
+ cmplepd %xmm3, %xmm2
+ addpd %xmm6, %xmm0
+ andps __svml_datan2_data_internal(%rip), %xmm2
+ orps %xmm8, %xmm0
+ addpd %xmm2, %xmm0
+ por %xmm14, %xmm4
orps %xmm7, %xmm0
- addpd %xmm3, %xmm0
+ movmskps %xmm4, %ecx
/* Special branch for fast (vector) processing of zero arguments */
- movups 16(%rsp), %xmm11
- orps %xmm6, %xmm0
testb $3, %cl
/* Go to auxilary branch */
jne L(AUX_BRANCH)
- # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
+ # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
/* Return from auxilary branch
* for out of main path inputs
@@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
- # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+ # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
/* Restore registers
* and exit the function
@@ -237,8 +264,8 @@ L(EXIT):
*/
L(SPECIAL_VALUES_BRANCH):
- movups %xmm8, 32(%rsp)
- movups %xmm9, 48(%rsp)
+ movups %xmm10, 32(%rsp)
+ movups %xmm11, 48(%rsp)
movups %xmm0, 64(%rsp)
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
@@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
*/
L(AUX_BRANCH):
-/* Check if at least on of Y or Y is zero: iAXAYZERO */
- movups dZERO+__svml_datan2_data_internal(%rip), %xmm2
-
/* Check if both X & Y are not NaNs: iXYnotNAN */
- movaps %xmm9, %xmm12
- movaps %xmm8, %xmm10
- cmpordpd %xmm9, %xmm12
- cmpordpd %xmm8, %xmm10
- cmpeqpd %xmm2, %xmm1
- cmpeqpd %xmm2, %xmm11
- andps %xmm10, %xmm12
- orps %xmm11, %xmm1
- pshufd $221, %xmm1, %xmm1
- pshufd $221, %xmm12, %xmm11
+ movaps %xmm11, %xmm13
+ movaps %xmm10, %xmm12
+ cmpordpd %xmm11, %xmm13
+ cmpordpd %xmm10, %xmm12
-/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
- pand %xmm11, %xmm1
-
-/* Exclude from previous callout mask zero (and not NaN) arguments */
- movdqa %xmm1, %xmm13
- pandn %xmm4, %xmm13
+/* Check if at least on of Y or Y is zero: iAXAYZERO */
+ cmpeqpd %xmm3, %xmm9
+ cmpeqpd %xmm3, %xmm1
/*
* Path for zero arguments (at least one of both)
* Check if both args are zeros (den. is zero)
*/
- movups (%rsp), %xmm4
- cmpeqpd %xmm2, %xmm4
+ cmpeqpd %xmm3, %xmm5
+ andps %xmm12, %xmm13
+ orps %xmm1, %xmm9
+ pshufd $221, %xmm9, %xmm1
+ pshufd $221, %xmm13, %xmm9
-/* Go to callout */
- movmskps %xmm13, %edx
+/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
+ pand %xmm9, %xmm1
+
+/* Exclude from previous callout mask zero (and not NaN) arguments */
+ movdqa %xmm1, %xmm14
+ pandn %xmm4, %xmm14
/* Set sPIO2 to zero if den. is zero */
- movaps %xmm4, %xmm15
- andps %xmm2, %xmm4
- andnps %xmm5, %xmm15
- andl $3, %edx
- orps %xmm4, %xmm15
- pshufd $221, %xmm9, %xmm5
- orps %xmm7, %xmm15
+ movaps %xmm5, %xmm4
+ andnps %xmm6, %xmm4
+ andps %xmm3, %xmm5
/* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
- pshufd $221, %xmm2, %xmm7
- pcmpgtd %xmm5, %xmm7
- pshufd $80, %xmm7, %xmm14
- andps %xmm3, %xmm14
- addpd %xmm14, %xmm15
+ pshufd $221, %xmm3, %xmm3
+ orps %xmm5, %xmm4
+ pshufd $221, %xmm11, %xmm5
+ orps %xmm8, %xmm4
+ pcmpgtd %xmm5, %xmm3
+ pshufd $80, %xmm3, %xmm6
+ andps %xmm2, %xmm6
+ addpd %xmm6, %xmm4
+
+/* Go to callout */
+ movmskps %xmm14, %edx
/* Merge results from main and spec path */
- pshufd $80, %xmm1, %xmm3
- orps %xmm6, %xmm15
- movdqa %xmm3, %xmm6
- andps %xmm3, %xmm15
- andnps %xmm0, %xmm6
- movaps %xmm6, %xmm0
- orps %xmm15, %xmm0
+ pshufd $80, %xmm1, %xmm2
+ orps %xmm7, %xmm4
+ movdqa %xmm2, %xmm7
+ andps %xmm2, %xmm4
+ andnps %xmm0, %xmm7
+ andl $3, %edx
+ movaps %xmm7, %xmm0
+ orps %xmm4, %xmm0
/* Return to main vector processing path */
jmp L(AUX_BRANCH_RETURN)
- # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+ # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
END(_ZGVbN2vv_atan2_sse4)
.section .rodata, "a"