x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]

Message ID 20220112204323.3385056-1-skpgkp2@gmail.com
State Committed
Commit 49e2bf58d57758df244eb621d63cedd2ab6d1971
Series x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]

Checks

Context                Check    Description
dj/TryBot-apply_patch  success  Patch applied to master at the time it was sent
dj/TryBot-32bit        success  Build for i686

Commit Message

Sunil Pandey Jan. 12, 2022, 8:43 p.m. UTC
  This patch fixes the accuracy of the SSE4.2 libmvec atan2 function for the
following inputs, bringing the error below 4 ulps.

{0x1.bcab29da0e947p-54,0x1.bc41f4d2294b8p-54}   4.19888 ulps
{0x1.b836ed678be29p-588,0x1.b7be6f5a03a8cp-588} 4.09889 ulps

This fixes BZ #28765.
---
 .../fpu/multiarch/svml_d_atan22_core_sse4.S   | 321 ++++++++++--------
 1 file changed, 173 insertions(+), 148 deletions(-)
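
For reference, the worst-case errors quoted above can be checked with a small
standalone harness along the lines of the sketch below (not part of this patch).
It calls the libmvec SSE entry point _ZGVbN2vv_atan2 through a hand-written
declaration and compares against a long double reference. The declaration, the
build command, and the assumption that the pairs are listed in (y, x) order are
illustrative; the glibc test suite drives the function through its own
vector-length wrappers instead.

    /* Illustrative only (not part of this patch): estimate the ulp error of
       the SSE libmvec atan2 for the two input pairs from the commit message,
       assumed here to be given in (y, x) order.  The long double reference
       has a 64-bit mantissa on x86-64, enough for a rough sub-4-ulp check.
       Assumed build command: gcc check-atan2.c -lmvec -lm  */
    #include <math.h>
    #include <stdio.h>

    typedef double v2df __attribute__ ((vector_size (16)));

    /* SSE (2-lane xmm) libmvec variant; the IFUNC normally resolves it to
       _ZGVbN2vv_atan2_sse4, the function changed by this patch.  Declared
       by hand here for illustration.  */
    extern v2df _ZGVbN2vv_atan2 (v2df y, v2df x);

    static double
    ulp_error (double got, double y, double x)
    {
      long double ref = atan2l ((long double) y, (long double) x);
      /* Size of one ulp at the magnitude of the computed result.  */
      double ulp = nextafter (fabs (got), INFINITY) - fabs (got);
      return (double) (fabsl ((long double) got - ref) / ulp);
    }

    int
    main (void)
    {
      v2df y = { 0x1.bcab29da0e947p-54, 0x1.b836ed678be29p-588 };
      v2df x = { 0x1.bc41f4d2294b8p-54, 0x1.b7be6f5a03a8cp-588 };
      v2df r = _ZGVbN2vv_atan2 (y, x);

      for (int i = 0; i < 2; i++)
        printf ("atan2 (%a, %a) = %a  (%.5f ulps)\n",
                y[i], x[i], r[i], ulp_error (r[i], y[i], x[i]));
      return 0;
    }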
  

Comments

H.J. Lu Jan. 12, 2022, 10:23 p.m. UTC | #1
On Wed, Jan 12, 2022 at 12:43 PM Sunil K Pandey <skpgkp2@gmail.com> wrote:
>
> This patch fixes the accuracy of the SSE4.2 libmvec atan2 function for the
> following inputs, bringing the error below 4 ulps.
>
> {0x1.bcab29da0e947p-54,0x1.bc41f4d2294b8p-54}   4.19888 ulps
> {0x1.b836ed678be29p-588,0x1.b7be6f5a03a8cp-588} 4.09889 ulps
>
> This fixes BZ #28765.

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  

Patch
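
The patch below keeps the kernel's algorithm unchanged and only restructures its
evaluation: the old code split the degree-19 polynomial into four partial chains
evaluated in R8 = (R^2)^4, spilling two intermediate values to the stack, while
the new code evaluates it with a single Horner chain in R^2 (the "P = P*R2 + Axx"
comments) and keeps everything in registers; the Horner evaluation order is what
brings the error for the reported inputs back under 4 ulps. In scalar C terms
the main path corresponds roughly to the sketch below; the coefficient values
and the special-case handling of the auxiliary branch are omitted, and the
function and parameter names are illustrative.

    /* Rough scalar model of the vector kernel's main path (illustrative only;
       the function and parameter names are not from glibc).  A[1]..A[19]
       stand for the dA01..dA19 coefficients in __svml_datan2_data_internal;
       A00 == 1.0 is implicit.  Zero, NaN and out-of-range inputs take the
       auxiliary branch and are not modelled here.  */
    #include <math.h>

    static double
    atan2_main_path (double y, double x, const double A[20])
    {
      double ay = fabs (y), ax = fabs (x);

      /* Reduction:
           1) if |y| <  |x|:  a =  |y|, b = |x|, pio2 = 0
           2) if |y| >= |x|:  a = -|x|, b = |y|, pio2 = Pi/2  */
      int swap    = ay >= ax;
      double a    = swap ? -ax : ay;
      double b    = swap ? ay : ax;
      double pio2 = swap ? M_PI_2 : 0.0;

      double r  = a / b;                    /* |r| <= 1 */
      double r2 = r * r;

      /* Single Horner chain: P = ((A19*r2 + A18)*r2 + ...)*r2 + A01,
         then P = P*r2.  */
      double p = A[19];
      for (int i = 18; i >= 1; i--)
        p = p * r2 + A[i];
      p *= r2;

      /* Reconstruction: dP = (R + R*dP) + dPIO2, then the sign/Pi fixups.  */
      double res = (r + r * p) + pio2;
      res = copysign (res, x);              /* take the sign of x        */
      if (x < 0.0)
        res += M_PI;                        /* add Pi when x is negative */
      return copysign (res, y);             /* final sign comes from y   */
    }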

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
index 4983051323..138ff2ffa0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
@@ -65,7 +65,7 @@ 
 ENTRY(_ZGVbN2vv_atan2_sse4)
         subq      $88, %rsp
         cfi_def_cfa_offset(96)
-        movaps    %xmm0, %xmm8
+        movaps    %xmm1, %xmm11
 
 /*
  * #define NO_VECTOR_ZERO_ATAN2_ARGS
@@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
  * Cannot be replaced by VQRCP(D, dR0, dB);
  * Argument Absolute values
  */
-        movups    dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
+        movups    dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
+        movaps    %xmm0, %xmm10
         movaps    %xmm1, %xmm9
-        movaps    %xmm4, %xmm1
-        andps     %xmm8, %xmm4
-        andps     %xmm9, %xmm1
-        movaps    %xmm4, %xmm2
-        cmpnltpd  %xmm1, %xmm2
+        andps     %xmm10, %xmm1
+        andps     %xmm11, %xmm9
+        movaps    %xmm1, %xmm4
+        cmpnltpd  %xmm9, %xmm4
 
 /* Argument signs */
-        movups    dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
-        movaps    %xmm2, %xmm0
-        movups    dPIO2+__svml_datan2_data_internal(%rip), %xmm5
-        movaps    %xmm3, %xmm7
-        movaps    %xmm3, %xmm6
+        movups    dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
+        movaps    %xmm4, %xmm0
+        movaps    %xmm5, %xmm8
+        movaps    %xmm5, %xmm7
 
 /*
  * 1) If y<x then a= y, b=x, PIO2=0
  * 2) If y>x then a=-x, b=y, PIO2=Pi/2
  */
-        orps      %xmm1, %xmm3
-        movaps    %xmm2, %xmm10
-        andps     %xmm2, %xmm5
-        andnps    %xmm4, %xmm0
-        andps     %xmm2, %xmm3
-        andnps    %xmm1, %xmm10
-        andps     %xmm4, %xmm2
-        orps      %xmm3, %xmm0
-        orps      %xmm2, %xmm10
-        divpd     %xmm10, %xmm0
-        movq      iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11
-
-/* if x<0, dPI = Pi, else dPI =0 */
-        movaps    %xmm9, %xmm3
+        orps      %xmm9, %xmm5
+        andnps    %xmm1, %xmm0
+        andps     %xmm4, %xmm5
+        andps     %xmm11, %xmm8
+        movups    dPIO2+__svml_datan2_data_internal(%rip), %xmm6
+        orps      %xmm5, %xmm0
+        movaps    %xmm4, %xmm5
+        andps     %xmm4, %xmm6
+        andnps    %xmm9, %xmm5
+        andps     %xmm1, %xmm4
+        orps      %xmm4, %xmm5
+        andps     %xmm10, %xmm7
+        divpd     %xmm5, %xmm0
+        movq      iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
+        xorl      %edx, %edx
 
 /* Check if y and x are on main path. */
-        pshufd    $221, %xmm1, %xmm12
-        andps     %xmm9, %xmm7
-        psubd     %xmm11, %xmm12
-        andps     %xmm8, %xmm6
-        movq      iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
-        xorl      %edx, %edx
-        movups    %xmm4, 16(%rsp)
+        pshufd    $221, %xmm9, %xmm3
         xorl      %eax, %eax
-        pshufd    $221, %xmm4, %xmm14
-        movdqa    %xmm12, %xmm4
-        pcmpgtd   %xmm13, %xmm4
-        pcmpeqd   %xmm13, %xmm12
-        por       %xmm12, %xmm4
+        pshufd    $221, %xmm1, %xmm13
+        psubd     %xmm2, %xmm3
+        psubd     %xmm2, %xmm13
+        movdqa    %xmm3, %xmm4
+        movq      iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
+        movdqa    %xmm13, %xmm14
+        pcmpgtd   %xmm12, %xmm4
+        pcmpeqd   %xmm12, %xmm3
+        pcmpgtd   %xmm12, %xmm14
+        pcmpeqd   %xmm12, %xmm13
 
 /* Polynomial. */
         movaps    %xmm0, %xmm12
+        por       %xmm3, %xmm4
         mulpd     %xmm0, %xmm12
-        cmplepd   dZERO+__svml_datan2_data_internal(%rip), %xmm3
-        psubd     %xmm11, %xmm14
-        movdqa    %xmm14, %xmm15
-        pcmpeqd   %xmm13, %xmm14
-        pcmpgtd   %xmm13, %xmm15
-        por       %xmm14, %xmm15
-        movaps    %xmm12, %xmm14
-        mulpd     %xmm12, %xmm14
-        por       %xmm15, %xmm4
-        movaps    %xmm14, %xmm15
-        mulpd     %xmm14, %xmm15
-        movmskps  %xmm4, %ecx
-        movups    %xmm10, (%rsp)
-        movups    dA19+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd     %xmm15, %xmm10
-        movups    dA18+__svml_datan2_data_internal(%rip), %xmm13
-        movups    dA17+__svml_datan2_data_internal(%rip), %xmm11
-        addpd     dA15+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd     %xmm15, %xmm13
-        mulpd     %xmm15, %xmm11
-        mulpd     %xmm15, %xmm10
-        addpd     dA14+__svml_datan2_data_internal(%rip), %xmm13
-        addpd     dA13+__svml_datan2_data_internal(%rip), %xmm11
-        addpd     dA11+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd     %xmm15, %xmm13
-        mulpd     %xmm15, %xmm11
-        mulpd     %xmm15, %xmm10
-        addpd     dA10+__svml_datan2_data_internal(%rip), %xmm13
-        addpd     dA09+__svml_datan2_data_internal(%rip), %xmm11
-        addpd     dA07+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd     %xmm15, %xmm13
-        mulpd     %xmm15, %xmm11
-        mulpd     %xmm15, %xmm10
-        addpd     dA06+__svml_datan2_data_internal(%rip), %xmm13
-        addpd     dA05+__svml_datan2_data_internal(%rip), %xmm11
-        addpd     dA03+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd     %xmm15, %xmm13
-        mulpd     %xmm15, %xmm11
-        mulpd     %xmm12, %xmm10
-        addpd     dA02+__svml_datan2_data_internal(%rip), %xmm13
-        addpd     dA01+__svml_datan2_data_internal(%rip), %xmm11
-        addpd     %xmm10, %xmm13
-        mulpd     %xmm11, %xmm12
-        mulpd     %xmm13, %xmm14
-        movups    dA16+__svml_datan2_data_internal(%rip), %xmm2
-        mulpd     %xmm15, %xmm2
-        addpd     dA12+__svml_datan2_data_internal(%rip), %xmm2
-        mulpd     %xmm15, %xmm2
-        addpd     dA08+__svml_datan2_data_internal(%rip), %xmm2
-        mulpd     %xmm15, %xmm2
-        addpd     dA04+__svml_datan2_data_internal(%rip), %xmm2
-
-/* A00=1.0, account for it later  VQFMA(D, dP4, dP4, dR8, dA00); */
-        mulpd     %xmm2, %xmm15
-        addpd     %xmm12, %xmm15
-        addpd     %xmm14, %xmm15
+
+/* P = A19*R2 + A18 */
+        movups    dA19+__svml_datan2_data_internal(%rip), %xmm15
+        movaps    %xmm11, %xmm2
+        mulpd     %xmm12, %xmm15
+        addpd     dA18+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A17 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA17+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A16 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA16+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A15 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA15+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A14 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA14+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A13 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA13+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A12 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA12+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A11 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA11+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A10 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA10+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A09 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA09+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A08 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA08+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A07 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA07+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A06 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA06+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A05 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA05+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A04 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA04+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A03 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA03+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A02 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA02+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A01 */
+        mulpd     %xmm12, %xmm15
+        addpd     dA01+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 */
+        mulpd     %xmm15, %xmm12
 
 /*
  * Reconstruction.
  * dP=(R+R*dP) + dPIO2
  */
-        mulpd     %xmm0, %xmm15
-        addpd     %xmm15, %xmm0
-        addpd     %xmm5, %xmm0
-        andps     __svml_datan2_data_internal(%rip), %xmm3
+        mulpd     %xmm0, %xmm12
+        addpd     %xmm12, %xmm0
+
+/* if x<0, dPI = Pi, else dPI =0 */
+        movups    dZERO+__svml_datan2_data_internal(%rip), %xmm3
+        por       %xmm13, %xmm14
+        cmplepd   %xmm3, %xmm2
+        addpd     %xmm6, %xmm0
+        andps     __svml_datan2_data_internal(%rip), %xmm2
+        orps      %xmm8, %xmm0
+        addpd     %xmm2, %xmm0
+        por       %xmm14, %xmm4
         orps      %xmm7, %xmm0
-        addpd     %xmm3, %xmm0
+        movmskps  %xmm4, %ecx
 
 /*  Special branch for fast (vector) processing of zero arguments  */
-        movups    16(%rsp), %xmm11
-        orps      %xmm6, %xmm0
         testb     $3, %cl
 
 /* Go to auxilary branch */
         jne       L(AUX_BRANCH)
-                                # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
+                                # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
 
 /* Return from auxilary branch
  * for out of main path inputs
@@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):
 
 /* Go to special inputs processing branch */
         jne       L(SPECIAL_VALUES_BRANCH)
-                                # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+                                # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
 
 /* Restore registers
  * and exit the function
@@ -237,8 +264,8 @@ L(EXIT):
  */
 
 L(SPECIAL_VALUES_BRANCH):
-        movups    %xmm8, 32(%rsp)
-        movups    %xmm9, 48(%rsp)
+        movups    %xmm10, 32(%rsp)
+        movups    %xmm11, 48(%rsp)
         movups    %xmm0, 64(%rsp)
                                 # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
 
@@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
  */
 
 L(AUX_BRANCH):
-/* Check if at least on of Y or Y is zero: iAXAYZERO */
-        movups    dZERO+__svml_datan2_data_internal(%rip), %xmm2
-
 /* Check if both X & Y are not NaNs:  iXYnotNAN */
-        movaps    %xmm9, %xmm12
-        movaps    %xmm8, %xmm10
-        cmpordpd  %xmm9, %xmm12
-        cmpordpd  %xmm8, %xmm10
-        cmpeqpd   %xmm2, %xmm1
-        cmpeqpd   %xmm2, %xmm11
-        andps     %xmm10, %xmm12
-        orps      %xmm11, %xmm1
-        pshufd    $221, %xmm1, %xmm1
-        pshufd    $221, %xmm12, %xmm11
+        movaps    %xmm11, %xmm13
+        movaps    %xmm10, %xmm12
+        cmpordpd  %xmm11, %xmm13
+        cmpordpd  %xmm10, %xmm12
 
-/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
-        pand      %xmm11, %xmm1
-
-/* Exclude from previous callout mask zero (and not NaN) arguments */
-        movdqa    %xmm1, %xmm13
-        pandn     %xmm4, %xmm13
+/* Check if at least on of Y or Y is zero: iAXAYZERO */
+        cmpeqpd   %xmm3, %xmm9
+        cmpeqpd   %xmm3, %xmm1
 
 /*
  *  Path for zero arguments (at least one of both)
  * Check if both args are zeros (den. is zero)
  */
-        movups    (%rsp), %xmm4
-        cmpeqpd   %xmm2, %xmm4
+        cmpeqpd   %xmm3, %xmm5
+        andps     %xmm12, %xmm13
+        orps      %xmm1, %xmm9
+        pshufd    $221, %xmm9, %xmm1
+        pshufd    $221, %xmm13, %xmm9
 
-/* Go to callout */
-        movmskps  %xmm13, %edx
+/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
+        pand      %xmm9, %xmm1
+
+/* Exclude from previous callout mask zero (and not NaN) arguments */
+        movdqa    %xmm1, %xmm14
+        pandn     %xmm4, %xmm14
 
 /* Set sPIO2 to zero if den. is zero */
-        movaps    %xmm4, %xmm15
-        andps     %xmm2, %xmm4
-        andnps    %xmm5, %xmm15
-        andl      $3, %edx
-        orps      %xmm4, %xmm15
-        pshufd    $221, %xmm9, %xmm5
-        orps      %xmm7, %xmm15
+        movaps    %xmm5, %xmm4
+        andnps    %xmm6, %xmm4
+        andps     %xmm3, %xmm5
 
 /* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
-        pshufd    $221, %xmm2, %xmm7
-        pcmpgtd   %xmm5, %xmm7
-        pshufd    $80, %xmm7, %xmm14
-        andps     %xmm3, %xmm14
-        addpd     %xmm14, %xmm15
+        pshufd    $221, %xmm3, %xmm3
+        orps      %xmm5, %xmm4
+        pshufd    $221, %xmm11, %xmm5
+        orps      %xmm8, %xmm4
+        pcmpgtd   %xmm5, %xmm3
+        pshufd    $80, %xmm3, %xmm6
+        andps     %xmm2, %xmm6
+        addpd     %xmm6, %xmm4
+
+/* Go to callout */
+        movmskps  %xmm14, %edx
 
 /* Merge results from main and spec path */
-        pshufd    $80, %xmm1, %xmm3
-        orps      %xmm6, %xmm15
-        movdqa    %xmm3, %xmm6
-        andps     %xmm3, %xmm15
-        andnps    %xmm0, %xmm6
-        movaps    %xmm6, %xmm0
-        orps      %xmm15, %xmm0
+        pshufd    $80, %xmm1, %xmm2
+        orps      %xmm7, %xmm4
+        movdqa    %xmm2, %xmm7
+        andps     %xmm2, %xmm4
+        andnps    %xmm0, %xmm7
+        andl      $3, %edx
+        movaps    %xmm7, %xmm0
+        orps      %xmm4, %xmm0
 
 /* Return to main vector processing path */
         jmp       L(AUX_BRANCH_RETURN)
-                                # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+                                # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
 END(_ZGVbN2vv_atan2_sse4)
 
         .section .rodata, "a"