From patchwork Wed Jan 12 20:43:23 2022
X-Patchwork-Submitter: Sunil Pandey
X-Patchwork-Id: 49939
From: Sunil Pandey
Reply-To: Sunil K Pandey
To: libc-alpha@sourceware.org
Subject: [PATCH] x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]
Date: Wed, 12 Jan 2022 12:43:23 -0800
Message-Id: <20220112204323.3385056-1-skpgkp2@gmail.com>

This patch fixes SSE4.2 libmvec atan2 function accuracy for the
following inputs, bringing the error below 4 ulps:

  {0x1.bcab29da0e947p-54,0x1.bc41f4d2294b8p-54}    4.19888 ulps
  {0x1.b836ed678be29p-588,0x1.b7be6f5a03a8cp-588}  4.09889 ulps

This fixes BZ #28765.

Reviewed-by: H.J. Lu
---
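Notes (not part of the commit message):

The rewritten main path evaluates the polynomial with a plain Horner
recurrence (P = A19*R2 + A18, P = P*R2 + A17, ..., P = P*R2), as the added
comments spell out, instead of the previous scheme that accumulated three
partial polynomials over powers of R2/R4/R8 and combined them at the end.
The scalar C sketch below is an illustration of what the vector kernel
computes on its main path, not the implementation itself; the helper name
and the coefficient array A[] are hypothetical stand-ins for the dA01..dA19
constants kept in __svml_datan2_data_internal (dA00 is 1.0 and is folded
into the reconstruction step).

#include <math.h>

/* Scalar model of the vector kernel's main path (illustrative only).  */
static double
atan2_main_path_model (double y, double x, const double A[20])
{
  double ay = fabs (y), ax = fabs (x);

  /* 1) If |y| <  |x|: a = |y|,  b = |x|, PIO2 = 0
     2) If |y| >= |x|: a = -|x|, b = |y|, PIO2 = Pi/2  */
  double a = (ay < ax) ? ay : -ax;
  double b = (ay < ax) ? ax : ay;
  double pio2 = (ay < ax) ? 0.0 : M_PI_2;

  double r = a / b;
  double r2 = r * r;

  /* Horner recurrence matching the added comments:
     P = A19*R2 + A18; P = P*R2 + A17; ...; P = P*R2 + A01; P = P*R2.  */
  double p = A[19];
  for (int i = 18; i >= 1; i--)
    p = p * r2 + A[i];
  p *= r2;

  /* Reconstruction: dP = (R + R*dP) + dPIO2, then fold in the signs:
     reflect to Pi - base when x < 0, and apply the sign of y.  */
  double base = (r + r * p) + pio2;
  double res = (x < 0.0) ? (M_PI - base) : base;
  return copysign (res, y);
}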
 .../fpu/multiarch/svml_d_atan22_core_sse4.S   | 321 ++++++++++--------
 1 file changed, 173 insertions(+), 148 deletions(-)

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
index 4983051323..138ff2ffa0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan22_core_sse4.S
@@ -65,7 +65,7 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
 	subq	$88, %rsp
 	cfi_def_cfa_offset(96)
-	movaps	%xmm0, %xmm8
+	movaps	%xmm1, %xmm11

 /*
  * #define NO_VECTOR_ZERO_ATAN2_ARGS
@@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
  * Cannot be replaced by VQRCP(D, dR0, dB);
  * Argument Absolute values
  */
-	movups	dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
+	movups	dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
+	movaps	%xmm0, %xmm10
 	movaps	%xmm1, %xmm9
-	movaps	%xmm4, %xmm1
-	andps	%xmm8, %xmm4
-	andps	%xmm9, %xmm1
-	movaps	%xmm4, %xmm2
-	cmpnltpd %xmm1, %xmm2
+	andps	%xmm10, %xmm1
+	andps	%xmm11, %xmm9
+	movaps	%xmm1, %xmm4
+	cmpnltpd %xmm9, %xmm4

 /* Argument signs */
-	movups	dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
-	movaps	%xmm2, %xmm0
-	movups	dPIO2+__svml_datan2_data_internal(%rip), %xmm5
-	movaps	%xmm3, %xmm7
-	movaps	%xmm3, %xmm6
+	movups	dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
+	movaps	%xmm4, %xmm0
+	movaps	%xmm5, %xmm8
+	movaps	%xmm5, %xmm7

 /*
  * 1) If y<x then a=y, b=x, PIO2=0
  * 2) If y>x then a=-x, b=y, PIO2=Pi/2
  */
-	orps	%xmm1, %xmm3
-	movaps	%xmm2, %xmm10
-	andps	%xmm2, %xmm5
-	andnps	%xmm4, %xmm0
-	andps	%xmm2, %xmm3
-	andnps	%xmm1, %xmm10
-	andps	%xmm4, %xmm2
-	orps	%xmm3, %xmm0
-	orps	%xmm2, %xmm10
-	divpd	%xmm10, %xmm0
-	movq	iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11
-
-/* if x<0, dPI = Pi, else dPI =0 */
-	movaps	%xmm9, %xmm3
+	orps	%xmm9, %xmm5
+	andnps	%xmm1, %xmm0
+	andps	%xmm4, %xmm5
+	andps	%xmm11, %xmm8
+	movups	dPIO2+__svml_datan2_data_internal(%rip), %xmm6
+	orps	%xmm5, %xmm0
+	movaps	%xmm4, %xmm5
+	andps	%xmm4, %xmm6
+	andnps	%xmm9, %xmm5
+	andps	%xmm1, %xmm4
+	orps	%xmm4, %xmm5
+	andps	%xmm10, %xmm7
+	divpd	%xmm5, %xmm0
+	movq	iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
+	xorl	%edx, %edx

 /* Check if y and x are on main path. */
-	pshufd	$221, %xmm1, %xmm12
-	andps	%xmm9, %xmm7
-	psubd	%xmm11, %xmm12
-	andps	%xmm8, %xmm6
-	movq	iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
-	xorl	%edx, %edx
-	movups	%xmm4, 16(%rsp)
+	pshufd	$221, %xmm9, %xmm3
 	xorl	%eax, %eax
-	pshufd	$221, %xmm4, %xmm14
-	movdqa	%xmm12, %xmm4
-	pcmpgtd	%xmm13, %xmm4
-	pcmpeqd	%xmm13, %xmm12
-	por	%xmm12, %xmm4
+	pshufd	$221, %xmm1, %xmm13
+	psubd	%xmm2, %xmm3
+	psubd	%xmm2, %xmm13
+	movdqa	%xmm3, %xmm4
+	movq	iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
+	movdqa	%xmm13, %xmm14
+	pcmpgtd	%xmm12, %xmm4
+	pcmpeqd	%xmm12, %xmm3
+	pcmpgtd	%xmm12, %xmm14
+	pcmpeqd	%xmm12, %xmm13

 /* Polynomial. */
 	movaps	%xmm0, %xmm12
+	por	%xmm3, %xmm4
 	mulpd	%xmm0, %xmm12
-	cmplepd	dZERO+__svml_datan2_data_internal(%rip), %xmm3
-	psubd	%xmm11, %xmm14
-	movdqa	%xmm14, %xmm15
-	pcmpeqd	%xmm13, %xmm14
-	pcmpgtd	%xmm13, %xmm15
-	por	%xmm14, %xmm15
-	movaps	%xmm12, %xmm14
-	mulpd	%xmm12, %xmm14
-	por	%xmm15, %xmm4
-	movaps	%xmm14, %xmm15
-	mulpd	%xmm14, %xmm15
-	movmskps %xmm4, %ecx
-	movups	%xmm10, (%rsp)
-	movups	dA19+__svml_datan2_data_internal(%rip), %xmm10
-	mulpd	%xmm15, %xmm10
-	movups	dA18+__svml_datan2_data_internal(%rip), %xmm13
-	movups	dA17+__svml_datan2_data_internal(%rip), %xmm11
-	addpd	dA15+__svml_datan2_data_internal(%rip), %xmm10
-	mulpd	%xmm15, %xmm13
-	mulpd	%xmm15, %xmm11
-	mulpd	%xmm15, %xmm10
-	addpd	dA14+__svml_datan2_data_internal(%rip), %xmm13
-	addpd	dA13+__svml_datan2_data_internal(%rip), %xmm11
-	addpd	dA11+__svml_datan2_data_internal(%rip), %xmm10
-	mulpd	%xmm15, %xmm13
-	mulpd	%xmm15, %xmm11
-	mulpd	%xmm15, %xmm10
-	addpd	dA10+__svml_datan2_data_internal(%rip), %xmm13
-	addpd	dA09+__svml_datan2_data_internal(%rip), %xmm11
-	addpd	dA07+__svml_datan2_data_internal(%rip), %xmm10
-	mulpd	%xmm15, %xmm13
-	mulpd	%xmm15, %xmm11
-	mulpd	%xmm15, %xmm10
-	addpd	dA06+__svml_datan2_data_internal(%rip), %xmm13
-	addpd	dA05+__svml_datan2_data_internal(%rip), %xmm11
-	addpd	dA03+__svml_datan2_data_internal(%rip), %xmm10
-	mulpd	%xmm15, %xmm13
-	mulpd	%xmm15, %xmm11
-	mulpd	%xmm12, %xmm10
-	addpd	dA02+__svml_datan2_data_internal(%rip), %xmm13
-	addpd	dA01+__svml_datan2_data_internal(%rip), %xmm11
-	addpd	%xmm10, %xmm13
-	mulpd	%xmm11, %xmm12
-	mulpd	%xmm13, %xmm14
-	movups	dA16+__svml_datan2_data_internal(%rip), %xmm2
-	mulpd	%xmm15, %xmm2
-	addpd	dA12+__svml_datan2_data_internal(%rip), %xmm2
-	mulpd	%xmm15, %xmm2
-	addpd	dA08+__svml_datan2_data_internal(%rip), %xmm2
-	mulpd	%xmm15, %xmm2
-	addpd	dA04+__svml_datan2_data_internal(%rip), %xmm2
-
-/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
-	mulpd	%xmm2, %xmm15
-	addpd	%xmm12, %xmm15
-	addpd	%xmm14, %xmm15
+
+/* P = A19*R2 + A18 */
+	movups	dA19+__svml_datan2_data_internal(%rip), %xmm15
+	movaps	%xmm11, %xmm2
+	mulpd	%xmm12, %xmm15
+	addpd	dA18+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A17 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA17+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A16 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA16+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A15 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA15+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A14 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA14+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A13 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA13+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A12 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA12+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A11 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA11+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A10 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA10+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A09 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA09+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A08 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA08+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A07 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA07+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A06 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA06+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A05 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA05+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A04 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA04+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A03 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA03+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A02 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA02+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 + A01 */
+	mulpd	%xmm12, %xmm15
+	addpd	dA01+__svml_datan2_data_internal(%rip), %xmm15
+
+/* P = P*R2 */
+	mulpd	%xmm15, %xmm12

 /*
  * Reconstruction.
  * dP=(R+R*dP) + dPIO2
  */
-	mulpd	%xmm0, %xmm15
-	addpd	%xmm15, %xmm0
-	addpd	%xmm5, %xmm0
-	andps	__svml_datan2_data_internal(%rip), %xmm3
+	mulpd	%xmm0, %xmm12
+	addpd	%xmm12, %xmm0
+
+/* if x<0, dPI = Pi, else dPI =0 */
+	movups	dZERO+__svml_datan2_data_internal(%rip), %xmm3
+	por	%xmm13, %xmm14
+	cmplepd	%xmm3, %xmm2
+	addpd	%xmm6, %xmm0
+	andps	__svml_datan2_data_internal(%rip), %xmm2
+	orps	%xmm8, %xmm0
+	addpd	%xmm2, %xmm0
+	por	%xmm14, %xmm4
 	orps	%xmm7, %xmm0
-	addpd	%xmm3, %xmm0
+	movmskps %xmm4, %ecx

 /* Special branch for fast (vector) processing of zero arguments */
-	movups	16(%rsp), %xmm11
-	orps	%xmm6, %xmm0
 	testb	$3, %cl

 /* Go to auxilary branch */
 	jne	L(AUX_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
+	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11

 /* Return from auxilary branch
  * for out of main path inputs
@@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):

 /* Go to special inputs processing branch */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11

 /* Restore registers
  * and exit the function
@@ -237,8 +264,8 @@ L(EXIT):
  */

 L(SPECIAL_VALUES_BRANCH):
-	movups	%xmm8, 32(%rsp)
-	movups	%xmm9, 48(%rsp)
+	movups	%xmm10, 32(%rsp)
+	movups	%xmm11, 48(%rsp)
 	movups	%xmm0, 64(%rsp)
 	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
@@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
  */

 L(AUX_BRANCH):
-/* Check if at least on of Y or Y is zero: iAXAYZERO */
-	movups	dZERO+__svml_datan2_data_internal(%rip), %xmm2
-
 /* Check if both X & Y are not NaNs: iXYnotNAN */
-	movaps	%xmm9, %xmm12
-	movaps	%xmm8, %xmm10
-	cmpordpd %xmm9, %xmm12
-	cmpordpd %xmm8, %xmm10
-	cmpeqpd	%xmm2, %xmm1
-	cmpeqpd	%xmm2, %xmm11
-	andps	%xmm10, %xmm12
-	orps	%xmm11, %xmm1
-	pshufd	$221, %xmm1, %xmm1
-	pshufd	$221, %xmm12, %xmm11
+	movaps	%xmm11, %xmm13
+	movaps	%xmm10, %xmm12
+	cmpordpd %xmm11, %xmm13
+	cmpordpd %xmm10, %xmm12

-/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
-	pand	%xmm11, %xmm1
-
-/* Exclude from previous callout mask zero (and not NaN) arguments */
-	movdqa	%xmm1, %xmm13
-	pandn	%xmm4, %xmm13
+/* Check if at least on of Y or Y is zero: iAXAYZERO */
+	cmpeqpd	%xmm3, %xmm9
+	cmpeqpd	%xmm3, %xmm1

 /*
  * Path for zero arguments (at least one of both)
  * Check if both args are zeros (den. is zero)
  */
-	movups	(%rsp), %xmm4
-	cmpeqpd	%xmm2, %xmm4
+	cmpeqpd	%xmm3, %xmm5
+	andps	%xmm12, %xmm13
+	orps	%xmm1, %xmm9
+	pshufd	$221, %xmm9, %xmm1
+	pshufd	$221, %xmm13, %xmm9

-/* Go to callout */
-	movmskps %xmm13, %edx
+/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
+	pand	%xmm9, %xmm1
+
+/* Exclude from previous callout mask zero (and not NaN) arguments */
+	movdqa	%xmm1, %xmm14
+	pandn	%xmm4, %xmm14

 /* Set sPIO2 to zero if den. is zero */
-	movaps	%xmm4, %xmm15
-	andps	%xmm2, %xmm4
-	andnps	%xmm5, %xmm15
-	andl	$3, %edx
-	orps	%xmm4, %xmm15
-	pshufd	$221, %xmm9, %xmm5
-	orps	%xmm7, %xmm15
+	movaps	%xmm5, %xmm4
+	andnps	%xmm6, %xmm4
+	andps	%xmm3, %xmm5

 /* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
-	pshufd	$221, %xmm2, %xmm7
-	pcmpgtd	%xmm5, %xmm7
-	pshufd	$80, %xmm7, %xmm14
-	andps	%xmm3, %xmm14
-	addpd	%xmm14, %xmm15
+	pshufd	$221, %xmm3, %xmm3
+	orps	%xmm5, %xmm4
+	pshufd	$221, %xmm11, %xmm5
+	orps	%xmm8, %xmm4
+	pcmpgtd	%xmm5, %xmm3
+	pshufd	$80, %xmm3, %xmm6
+	andps	%xmm2, %xmm6
+	addpd	%xmm6, %xmm4
+
+/* Go to callout */
+	movmskps %xmm14, %edx

 /* Merge results from main and spec path */
-	pshufd	$80, %xmm1, %xmm3
-	orps	%xmm6, %xmm15
-	movdqa	%xmm3, %xmm6
-	andps	%xmm3, %xmm15
-	andnps	%xmm0, %xmm6
-	movaps	%xmm6, %xmm0
-	orps	%xmm15, %xmm0
+	pshufd	$80, %xmm1, %xmm2
+	orps	%xmm7, %xmm4
+	movdqa	%xmm2, %xmm7
+	andps	%xmm2, %xmm4
+	andnps	%xmm0, %xmm7
+	andl	$3, %edx
+	movaps	%xmm7, %xmm0
+	orps	%xmm4, %xmm0

 /* Return to main vector processing path */
 	jmp	L(AUX_BRANCH_RETURN)
-	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+	# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11

 END(_ZGVbN2vv_atan2_sse4)

 	.section .rodata, "a"