From patchwork Fri Nov 21 16:58:26 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Andrew Senkevich X-Patchwork-Id: 3835 Received: (qmail 3800 invoked by alias); 21 Nov 2014 16:59:09 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 3789 invoked by uid 89); 21 Nov 2014 16:59:08 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.5 required=5.0 tests=AWL, BAYES_00, FREEMAIL_FROM, RCVD_IN_DNSWL_LOW, SPF_PASS autolearn=ham version=3.3.2 X-HELO: mail-lb0-f170.google.com X-Received: by 10.112.170.99 with SMTP id al3mr6301000lbc.17.1416589136875; Fri, 21 Nov 2014 08:58:56 -0800 (PST) MIME-Version: 1.0 From: Andrew Senkevich Date: Fri, 21 Nov 2014 19:58:26 +0300 Message-ID: Subject: [PATCH 4/N] [x86_64] Vectorized math functions To: libc-alpha Hi, this patch adds vector implementations of cos in SSE4, AVX and AVX2 ISAs. ChangeLog 2014-11-21 Andrew Senkevich * sysdeps/x86_64/fpu/svml_d_cos2_core.S: New file. * sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S: New file. * sysdeps/x86_64/fpu/svml_d_cos2_core_avx2.S: New file. * sysdeps/x86_64/fpu/svml_d_cos_data.S: New file. Copy attached. --- WBR, Andrew diff --git a/sysdeps/x86_64/fpu/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/svml_d_cos2_core.S new file mode 100644 index 0000000..47288c2 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_cos2_core.S @@ -0,0 +1,210 @@ +/* Function cos vectorized with SSE4. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define _DATA_TABLE_OFFSETS_ONLY_ +#include "svml_d_cos_data.S" + + .text +ENTRY(_ZGVbN2v_cos) + +/* ALGORITHM DESCRIPTION: + * + * ( low accuracy ( < 4ulp ) or enhanced performance ( half of correct mantissa ) implementation ) + * + * Argument representation: + * arg + Pi/2 = (N*Pi + R) + * + * Result calculation: + * cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + * sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + movq %rsp, %rbp + andq $-64, %rsp + subq $320, %rsp + movaps %xmm0, %xmm3 + movq __svml_dcos_data@GOTPCREL(%rip), %rax + movups __dHalfPI(%rax), %xmm2 + +/* ARGUMENT RANGE REDUCTION: + * Add Pi/2 to argument: X' = X+Pi/2 + */ + addpd %xmm3, %xmm2 + movups __dInvPI(%rax), %xmm5 + movups __dAbsMask(%rax), %xmm4 + +/* Get absolute argument value: X' = |X'| */ + andps %xmm2, %xmm4 + +/* Y = X'*InvPi + RS : right shifter add */ + mulpd %xmm5, %xmm2 + +/* Check for large arguments path */ + cmpnlepd __dRangeVal(%rax), %xmm4 + movups __dRShifter(%rax), %xmm6 + addpd %xmm6, %xmm2 + movmskpd %xmm4, %ecx + +/* N = Y - RS : right shifter sub */ + movaps %xmm2, %xmm1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + psllq $63, %xmm2 + subpd %xmm6, %xmm1 + +/* N = N - 0.5 */ + subpd __dOneHalf(%rax), %xmm1 + movups __dPI1(%rax), %xmm7 + +/* R = X - N*Pi1 */ + mulpd %xmm1, %xmm7 + movups __dPI2(%rax), %xmm4 + +/* R = R - N*Pi2 */ + mulpd %xmm1, %xmm4 + subpd %xmm7, %xmm0 + movups __dPI3(%rax), %xmm5 + +/* R = R - N*Pi3 */ + mulpd %xmm1, %xmm5 + subpd %xmm4, %xmm0 + +/* R = R - N*Pi4 */ + movups __dPI4(%rax), %xmm6 + mulpd %xmm6, %xmm1 + subpd %xmm5, %xmm0 + subpd %xmm1, %xmm0 + +/* POLYNOMIAL APPROXIMATION: + * R2 = R*R + */ + movaps %xmm0, %xmm4 + mulpd %xmm0, %xmm4 + movups __dC7(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC6(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC5(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC4(%rax), %xmm1 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + mulpd %xmm4, %xmm1 + addpd __dC3(%rax), %xmm1 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + mulpd %xmm4, %xmm1 + addpd __dC2(%rax), %xmm1 + mulpd %xmm4, %xmm1 + addpd __dC1(%rax), %xmm1 + mulpd %xmm1, %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm4, %xmm0 + +/* RECONSTRUCTION: + * Final sign setting: Res = Poly^SignRes + */ + xorps %xmm2, %xmm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + movq %rbp, %rsp + popq %rbp + ret + +.LBL_1_3: + movups %xmm3, 192(%rsp) + movups %xmm0, 256(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + movups %xmm8, 112(%rsp) + movups %xmm9, 96(%rsp) + movups %xmm10, 80(%rsp) + movups %xmm11, 64(%rsp) + movups %xmm12, 48(%rsp) + movups %xmm13, 32(%rsp) + movups %xmm14, 16(%rsp) + movups %xmm15, (%rsp) + movq %rsi, 136(%rsp) + movq %rdi, 128(%rsp) + movq %r12, 168(%rsp) + movb %dl, %r12b + movq %r13, 160(%rsp) + movl %ecx, %r13d + movq %r14, 152(%rsp) + movl %eax, %r14d + movq %r15, 144(%rsp) + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + movups 112(%rsp), %xmm8 + movups 96(%rsp), %xmm9 + movups 80(%rsp), %xmm10 + movups 64(%rsp), %xmm11 + movups 48(%rsp), %xmm12 + movups 32(%rsp), %xmm13 + movups 16(%rsp), %xmm14 + movups (%rsp), %xmm15 + movq 136(%rsp), %rsi + movq 128(%rsp), %rdi + movq 168(%rsp), %r12 + movq 160(%rsp), %r13 + movq 152(%rsp), %r14 + movq 144(%rsp), %r15 + movups 256(%rsp), %xmm0 + jmp .LBL_1_2 + +.LBL_1_10: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 200(%rsp,%r15), %xmm0 + + call cos@PLT + + movsd %xmm0, 264(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + movsd 192(%rsp,%r15), %xmm0 + + call cos@PLT + + movsd %xmm0, 256(%rsp,%r15) + jmp .LBL_1_7 + +END(_ZGVbN2v_cos) diff --git a/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S new file mode 100644 index 0000000..24b4f75 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S @@ -0,0 +1,39 @@ +/* Function cos vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + + .text +ENTRY(_ZGVcN4v_cos) + pushq %rbp + movq %rsp, %rbp + andq $-32, %rsp + subq $32, %rsp + vextractf128 $1, %ymm0, (%rsp) + vzeroupper + call _ZGVbN2v_cos@PLT + vmovapd %xmm0, 16(%rsp) + vmovaps (%rsp), %xmm0 + call _ZGVbN2v_cos@PLT + vmovapd %xmm0, %xmm1 + vmovapd 16(%rsp), %xmm0 + vinsertf128 $1, %xmm1, %ymm0, %ymm0 + movq %rbp, %rsp + popq %rbp + ret +END(_ZGVcN4v_cos) diff --git a/sysdeps/x86_64/fpu/svml_d_cos4_core_avx2.S b/sysdeps/x86_64/fpu/svml_d_cos4_core_avx2.S new file mode 100644 index 0000000..95db6b3 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_cos4_core_avx2.S @@ -0,0 +1,195 @@ +/* Function cos vectorized with AVX2. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#define _DATA_TABLE_OFFSETS_ONLY_ +#include "svml_d_cos_data.S" + + .text +ENTRY(_ZGVdN4v_cos) + +/* ALGORITHM DESCRIPTION: + * + * ( low accuracy ( < 4ulp ) or enhanced performance + * ( half of correct mantissa ) implementation ) + * + * Argument representation: + * arg + Pi/2 = (N*Pi + R) + * + * Result calculation: + * cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R) + * sin(R) is approximated by corresponding polynomial + */ + pushq %rbp + movq %rsp, %rbp + andq $-64, %rsp + subq $448, %rsp + movq __svml_dcos_data@GOTPCREL(%rip), %rax + vmovapd %ymm0, %ymm1 + vmovupd __dInvPI(%rax), %ymm4 + vmovupd __dRShifter(%rax), %ymm5 + +/* + * ARGUMENT RANGE REDUCTION: + * Add Pi/2 to argument: X' = X+Pi/2 + */ + vaddpd __dHalfPI(%rax), %ymm1, %ymm7 + +/* Get absolute argument value: X' = |X'| */ + vandpd __dAbsMask(%rax), %ymm7, %ymm2 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %ymm5, %ymm4, %ymm7 + vmovupd __dC7(%rax), %ymm4 + +/* Check for large arguments path */ + vcmpnle_uqpd __dRangeVal(%rax), %ymm2, %ymm3 + +/* N = Y - RS : right shifter sub */ + vsubpd %ymm5, %ymm7, %ymm6 + vmovupd __dPI1_FMA(%rax), %ymm2 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %ymm7, %ymm7 + +/* N = N - 0.5 */ + vsubpd __dOneHalf(%rax), %ymm6, %ymm0 + vmovmskpd %ymm3, %ecx + +/* R = X - N*Pi1 */ + vmovapd %ymm1, %ymm3 + vfnmadd231pd %ymm0, %ymm2, %ymm3 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %ymm0, %ymm3 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %ymm3, %ymm0 + +/* + * POLYNOMIAL APPROXIMATION: + * R2 = R*R + */ + vmulpd %ymm0, %ymm0, %ymm5 + vfmadd213pd __dC6(%rax), %ymm5, %ymm4 + vfmadd213pd __dC5(%rax), %ymm5, %ymm4 + vfmadd213pd __dC4(%rax), %ymm5, %ymm4 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3(%rax), %ymm5, %ymm4 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + vfmadd213pd __dC2(%rax), %ymm5, %ymm4 + vfmadd213pd __dC1(%rax), %ymm5, %ymm4 + vmulpd %ymm5, %ymm4, %ymm6 + vfmadd213pd %ymm0, %ymm0, %ymm6 + +/* + * RECONSTRUCTION: + * Final sign setting: Res = Poly^SignRes + */ + vxorpd %ymm7, %ymm6, %ymm0 + testl %ecx, %ecx + jne .LBL_1_3 + +.LBL_1_2: + movq %rbp, %rsp + popq %rbp + ret + +.LBL_1_3: + vmovupd %ymm1, 320(%rsp) + vmovupd %ymm0, 384(%rsp) + je .LBL_1_2 + + xorb %dl, %dl + xorl %eax, %eax + vmovups %ymm8, 224(%rsp) + vmovups %ymm9, 192(%rsp) + vmovups %ymm10, 160(%rsp) + vmovups %ymm11, 128(%rsp) + vmovups %ymm12, 96(%rsp) + vmovups %ymm13, 64(%rsp) + vmovups %ymm14, 32(%rsp) + vmovups %ymm15, (%rsp) + movq %rsi, 264(%rsp) + movq %rdi, 256(%rsp) + movq %r12, 296(%rsp) + movb %dl, %r12b + movq %r13, 288(%rsp) + movl %ecx, %r13d + movq %r14, 280(%rsp) + movl %eax, %r14d + movq %r15, 272(%rsp) + +.LBL_1_6: + btl %r14d, %r13d + jc .LBL_1_12 + +.LBL_1_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_1_10 + +.LBL_1_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_1_6 + + vmovups 224(%rsp), %ymm8 + vmovups 192(%rsp), %ymm9 + vmovups 160(%rsp), %ymm10 + vmovups 128(%rsp), %ymm11 + vmovups 96(%rsp), %ymm12 + vmovups 64(%rsp), %ymm13 + vmovups 32(%rsp), %ymm14 + vmovups (%rsp), %ymm15 + vmovupd 384(%rsp), %ymm0 + movq 264(%rsp), %rsi + movq 256(%rsp), %rdi + movq 296(%rsp), %r12 + movq 288(%rsp), %r13 + movq 280(%rsp), %r14 + movq 272(%rsp), %r15 + jmp .LBL_1_2 + +.LBL_1_10: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 328(%rsp,%r15), %xmm0 + vzeroupper + + call cos@PLT + + vmovsd %xmm0, 392(%rsp,%r15) + jmp .LBL_1_8 + +.LBL_1_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 320(%rsp,%r15), %xmm0 + vzeroupper + + call cos@PLT + + vmovsd %xmm0, 384(%rsp,%r15) + jmp .LBL_1_7 + +END(_ZGVdN4v_cos) diff --git a/sysdeps/x86_64/fpu/svml_d_cos_data.S b/sysdeps/x86_64/fpu/svml_d_cos_data.S new file mode 100644 index 0000000..4e9f36b --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_cos_data.S @@ -0,0 +1,147 @@ +/* Data for vectorized cos. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef D_COS_DATA +#define D_COS_DATA + +/* Offsets for data table + */ +#define __dAbsMask 0 +#define __dRangeVal 64 +#define __dHalfPI 128 +#define __dInvPI 192 +#define __dRShifter 256 +#define __dOneHalf 320 +#define __dPI1 384 +#define __dPI2 448 +#define __dPI3 512 +#define __dPI4 576 +#define __dPI1_FMA 640 +#define __dPI2_FMA 704 +#define __dPI3_FMA 768 +#define __dC1 832 +#define __dC2 896 +#define __dC3 960 +#define __dC4 1024 +#define __dC5 1088 +#define __dC6 1152 +#define __dC7 1216 +#define __dAbsMask_la 1280 +#define __dInvPI_la 1344 +#define __dRShifter_la 1408 +#define __dRShifterm5_la 1472 +#define __dRXmax_la 1536 + +#ifndef _DATA_TABLE_OFFSETS_ONLY_ + +.macro double_vector offset value +.if .-__svml_dcos_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function cos. + * The table may contain polynomial, reduction, lookup + * coefficients and other constants obtained through different + * methods of research and experimental work. + */ + .globl __svml_dcos_data +__svml_dcos_data: + +/* General purpose constants: + * absolute value mask + */ +double_vector __dAbsMask 0x7fffffffffffffff + +/* working range threshold */ +double_vector __dRangeVal 0x4160000000000000 + +/* PI/2 */ +double_vector __dHalfPI 0x3ff921fb54442d18 + +/* 1/PI */ +double_vector __dInvPI 0x3fd45f306dc9c883 + +/* right-shifter constant */ +double_vector __dRShifter 0x4338000000000000 + +/* 0.5 */ +double_vector __dOneHalf 0x3fe0000000000000 + +/* Range reduction PI-based constants: + * PI high part + */ +double_vector __dPI1 0x400921fb40000000 + +/* PI mid part 1 */ +double_vector __dPI2 0x3e84442d00000000 + +/* PI mid part 2 */ +double_vector __dPI3 0x3d08469880000000 + +/* PI low part */ +double_vector __dPI4 0x3b88cc51701b839a + +/* Range reduction PI-based constants if FMA available: + * PI high part (FMA available) + */ +double_vector __dPI1_FMA 0x400921fb54442d18 + +/* PI mid part (FMA available) */ +double_vector __dPI2_FMA 0x3ca1a62633145c06 + +/* PI low part (FMA available) */ +double_vector __dPI3_FMA 0x395c1cd129024e09 + +/* Polynomial coefficients (relative error 2^(-52.115)): */ +double_vector __dC1 0xbfc55555555554a7 +double_vector __dC2 0x3f8111111110a4a8 +double_vector __dC3 0xbf2a01a019a5b86d +double_vector __dC4 0x3ec71de38030fea0 +double_vector __dC5 0xbe5ae63546002231 +double_vector __dC6 0x3de60e6857a2f220 +double_vector __dC7 0xbd69f0d60811aac8 + +/* + * Additional constants: + * absolute value mask + */ +double_vector __dAbsMask_la 0x7fffffffffffffff + +/* 1/PI */ +double_vector __dInvPI_la 0x3fd45f306dc9c883 + +/* right-shifer for low accuracy version */ +double_vector __dRShifter_la 0x4330000000000000 + +/* right-shifer-1.0 for low accuracy version */ +double_vector __dRShifterm5_la 0x432fffffffffffff + +/* right-shifer with low mask for low accuracy version */ +double_vector __dRXmax_la 0x43300000007ffffe + + .type __svml_dcos_data,@object + .size __svml_dcos_data,.-__svml_dcos_data +#endif +#endif