From patchwork Tue Jun 25 17:53:01 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Shawn Landden X-Patchwork-Id: 33396 X-Patchwork-Delegate: tuliom@linux.vnet.ibm.com Received: (qmail 125330 invoked by alias); 25 Jun 2019 17:53:27 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 125320 invoked by uid 89); 25 Jun 2019 17:53:26 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-26.9 required=5.0 tests=AWL, BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_SHORT, SPF_HELO_PASS, SPF_PASS autolearn=ham version=3.3.1 spammy=1500, Noted, decrease, filled X-HELO: git.icu From: Shawn Landden To: libc-alpha@sourceware.org Cc: Tulio Magno Quites Machado Filho , Joseph Myers , Shawn Landden Subject: [PATCH 1/2] PPC64: Add libmvec SIMD single-precision power function. Date: Tue, 25 Jun 2019 13:53:01 -0400 Message-Id: <20190625175302.26676-1-shawn@git.icu> MIME-Version: 1.0 Based off the ./sysdeps/ieee754/flt-32/powf.c implementation, and thus provides identical results. Unlike other libmvec functions, this sets the underflow and overflow bits. The caller can check these flags, and possibly re-run the calculations with scalar powf to figure out what is causing the overflow or underflow. I may have not normalized the data for benchmarking this properly, but operating only on floats between 0.5 and 1 I get the following: Running 20 times over 32MiB vector: mean 307.659767 (sd 0.203217) scalar: mean 221.837088 (sd 0.032256) And with random data there is a decrease in performance: vector: mean 265.366371 (sd 0.000626) scalar: mean 279.598078 (sd 0.025592) 2019-05-11 Shawn Landden [BZ #24210] * NEWS: Noted the addition of PPC64 vector powf function. * sysdeps/powerpc/bits/math-vector.h: Added entry for vector powf. * sysdeps/powerpc/powerpc64/fpu/Versions: Added vector powf entry. * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile: (libmvec-sysdep_routines, CFLAGS-vec_s_powf4_vsx.c): (CFLAGS-s_powf_log2_data.c): Added build of VSX SIMD powf function and tests. * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c: expanded to include all floating point exceptions. * sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c: Added entry for vector powf. * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_powf4_vsx.c: New file. * sysdeps/powerpc/powerpc64/fpu/multiarch/s_powf_log2_data.c: Likewise. * sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: SIMD powf added. --- NEWS | 1 + sysdeps/powerpc/bits/math-vector.h | 2 + sysdeps/powerpc/powerpc64/fpu/Versions | 2 +- .../powerpc/powerpc64/fpu/multiarch/Makefile | 4 +- .../fpu/multiarch/s_powf_log2_data.c | 45 +++ .../fpu/multiarch/test-float-vlen4-wrappers.c | 1 + .../powerpc64/fpu/multiarch/vec_math_errf.c | 13 + .../powerpc64/fpu/multiarch/vec_s_powf4_vsx.c | 311 ++++++++++++++++++ .../linux/powerpc/powerpc64/libmvec.abilist | 1 + 9 files changed, 378 insertions(+), 2 deletions(-) create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/s_powf_log2_data.c create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_powf4_vsx.c diff --git a/NEWS b/NEWS index a33af8c316..e8910d6224 100644 --- a/NEWS +++ b/NEWS @@ -22,6 +22,7 @@ Major new features: - single-precision sincos: sincosf - double-precision exp: exp - single-precision exp: expf + - single-precision pow: powf GCC support for auto-vectorization of functions on PPC64 is not yet available. Until that is done, the new vector math functions are diff --git a/sysdeps/powerpc/bits/math-vector.h b/sysdeps/powerpc/bits/math-vector.h index e7846b4bfa..5709efbae0 100644 --- a/sysdeps/powerpc/bits/math-vector.h +++ b/sysdeps/powerpc/bits/math-vector.h @@ -48,6 +48,8 @@ # define __DECL_SIMD_expf __DECL_SIMD_PPC64 # undef __DECL_SIMD_exp # define __DECL_SIMD_exp __DECL_SIMD_PPC64 +# undef __DECL_SIMD_powf +# define __DECL_SIMD_powf __DECL_SIMD_PPC64 # endif #endif diff --git a/sysdeps/powerpc/powerpc64/fpu/Versions b/sysdeps/powerpc/powerpc64/fpu/Versions index 361edee9e7..225f7c8475 100644 --- a/sysdeps/powerpc/powerpc64/fpu/Versions +++ b/sysdeps/powerpc/powerpc64/fpu/Versions @@ -2,6 +2,6 @@ libmvec { GLIBC_2.30 { _ZGVbN2v_cos; _ZGVbN4v_cosf; _ZGVbN2v_sin; _ZGVbN4v_sinf; _ZGVbN2v_log; _ZGVbN4v_logf; _ZGVbN2vvv_sincos; _ZGVbN4vvv_sincosf; - _ZGVbN4v_expf; _ZGVbN2v_exp; + _ZGVbN4v_expf; _ZGVbN2v_exp; _ZGVbN4vv_powf; } } diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile index dd982bb068..25d29b9a4a 100644 --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile @@ -49,6 +49,7 @@ libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx \ vec_d_log2_vsx vec_d_log_data \ vec_s_logf4_vsx vec_s_logf_data \ vec_s_expf4_vsx vec_s_exp2f_data \ + vec_s_powf4_vsx s_powf_log2_data \ vec_math_errf vec_math_err \ vec_d_exp2_vsx vec_d_exp_data \ vec_d_sincos2_vsx vec_s_sincosf4_vsx @@ -67,6 +68,7 @@ CFLAGS-vec_s_expf4_vsx.c += -mabi=altivec -maltivec -mvsx CFLAGS-vec_s_exp2f_data.c += -mabi=altivec -maltivec -mvsx CFLAGS-vec_d_exp2_vsx.c += -mabi=altivec -maltivec -mvsx -mpower8-vector CFLAGS-vec_d_exp_data.c += -mabi=altivec -maltivec -mvsx +CFLAGS-vec_s_powf4_vsx.c += -mabi=altivec -maltivec -mvsx CFLAGS-vec_math_err.c += -mabi=altivec -maltivec -mvsx endif @@ -76,7 +78,7 @@ ifeq ($(build-mathvec),yes) libmvec-tests += double-vlen2 float-vlen4 double-vlen2-funcs = cos sin sincos log exp -float-vlen4-funcs = cos sin sincos log exp +float-vlen4-funcs = cos sin sincos log exp pow double-vlen2-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX float-vlen4-arch-ext-cflags = -mabi=altivec -maltivec -mvsx -DREQUIRE_VSX diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_powf_log2_data.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_powf_log2_data.c new file mode 100644 index 0000000000..542cde0fb1 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_powf_log2_data.c @@ -0,0 +1,45 @@ +/* Data definition for powf. + Copyright (C) 2017-2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ +/* This file is identical to sysdeps/ieee754/flt-32/e_powf_log2_data.c */ +#include "math_config_flt.h" + +const struct powf_log2_data __powf_log2_data = { + .tab = { + { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * POWF_SCALE }, + { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * POWF_SCALE }, + { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * POWF_SCALE }, + { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * POWF_SCALE }, + { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * POWF_SCALE }, + { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * POWF_SCALE }, + { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * POWF_SCALE }, + { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * POWF_SCALE }, + { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * POWF_SCALE }, + { 0x1p+0, 0x0p+0 * POWF_SCALE }, + { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * POWF_SCALE }, + { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * POWF_SCALE }, + { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * POWF_SCALE }, + { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * POWF_SCALE }, + { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * POWF_SCALE }, + { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * POWF_SCALE }, + }, + .poly = { + 0x1.27616c9496e0bp-2 * POWF_SCALE, -0x1.71969a075c67ap-2 * POWF_SCALE, + 0x1.ec70a6ca7baddp-2 * POWF_SCALE, -0x1.7154748bef6c8p-1 * POWF_SCALE, + 0x1.71547652ab82bp0 * POWF_SCALE, + } +}; diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c index 37886d1a45..d98c11651e 100644 --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c @@ -25,4 +25,5 @@ VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVbN4v_cosf) VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf) VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf) VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf) +VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf) VECTOR_WRAPPER_fFF (WRAPPER_NAME (sincosf), _ZGVbN4vvv_sincosf) diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c index 1d145ffde1..3d655d7561 100644 --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_math_errf.c @@ -36,3 +36,16 @@ __math_oflowf (uint32_t sign) { return xflowf (sign, 0x1p97f); } + +attribute_hidden float +__math_invalidf (float x) +{ + return (x - x) / (x - x); +} + +attribute_hidden float +__math_divzerof (uint32_t sign) +{ + return (float)(sign ? -1.0f : 1.0f) / 0.0f; +} + diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_powf4_vsx.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_powf4_vsx.c new file mode 100644 index 0000000000..1ed9e1b450 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_powf4_vsx.c @@ -0,0 +1,311 @@ +/* Single-precision vector pow function. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include "math_config_flt.h" + +typedef vector long long unsigned v64u; +/* +POWF_LOG2_POLY_ORDER = 5 +EXP2F_TABLE_BITS = 5 + +ULP error: 0.82 (~ 0.5 + relerr*2^24) +relerr: 1.27 * 2^-26 (Relative error ~= 128*Ln2*relerr_log2 + relerr_exp2) +relerr_log2: 1.83 * 2^-33 (Relative error of logx.) +relerr_exp2: 1.69 * 2^-34 (Relative error of exp2(ylogx).) +*/ + +#define N (1 << POWF_LOG2_TABLE_BITS) +#define T __powf_log2_data.tab +#define A __powf_log2_data.poly +#define OFF 0x3f330000 + + +/* Subnormal input is normalized so ix has negative biased exponent. + Output is multiplied by N (POWF_SCALE) if TOINT_INTRINICS is set. */ +struct two_v_doubles { + vector double l; + vector double r; +}; + +static inline struct two_v_doubles log2_inline(vector unsigned ix) +{ + vector float z; + vector double rl, rr, r2l, r2r, r4l, r4r, pl, pr, ql, qr, yl, yr, y0l, y0r; + vector unsigned iz, top, tmp; + vector signed k, i; + struct two_v_doubles ret; + + /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (vector signed)(tmp >> (23 - POWF_LOG2_TABLE_BITS)) % N; + top = tmp & 0xff800000; + iz = ix - top; + k = (vector signed)top >> (23 - POWF_SCALE_BITS); /* arithmetic shift */ + vector double invcl = {T[i[0]].invc, T[i[1]].invc}; + vector double invcr = {T[i[2]].invc, T[i[3]].invc}; + vector double logcl = {T[i[0]].logc, T[i[1]].logc}; + vector double logcr = {T[i[2]].logc, T[i[3]].logc}; + z = vasfloat(iz); + vector double zl = {z[0], z[1]}; + vector double zr = {z[2], z[3]}; + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ + rl = zl * invcl - 1; + rr = zr * invcr - 1; + vector double kl = {(double)k[0], (double)k[1]}; + vector double kr = {(double)k[2], (double)k[3]}; + y0l = logcl + kl; + y0r = logcr + kr; + + /* Pipelined polynomial evaluation to approximate log1p(r)/ln2. */ + r2l = rl * rl; + r2r = rr * rr; + yl = A[0] * rl + A[1]; + yr = A[0] * rr + A[1]; + pl = A[2] * rl + A[3]; + pr = A[2] * rr + A[3]; + r4l = r2l * r2l; + r4r = r2r * r2r; + ql = A[4] * rl + y0l; + qr = A[4] * rr + y0r; + ql = pl * r2l + ql; + qr = pr * r2r + qr; + yl = yl * r4l + ql; + yr = yr * r4r + qr; + ret.l = yl; + ret.r = yr; + return ret; +} + +#undef N +#undef T +#define N (1 << EXP2F_TABLE_BITS) +#define T __exp2f_data.tab +#define SIGN_BIAS (1 << (EXP2F_TABLE_BITS + 11)) + +/* The output of log2 and thus the input of exp2 is either scaled by N + (in case of fast toint intrinsics) or not. The unscaled xd must be + in [-1021,1023], sign_bias sets the sign of the result. */ +static inline vector float exp2_inline (vector double xdl, vector double xdr, vector unsigned sign_bias) +{ + v64u kil, kir, skil, skir, sign_biasl, sign_biasr; + vector double kdl, kdr, zl, zr, rl, rr, r2l, r2r, yl, yr, sl, sr; + + vector unsigned zero = {0, 0, 0, 0}; + // TODO check if these need to be reversed on big-endian + sign_biasl = (v64u)vec_mergeh (sign_bias, zero); + sign_biasr = (v64u)vec_mergel (sign_bias, zero); +#define C __exp2f_data.poly +#define SHIFT __exp2f_data.shift_scaled + /* x = k/N + r with r in [-1/(2N), 1/(2N)] */ + kdl = xdl + SHIFT; + kdr = xdr + SHIFT; + kil = vasuint64 (kdl); + kir = vasuint64 (kdr); + kdl -= SHIFT; + kdr -= SHIFT; /* k/N */ + rl = xdl - kdl; + rr = xdr - kdr; + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ + v64u tl = {T[kil[0] % N], T[kil[1] % N]}; + v64u tr = {T[kir[0] % N], T[kir[1] % N]}; + skil = kil + sign_biasl; + skir = kir + sign_biasr; + tl += skil << (52 - EXP2F_TABLE_BITS); + tr += skir << (52 - EXP2F_TABLE_BITS); + sl = vasdouble(tl); + sr = vasdouble(tr); + zl = C[0] * rl + C[1]; + zr = C[0] * rr + C[1]; + r2l = rl * rl; + r2r = rr * rr; + yl = C[2] * rl + 1; + yr = C[2] * rr + 1; + yl = zl * r2l + yl; + yr = zr * r2r + yr; + yl = yl * sl; + yr = yr * sr; + /* There is no vector pack/unpack for 64<->32 */ + vector float res = {(float)yl[0], (float)yl[1], (float)yr[0], (float)yr[1]}; + return res; +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. */ +static inline vector unsigned checkint(vector unsigned iy) +{ + vector unsigned e = iy >> 23 & 0xff; + vector unsigned zero = {0, 0, 0, 0}; + vector unsigned not_matched = ~zero; + vector unsigned res; + vector unsigned is_first = (vector unsigned)vec_cmplt (e, zero + 0x7f); + not_matched &= ~is_first; + res = zero; + vector unsigned is_second = (vector unsigned)vec_cmpgt (e, zero + 0x7f + 23); + not_matched &= ~is_second; + res = vec_sel (res, zero + 2, is_second); + vector unsigned is_third = (vector unsigned)vec_cmpne (iy & ((1 << (0x7f + 23 - e)) - 1), zero); + res = vec_sel (res, zero, is_third & not_matched); + not_matched &= ~is_third; + vector unsigned is_four = (vector unsigned)vec_cmpne (iy & (1 << (0x7f + 23 - e)), zero); + res = vec_sel (res, zero + 1, is_four & not_matched); + not_matched &= ~is_four; + res = vec_sel (res, zero + 2, not_matched); + return res; +} + +static +vector unsigned +zeroinfnan(vector unsigned ix) +{ + vector unsigned zero = {0, 0, 0, 0}; + return (vector unsigned)vec_cmpge (2 * ix - 1, zero + (2u * 0x7f800000 - 1)); +} + +vector float +_ZGVbN4vv_powf(vector float x, vector float y) +{ + vector unsigned zero = {0, 0, 0, 0}; + vector unsigned sign_bias = zero; + vector unsigned ix, iy, res = zero, res_mask = zero; + + ix = vasuint (x); + iy = vasuint (y); + vector unsigned special_cst = {0x7f800000 - 0x00800000, 0x7f800000 - 0x00800000, + 0x7f800000 - 0x00800000, 0x7f800000 - 0x00800000}; + vector unsigned is_special = (vector unsigned) vec_cmpge (ix - 0x00800000, special_cst); + vector unsigned is_zeroinfnanx = zeroinfnan(iy); + if (__glibc_unlikely (!vec_all_eq (is_special | is_zeroinfnanx, zero))) + { + if (!vec_all_eq(is_zeroinfnanx, zero)) + { + vector unsigned not_covered = is_zeroinfnanx; + res_mask = is_zeroinfnanx; + vector unsigned is_one = (vector unsigned)vec_cmpeq (2 * iy, zero); + vector float one = {1.0f, 1.0f, 1.0f, 1.0f}; + vector unsigned is_two = (vector unsigned)vec_cmpeq (ix, zero + 0x3f800000); + res = vec_sel (res, vasuint (one), (is_one | is_two) & not_covered); + not_covered &= ~(is_one | is_two); + vector unsigned is_threea = (vector unsigned)vec_cmpgt (2 * ix, zero + 2u * 0x7f800000); + vector unsigned is_threeb = (vector unsigned)vec_cmpgt (2 * iy, zero + 2u * 0x7f800000); + vector float xy = x + y; + res = vec_sel (res, vasuint (xy), (is_threea | is_threeb) & not_covered); + not_covered &= ~(is_threea | is_threeb); + vector unsigned is_four = (vector unsigned)vec_cmpeq (2 * ix, zero + 2 * 0x3f800000); + res = vec_sel (res, vasuint (one), is_four & not_covered); + not_covered &= ~is_four; + vector unsigned is_fivea = (vector unsigned)vec_cmplt (2 * ix, zero + 2 * 0x3f800000); + vector unsigned is_fiveb = (vector unsigned)vec_cmplt (iy, zero + 0x80000000); + vector unsigned is_five = (vector unsigned)vec_cmpeq (is_fivea, is_fiveb); + res = vec_sel (res, zero, is_five & not_covered); + not_covered &= ~is_five; + vector float yy = y * y; + res = vec_sel (res, vasuint (yy), not_covered); + } + vector unsigned is_ix = (vector unsigned) vec_cmpge (ix, zero + 0x80000000); + vector unsigned is_xinfnan = zeroinfnan(ix); + if (!vec_all_eq (is_xinfnan & ~res_mask, zero)) + { + vector float x2 = x * x; + vector unsigned is_checkinty = (vector unsigned) vec_cmpeq (checkint(iy), zero + 1); + if (!vec_all_eq (is_ix & is_checkinty, zero)) + x2 = vec_sel (x2, -x2, is_ix & is_checkinty); + vector unsigned is_iy = (vector unsigned) vec_cmpge (iy, zero + 0x80000000); + if (!vec_all_eq (is_iy, zero)) + { + math_force_eval (1 / x2); + x2 = vec_sel (x2, 1 / x2, is_iy); + } + res = vec_sel (res, vasuint(x2), is_xinfnan); + res_mask |= is_xinfnan; + } + vector unsigned is_xneg = (vector unsigned) vec_cmpgt (ix, zero + 0x80000000); + if (!vec_all_eq(is_xneg, zero)) + { + vector unsigned yint = checkint (iy); + vector unsigned is_invalid = (vector unsigned) vec_cmpeq (yint, zero); + if (!vec_all_eq(is_invalid & ~res_mask, zero)) + { + for (int m=0;m<4;m++) + { + if ((is_invalid & ~res_mask)[m] == 0) + continue; + res[m] = asuint (__math_invalidf (x[m])); + res_mask[m] = 0xffffffff; + } + } + vector unsigned is_one = (vector unsigned)vec_cmpeq (yint, zero + 1); + sign_bias = vec_sel (sign_bias, zero + SIGN_BIAS, is_one); + ix = vec_sel (ix, ix & 0x7fffffff, is_xneg); + } + vector unsigned is_subnormal = (vector unsigned) vec_cmplt (ix, zero + 0x00800000); + if (!vec_all_eq (is_subnormal & ~res_mask, zero)) + { + vector unsigned subnormals = ix; + subnormals = vasuint (vasfloat(ix) * 0x1p23f); + subnormals = subnormals & (unsigned)0x7ffffffff; + subnormals -= 23 << 23; + ix = vec_sel (ix, subnormals, is_subnormal & ~res_mask); + } + /* If we already filled in all the results, we can return early. */ + if (vec_all_eq(res_mask, ~zero)) + return vasfloat(res); + } + struct two_v_doubles logx = log2_inline(ix); + + vector double yl = {y[0], y[1]}; + vector double yr = {y[2], y[3]}; + vector double ylogxl = yl * logx.l; + vector double ylogxr = yr * logx.r; + v64u overunderflow_const = {asuint64(126.0 * POWF_SCALE) >> 47, + asuint64(126.0 * POWF_SCALE) >> 47}; + v64u is_overunderflowl = (v64u) vec_cmpge + (((vasuint64 (ylogxl) >> 47) & 0xffff), overunderflow_const); + v64u is_overunderflowr = (v64u) vec_cmpge + (((vasuint64(ylogxr) >> 47) & 0xffff), overunderflow_const); + vector unsigned is_overunderflow = vec_pack (is_overunderflowl, is_overunderflowr); + if (__glibc_unlikely (!vec_all_eq (is_overunderflow, zero))) + { + vector double ylogx = ylogxl; + for (int m=0;m<4;m++) + { + if (is_overunderflow[m] == 0) + continue; + if (m == 2 || m == 3) + ylogx = ylogxr; + if (ylogx[m % 2] > (0x1.fffffffd1d571p+6 * POWF_SCALE)) + { + res[m] = asuint (__math_oflowf (sign_bias[m])); + res_mask[m] = 0xffffffff; + } + else if (ylogx[m % 2] <= (-150.0 * POWF_SCALE)) + { + res[m] = asuint (__math_uflowf (sign_bias[m])); + res_mask[m] = 0xffffffff; + } + } + } + vector unsigned exp2 = vasuint (exp2_inline (ylogxl, ylogxr, sign_bias)); + return vasfloat (vec_sel (exp2, res, res_mask)); +} diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist index 46558dbd77..eedc6c84de 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist @@ -7,4 +7,5 @@ GLIBC_2.30 _ZGVbN4v_cosf F GLIBC_2.30 _ZGVbN4v_expf F GLIBC_2.30 _ZGVbN4v_logf F GLIBC_2.30 _ZGVbN4v_sinf F +GLIBC_2.30 _ZGVbN4vv_powf F GLIBC_2.30 _ZGVbN4vvv_sincosf F