From patchwork Tue Jun 28 19:27:53 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tulio Magno Quites Machado Filho X-Patchwork-Id: 13451 Received: (qmail 14259 invoked by alias); 28 Jun 2016 19:29:22 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 14237 invoked by uid 89); 28 Jun 2016 19:29:21 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.1 required=5.0 tests=AWL, BAYES_00, KAM_LAZY_DOMAIN_SECURITY, RCVD_IN_DNSWL_LOW autolearn=no version=3.3.2 spammy=iz, VSX, pS2, ps2 X-HELO: mx0a-001b2d01.pphosted.com X-IBM-Helo: d24dlp01.br.ibm.com X-IBM-MailFrom: tuliom@linux.vnet.ibm.com X-IBM-RcptTo: libc-alpha@sourceware.org From: "Tulio Magno Quites Machado Filho" To: libc-alpha@sourceware.org Cc: munroesj@linux.vnet.ibm.com, joseph@codesourcery.com Subject: [PATCH] Add a new macro to mask a float Date: Tue, 28 Jun 2016 16:27:53 -0300 X-TM-AS-MML: disable X-Content-Scanned: Fidelis XPS MAILER x-cbid: 16062819-0028-0000-0000-0000011CDAB8 X-IBM-AV-DETECTION: SAVI=unused REMOTE=unused XFE=unused x-cbparentid: 16062819-0029-0000-0000-000013C140AF Message-Id: <1467142073-13886-1-git-send-email-tuliom@linux.vnet.ibm.com> X-Proofpoint-Virus-Version: vendor=fsecure engine=2.50.10432:, , definitions=2016-06-28_12:, , signatures=0 X-Proofpoint-Spam-Details: rule=outbound_notspam policy=outbound score=0 spamscore=0 suspectscore=4 malwarescore=0 phishscore=0 adultscore=0 bulkscore=0 classifier=spam adjust=0 reason=mlx scancount=1 engine=8.0.1-1604210000 definitions=main-1606280176 Defining a new macro allows architectures to provide more efficient implementations than using a GET_FLOAT_WORD/SET_FLOAT_WORD pair. 
As an example, POWER8 is able to mask the float directly in the VSX without copying the data to a GPR and copying it back. This patch introduces the new macro MASK_FLOAT. The generic implementation remains unchanged. Tested on x86_64, ppc, ppc64, ppc64le and s390x. 2016-06-28 Tulio Magno Quites Machado Filho * sysdeps/generic/math_private.h (MASK_FLOAT): New macro. * sysdeps/ieee754/flt-32/e_acosf.c (__ieee754_acosf): Replace SET_FLOAT_WORD and GET_FLOAT_WORD sequence by MASK_FLOAT. * sysdeps/ieee754/flt-32/e_asinf.c (__ieee754_asinf): Likewise. * sysdeps/ieee754/flt-32/e_powf.c (__ieee754_powf): Likewise. * sysdeps/ieee754/flt-32/k_tanf.c (__kernel_tanf): Likewise. * sysdeps/ieee754/flt-32/s_modff.c (__modff): Likewise. * sysdeps/powerpc/powerpc64/power8/fpu/math_private.h: New file. --- sysdeps/generic/math_private.h | 14 +++++++ sysdeps/ieee754/flt-32/e_acosf.c | 4 +- sysdeps/ieee754/flt-32/e_asinf.c | 4 +- sysdeps/ieee754/flt-32/e_powf.c | 18 +++------ sysdeps/ieee754/flt-32/k_tanf.c | 7 +--- sysdeps/ieee754/flt-32/s_modff.c | 4 +- .../powerpc/powerpc64/power8/fpu/math_private.h | 47 ++++++++++++++++++++++ 7 files changed, 72 insertions(+), 26 deletions(-) create mode 100644 sysdeps/powerpc/powerpc64/power8/fpu/math_private.h diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h index cf1865d..21e272c 100644 --- a/sysdeps/generic/math_private.h +++ b/sysdeps/generic/math_private.h @@ -181,6 +181,20 @@ do { \ } while (0) #endif +/* Apply an integer mask on a float. + + The default implementation invokes the GET_FLOAT_WORD/SET_FLOAT_WORD + macro pair. Note that this macro can only be used to apply an AND + mask supplied directly as a parameter. */ +#ifndef MASK_FLOAT +# define MASK_FLOAT(f,mask) \ +do { \ + u_int32_t __tmp; \ + GET_FLOAT_WORD(__tmp, f); \ + SET_FLOAT_WORD(f, __tmp&mask); \ +} while (0) +#endif + /* Get long double macros from a separate header. 
*/ #include <math_ldbl.h> diff --git a/sysdeps/ieee754/flt-32/e_acosf.c b/sysdeps/ieee754/flt-32/e_acosf.c index 6f792f6..8b29e53 100644 --- a/sysdeps/ieee754/flt-32/e_acosf.c +++ b/sysdeps/ieee754/flt-32/e_acosf.c @@ -61,12 +61,10 @@ __ieee754_acosf(float x) w = r*s-pio2_lo; return pi - (float)2.0*(s+w); } else { /* x > 0.5 */ - int32_t idf; z = (one-x)*(float)0.5; s = __ieee754_sqrtf(z); df = s; - GET_FLOAT_WORD(idf,df); - SET_FLOAT_WORD(df,idf&0xfffff000); + MASK_FLOAT(df,0xfffff000); c = (z-df*df)/(s+df); p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5))))); q = one+z*(qS1+z*(qS2+z*(qS3+z*qS4))); diff --git a/sysdeps/ieee754/flt-32/e_asinf.c b/sysdeps/ieee754/flt-32/e_asinf.c index 2ca2dbc..95e0a79 100644 --- a/sysdeps/ieee754/flt-32/e_asinf.c +++ b/sysdeps/ieee754/flt-32/e_asinf.c @@ -89,10 +89,8 @@ float __ieee754_asinf(float x) if(ix>=0x3F79999A) { /* if |x| > 0.975 */ t = pio2_hi-(2.0f*(s+s*p)-pio2_lo); } else { - int32_t iw; w = s; - GET_FLOAT_WORD(iw,w); - SET_FLOAT_WORD(w,iw&0xfffff000); + MASK_FLOAT(w,0xfffff000); c = (t-w*w)/(s+w); r = p; p = 2.0f*s*r-(pio2_lo-2.0f*c); diff --git a/sysdeps/ieee754/flt-32/e_powf.c b/sysdeps/ieee754/flt-32/e_powf.c index c72fe37..d62e877 100644 --- a/sysdeps/ieee754/flt-32/e_powf.c +++ b/sysdeps/ieee754/flt-32/e_powf.c @@ -136,8 +136,7 @@ __ieee754_powf(float x, float y) u = ivln2_h*t; /* ivln2_h has 16 sig. 
bits */ v = t*ivln2_l-w*ivln2; t1 = u+v; - GET_FLOAT_WORD(is,t1); - SET_FLOAT_WORD(t1,is&0xfffff000); + MASK_FLOAT(t1,0xfffff000); t2 = v-(t1-u); } else { float s2,s_h,s_l,t_h,t_l; @@ -163,8 +162,7 @@ __ieee754_powf(float x, float y) v = one/(ax+bp[k]); s = u*v; s_h = s; - GET_FLOAT_WORD(is,s_h); - SET_FLOAT_WORD(s_h,is&0xfffff000); + MASK_FLOAT(s_h,0xfffff000); /* t_h=ax+bp[k] High */ SET_FLOAT_WORD (t_h, ((((ix>>1)|0x20000000)+0x00400000+(k<<21)) @@ -177,24 +175,21 @@ __ieee754_powf(float x, float y) r += s_l*(s_h+s); s2 = s_h*s_h; t_h = (float)3.0+s2+r; - GET_FLOAT_WORD(is,t_h); - SET_FLOAT_WORD(t_h,is&0xfffff000); + MASK_FLOAT(t_h,0xfffff000); t_l = r-((t_h-(float)3.0)-s2); /* u+v = s*(1+...) */ u = s_h*t_h; v = s_l*t_h+t_l*s; /* 2/(3log2)*(s+...) */ p_h = u+v; - GET_FLOAT_WORD(is,p_h); - SET_FLOAT_WORD(p_h,is&0xfffff000); + MASK_FLOAT(p_h,0xfffff000); p_l = v-(p_h-u); z_h = cp_h*p_h; /* cp_h+cp_l = 2/(3*log2) */ z_l = cp_l*p_h+p_l*cp+dp_l[k]; /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */ t = (float)n; t1 = (((z_h+z_l)+dp_h[k])+t); - GET_FLOAT_WORD(is,t1); - SET_FLOAT_WORD(t1,is&0xfffff000); + MASK_FLOAT(t1,0xfffff000); t2 = z_l-(((t1-t)-dp_h[k])-z_h); } @@ -234,8 +229,7 @@ __ieee754_powf(float x, float y) p_h -= t; } t = p_l+p_h; - GET_FLOAT_WORD(is,t); - SET_FLOAT_WORD(t,is&0xfffff000); + MASK_FLOAT(t,0xfffff000); u = t*lg2_h; v = (p_l-(t-p_h))*lg2+t*lg2_l; z = u+v; diff --git a/sysdeps/ieee754/flt-32/k_tanf.c b/sysdeps/ieee754/flt-32/k_tanf.c index 9f0e558..d805816 100644 --- a/sysdeps/ieee754/flt-32/k_tanf.c +++ b/sysdeps/ieee754/flt-32/k_tanf.c @@ -87,14 +87,11 @@ float __kernel_tanf(float x, float y, int iy) simply return -1.0/(x+r) here */ /* compute -1.0/(x+r) accurately */ float a,t; - int32_t i; z = w; - GET_FLOAT_WORD(i,z); - SET_FLOAT_WORD(z,i&0xfffff000); + MASK_FLOAT(z, 0xfffff000); v = r-(z - x); /* z+v = r+x */ t = a = -(float)1.0/w; /* a = -1.0/w */ - GET_FLOAT_WORD(i,t); - SET_FLOAT_WORD(t,i&0xfffff000); + MASK_FLOAT(t, 
0xfffff000); s = (float)1.0+t*z; return t+a*(s+t*v); } diff --git a/sysdeps/ieee754/flt-32/s_modff.c b/sysdeps/ieee754/flt-32/s_modff.c index 23f6a90..491f50f 100644 --- a/sysdeps/ieee754/flt-32/s_modff.c +++ b/sysdeps/ieee754/flt-32/s_modff.c @@ -32,10 +32,8 @@ __modff(float x, float *iptr) } else { i = (0x007fffff)>>j0; if((i0&i)==0) { /* x is integral */ - u_int32_t ix; *iptr = x; - GET_FLOAT_WORD(ix,x); - SET_FLOAT_WORD(x,ix&0x80000000); /* return +-0 */ + MASK_FLOAT(x,0x80000000); /* return +-0 */ return x; } else { SET_FLOAT_WORD(*iptr,i0&(~i)); diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h b/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h new file mode 100644 index 0000000..700e410 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h @@ -0,0 +1,47 @@ +/* Private inline math functions for POWER8. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Faster to do an in-place masking of the float number in the VSR + than move to GPR for the masking and back. maskl, maskr, and maski + are used to convert the 32-bit "mask" parameter to a 64-bit mask + suitable for the internal representation of a scalar + single-precision floating point number in the Power8 processor. 
+ Note: before applying the mask, xvmovdp is used to ensure f is + normalized. */ +#define MASK_FLOAT(f, mask) \ + do { \ + long tmpmask = mask; \ + float tmpf = f; \ + long maskl = 0xc000000000000000; \ + long maskr = 0x3fffffffffffffff; \ + long maski = 0x3800000000000000; \ + tmpmask = tmpmask << 32; \ + tmpmask = ((tmpmask&maskl) | ((tmpmask&maskr)>>3) | maski); \ + union { \ + long l; \ + double d; \ + } umask = {.l = tmpmask}; \ + __asm__ ("xvmovdp %x2, %x2\n\t" \ + "xxland %x0, %x2, %1\n\t" \ + : "=wa" (tmpf) \ + : "d" (umask.d), \ + "wa" (tmpf) ); \ + f = tmpf; \ + } while(0) + +#include_next <math_private.h>