powerpc: Add a POWER8-optimized version of powf()

Message ID	8c7e16e8-d605-9552-89cc-c658dba81a6f@us.ibm.com
State	Superseded
Headers	Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk Sender: libc-alpha-owner@sourceware.org Reply-To: pc@us.ibm.com To: libc-alpha@sourceware.org From: Paul Clarke <pc@us.ibm.com> Subject: [PATCH] powerpc: Add a POWER8-optimized version of powf() Date: Thu, 25 May 2017 12:47:30 -0500 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Thunderbird/45.8.0 MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 7bit Message-Id: <8c7e16e8-d605-9552-89cc-c658dba81a6f@us.ibm.com>

Message ID

8c7e16e8-d605-9552-89cc-c658dba81a6f@us.ibm.com

State

Superseded

Headers

Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
Sender: libc-alpha-owner@sourceware.org
Reply-To: pc@us.ibm.com
To: libc-alpha@sourceware.org
From: Paul Clarke <pc@us.ibm.com>
Subject: [PATCH] powerpc: Add a POWER8-optimized version of powf()
Date: Thu, 25 May 2017 12:47:30 -0500
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101
	Thunderbird/45.8.0
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 7bit
Message-Id: <8c7e16e8-d605-9552-89cc-c658dba81a6f@us.ibm.com>

Commit Message

Paul A. Clarke May 25, 2017, 5:47 p.m. UTC

  This implementation is heavily based on sysdeps/ieee754/flt-32/e_powf.c.
Most significant changes are code simplification and use of doubles for
intermediate values.  Also, some rearrangement to move early
non-dependent code later, out of the faster paths.

2017-05-25  Paul A. Clarke  <pc@us.ibm.com>

	* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
	[$(subdir) = math] (libm-sysdep_routines): Add e_powf-power8 and
	e_powf-ppc64.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c: New file.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c: Likewise.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c: Likewise.
	* sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c: Likewise.
---
 sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile   |   3 +-
 .../multiarch/{s_cosf-ppc64.c => e_powf-power8.c}  |  12 +-
 .../powerpc64/fpu/multiarch/e_powf-ppc64.c}        |   9 +-
 .../powerpc64/fpu/multiarch/{s_cosf.c => e_powf.c} |  16 +--
 .../powerpc64/power8/fpu}/e_powf.c                 | 140 +++++++--------------
 5 files changed, 69 insertions(+), 111 deletions(-)
 copy sysdeps/powerpc/powerpc64/fpu/multiarch/{s_cosf-ppc64.c => e_powf-power8.c} (80%)
 copy sysdeps/{i386/symbol-hacks.h => powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c} (82%)
 copy sysdeps/powerpc/powerpc64/fpu/multiarch/{s_cosf.c => e_powf.c} (71%)
 copy sysdeps/{ieee754/flt-32 => powerpc/powerpc64/power8/fpu}/e_powf.c (69%)

Comments

Adhemerval Zanella May 26, 2017, 7:55 p.m. UTC | #1

On 25/05/2017 14:47, Paul Clarke wrote:
> This implementation is heavily based on sysdeps/ieee754/flt-32/e_powf.c.
> Most significant changes are code simplification and use of doubles for
> intermediate values.  Also, some rearrangement to move early
> non-dependent code later, out of the faster paths.
> 
> 2017-05-25  Paul A. Clarke  <pc@us.ibm.com>
> 
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> 	[$(subdir) = math] (libm-sysdep_routines): Add e_powf-power8 and
> 	e_powf-ppc64.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c: New file.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c: Likewise.
> 	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c: Likewise.
> 	* sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c: Likewise.

This changes seems to be arch independent and I would like to avoid adding
even more arch specific.  Is there any reason why this can't be used as
the default implementation?  Do you have number on different architecture
for it? 

> ---
>  sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile   |   3 +-
>  .../multiarch/{s_cosf-ppc64.c => e_powf-power8.c}  |  12 +-
>  .../powerpc64/fpu/multiarch/e_powf-ppc64.c}        |   9 +-
>  .../powerpc64/fpu/multiarch/{s_cosf.c => e_powf.c} |  16 +--
>  .../powerpc64/power8/fpu}/e_powf.c                 | 140 +++++++--------------
>  5 files changed, 69 insertions(+), 111 deletions(-)
>  copy sysdeps/powerpc/powerpc64/fpu/multiarch/{s_cosf-ppc64.c => e_powf-power8.c} (80%)
>  copy sysdeps/{i386/symbol-hacks.h => powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c} (82%)
>  copy sysdeps/powerpc/powerpc64/fpu/multiarch/{s_cosf.c => e_powf.c} (71%)
>  copy sysdeps/{ieee754/flt-32 => powerpc/powerpc64/power8/fpu}/e_powf.c (69%)
> 
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> index 317a988..dc7422e 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> @@ -27,7 +27,8 @@ libm-sysdep_routines += s_isnan-power7 s_isnan-power6x s_isnan-power6 \
>  			s_llrint-power8 s_llround-power8 \
>  			e_expf-power8 e_expf-ppc64 \
>  			s_sinf-ppc64 s_sinf-power8 \
> -			s_cosf-ppc64 s_cosf-power8
> +			s_cosf-ppc64 s_cosf-power8 \
> +			e_powf-ppc64 e_powf-power8
>  
>  CFLAGS-s_logbf-power7.c = -mcpu=power7
>  CFLAGS-s_logbl-power7.c = -mcpu=power7
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c
> similarity index 80%
> copy from sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c
> copy to sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c
> index 635624c..5dfd969 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c
> @@ -1,4 +1,4 @@
> -/* cosf function.  PowerPC64 default version.
> +/* __ieee754_powf() POWER8 version.
>     Copyright (C) 2017 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>  
> @@ -16,11 +16,9 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> -#include <sysdep.h>
> +#undef strong_alias
> +#define strong_alias(a, b)
>  
> -#undef weak_alias
> -#define weak_alias(a, b)
> +#define __ieee754_powf __ieee754_powf_power8
>  
> -#define __cosf __cosf_ppc64
> -
> -#include <sysdeps/powerpc/fpu/s_cosf.c>
> +#include <sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c>
> diff --git a/sysdeps/i386/symbol-hacks.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c
> similarity index 82%
> copy from sysdeps/i386/symbol-hacks.h
> copy to sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c
> index 36a13c8..43b466d 100644
> --- a/sysdeps/i386/symbol-hacks.h
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c
> @@ -1,4 +1,4 @@
> -/* Hacks needed for symbol manipulation.  i386 version.
> +/* __ieee_powf() PowerPC64 version.
>     Copyright (C) 2017 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>  
> @@ -16,6 +16,9 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> -#include <sysdeps/wordsize-32/divdi3-symbol-hacks.h>
> +#undef strong_alias
> +#define strong_alias(a, b)
>  
> -#include_next "symbol-hacks.h"
> +#define __ieee754_powf __ieee754_powf_ppc64
> +
> +#include <sysdeps/ieee754/flt-32/e_powf.c>
> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c
> similarity index 71%
> copy from sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c
> copy to sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c
> index acf2a59..325d166 100644
> --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c
> +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c
> @@ -1,4 +1,4 @@
> -/* Multiple versions of cosf.
> +/* Multiple versions of ieee754_powf.
>     Copyright (C) 2017 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>  
> @@ -17,15 +17,15 @@
>     <http://www.gnu.org/licenses/>.  */
>  
>  #include <math.h>
> -#include <shlib-compat.h>
> +#include <math_private.h>
>  #include "init-arch.h"
>  
> -extern __typeof (__cosf) __cosf_ppc64 attribute_hidden;
> -extern __typeof (__cosf) __cosf_power8 attribute_hidden;
> +extern __typeof (__ieee754_powf) __ieee754_powf_ppc64 attribute_hidden;
> +extern __typeof (__ieee754_powf) __ieee754_powf_power8 attribute_hidden;
>  
> -libc_ifunc (__cosf,
> +libc_ifunc (__ieee754_powf,
>  	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> -	    ? __cosf_power8
> -	    : __cosf_ppc64);
> +	    ? __ieee754_powf_power8
> +	    : __ieee754_powf_ppc64);
>  
> -weak_alias (__cosf, cosf)
> +strong_alias (__ieee754_powf, __powf_finite)
> diff --git a/sysdeps/ieee754/flt-32/e_powf.c b/sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c
> similarity index 69%
> copy from sysdeps/ieee754/flt-32/e_powf.c
> copy to sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c
> index 13b49de..628232a 100644
> --- a/sysdeps/ieee754/flt-32/e_powf.c
> +++ b/sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c
> @@ -4,7 +4,7 @@
>  
>  /*
>   * ====================================================
> - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
> + * Copyright (C) 1993-2017 by Sun Microsystems, Inc. All rights reserved.
>   *
>   * Developed at SunPro, a Sun Microsystems, Inc. business.
>   * Permission to use, copy, modify, and distribute this
> @@ -38,11 +38,9 @@ P2   = -2.7777778450e-03, /* 0xbb360b61 */
>  P3   =  6.6137559770e-05, /* 0x388ab355 */
>  P4   = -1.6533901999e-06, /* 0xb5ddea0e */
>  P5   =  4.1381369442e-08, /* 0x3331bb4c */
> -lg2  =  6.9314718246e-01, /* 0x3f317218 */
>  lg2_h  =  6.93145752e-01, /* 0x3f317200 */
>  lg2_l  =  1.42860654e-06, /* 0x35bfbe8c */
>  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
> -cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
>  cp_h  =  0xf.64p-4, /* cp high 12 bits.  */
>  cp_l  =  -0x7.b11e3p-16, /* 2/(3ln2) - cp_h.  */
>  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
> @@ -52,14 +50,13 @@ ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
>  float
>  __ieee754_powf(float x, float y)
>  {
> -	float z,ax,z_h,z_l,p_h,p_l;
> -	float y1,t1,t2,r,s,t,u,v,w;
> +	float z, ax, s;
> +	double d1, d2;
>  	int32_t i,j,k,yisint,n;
> -	int32_t hx,hy,ix,iy,is;
> +	int32_t hx,hy,ix,iy;
>  
> -	GET_FLOAT_WORD(hx,x);
>  	GET_FLOAT_WORD(hy,y);
> -	ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
> +	iy = hy&0x7fffffff;
>  
>      /* y==zero: x**0 = 1 */
>  	if(iy==0 && !issignaling (x)) return one;
> @@ -68,26 +65,14 @@ __ieee754_powf(float x, float y)
>  	if(x == 1.0 && !issignaling (y)) return one;
>  	if(x == -1.0 && isinf(y)) return one;
>  
> +	GET_FLOAT_WORD(hx,x);
> +	ix = hx&0x7fffffff;
> +
>      /* +-NaN return x+y */
>  	if(__builtin_expect(ix > 0x7f800000 ||
>  			    iy > 0x7f800000, 0))
>  		return x+y;
>  
> -    /* determine if y is an odd int when x < 0
> -     * yisint = 0	... y is not an integer
> -     * yisint = 1	... y is an odd int
> -     * yisint = 2	... y is an even int
> -     */
> -	yisint  = 0;
> -	if(hx<0) {
> -	    if(iy>=0x4b800000) yisint = 2; /* even integer y */
> -	    else if(iy>=0x3f800000) {
> -		k = (iy>>23)-0x7f;	   /* exponent */
> -		j = iy>>(23-k);
> -		if((j<<(23-k))==iy) yisint = 2-(j&1);
> -	    }
> -	}
> -
>      /* special value of y */
>  	if (__builtin_expect(iy==0x7f800000, 0)) {	/* y is +-inf */
>  	    if (ix==0x3f800000)
> @@ -106,6 +91,21 @@ __ieee754_powf(float x, float y)
>  	    return __ieee754_sqrtf(x);
>  	}
>  
> +    /* determine if y is an odd int when x < 0
> +     * yisint = 0	... y is not an integer
> +     * yisint = 1	... y is an odd int
> +     * yisint = 2	... y is an even int
> +     */
> +	yisint  = 0;
> +	if(hx<0) {
> +	    if(iy>=0x4b800000) yisint = 2; /* even integer y */
> +	    else if(iy>=0x3f800000) {
> +		k = (iy>>23)-0x7f;	   /* exponent */
> +		j = iy>>(23-k);
> +		if((j<<(23-k))==iy) yisint = 2-(j&1);
> +	    }
> +	}
> +
>  	ax   = fabsf(x);
>      /* special value of x */
>  	if(__builtin_expect(ix==0x7f800000||ix==0||ix==0x3f800000, 0)){
> @@ -131,16 +131,10 @@ __ieee754_powf(float x, float y)
>  	    if(ix>0x3f800007) return (hy>0)? huge*huge:tiny*tiny;
>  	/* now |1-x| is tiny <= 2**-20, suffice to compute
>  	   log(x) by x-x^2/2+x^3/3-x^4/4 */
> -	    t = ax-1;		/* t has 20 trailing zeros */
> -	    w = (t*t)*((float)0.5-t*((float)0.333333333333-t*(float)0.25));
> -	    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
> -	    v = t*ivln2_l-w*ivln2;
> -	    t1 = u+v;
> -	    GET_FLOAT_WORD(is,t1);
> -	    SET_FLOAT_WORD(t1,is&0xfffff000);
> -	    t2 = v-(t1-u);
> +	    d2 = ax-1;		/* d2 has 20 trailing zeros.  */
> +	    d2 = d2 * (ivln2_l + (double)ivln2_h) -
> +		 (d2 * d2) * (0.5 - d2 * (0.333333333333 - d2 * 0.25)) * ivln2;
>  	} else {
> -	    float s2,s_h,s_l,t_h,t_l;
>  	    /* Avoid internal underflow for tiny y.  The exact value
>  	       of y does not matter if |y| <= 2**-32.  */
>  	    if (iy < 0x2f800000)
> @@ -158,69 +152,37 @@ __ieee754_powf(float x, float y)
>  	    else {k=0;n+=1;ix -= 0x00800000;}
>  	    SET_FLOAT_WORD(ax,ix);
>  
> -	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
> -	    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
> -	    v = one/(ax+bp[k]);
> -	    s = u*v;
> -	    s_h = s;
> -	    GET_FLOAT_WORD(is,s_h);
> -	    SET_FLOAT_WORD(s_h,is&0xfffff000);
> -	/* t_h=ax+bp[k] High */
> -	    SET_FLOAT_WORD (t_h,
> -			    ((((ix>>1)|0x20000000)+0x00400000+(k<<21))
> -			     & 0xfffff000));
> -	    t_l = ax - (t_h-bp[k]);
> -	    s_l = v*((u-s_h*t_h)-s_h*t_l);
> -	/* compute log(ax) */
> -	    s2 = s*s;
> -	    r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
> -	    r += s_l*(s_h+s);
> -	    s2  = s_h*s_h;
> -	    t_h = (float)3.0+s2+r;
> -	    GET_FLOAT_WORD(is,t_h);
> -	    SET_FLOAT_WORD(t_h,is&0xfffff000);
> -	    t_l = r-((t_h-(float)3.0)-s2);
> -	/* u+v = s*(1+...) */
> -	    u = s_h*t_h;
> -	    v = s_l*t_h+t_l*s;
> -	/* 2/(3log2)*(s+...) */
> -	    p_h = u+v;
> -	    GET_FLOAT_WORD(is,p_h);
> -	    SET_FLOAT_WORD(p_h,is&0xfffff000);
> -	    p_l = v-(p_h-u);
> -	    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
> -	    z_l = cp_l*p_h+p_l*cp+dp_l[k];
> -	/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
> -	    t = (float)n;
> -	    t1 = (((z_h+z_l)+dp_h[k])+t);
> -	    GET_FLOAT_WORD(is,t1);
> -	    SET_FLOAT_WORD(t1,is&0xfffff000);
> -	    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
> +	/* compute d1 = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
> +	    d1 = (ax-(double)bp[k])/(ax+(double)bp[k]);
> +	/* compute d2 = log(ax) */
> +	    d2 = d1 * d1;
> +	    d2 = 3.0 + d2 + d2*d2*(L1+d2*(L2+d2*(L3+d2*(L4+d2*(L5+d2*L6)))));
> +	/* 2/(3log2)*(d2+...) */
> +	    d2 = d1*d2*((double)cp_h + cp_l);
> +	/* log2(ax) = (d2+..)*2/(3*log2) */
> +	    d2 = d2+((double)dp_h[k]+dp_l[k])+(double)n;
>  	}
>  
>  	s = one; /* s (sign of result -ve**odd) = -1 else = 1 */
>  	if(((((u_int32_t)hx>>31)-1)|(yisint-1))==0)
>  	    s = -one;	/* (-ve)**(odd int) */
>  
> -    /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
> -	GET_FLOAT_WORD(is,y);
> -	SET_FLOAT_WORD(y1,is&0xfffff000);
> -	p_l = (y-y1)*t1+y*t2;
> -	p_h = y1*t1;
> -	z = p_l+p_h;
> +    /* compute y * d2 */
> +	d1 = y * d2;
> +	z = d1;
>  	GET_FLOAT_WORD(j,z);
>  	if (__builtin_expect(j>0x43000000, 0))		/* if z > 128 */
>  	    return s*huge*huge;				/* overflow */
>  	else if (__builtin_expect(j==0x43000000, 0)) {	/* if z == 128 */
> -	    if(p_l+ovt>z-p_h) return s*huge*huge;	/* overflow */
> +	    if(ovt>(z-d1)) return s*huge*huge;	/* overflow */
>  	}
>  	else if (__builtin_expect((j&0x7fffffff)>0x43160000, 0))/* z <= -150 */
>  	    return s*tiny*tiny;				/* underflow */
>  	else if (__builtin_expect((u_int32_t) j==0xc3160000, 0)){/* z == -150*/
> -	    if(p_l<=z-p_h) return s*tiny*tiny;		/* underflow */
> +	    if(0.0<=(z-d1)) return s*tiny*tiny;		/* underflow */
>  	}
>      /*
> -     * compute 2**(p_h+p_l)
> +     * compute 2**d1
>       */
>  	i = j&0x7fffffff;
>  	k = (i>>23)-0x7f;
> @@ -228,22 +190,16 @@ __ieee754_powf(float x, float y)
>  	if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
>  	    n = j+(0x00800000>>(k+1));
>  	    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
> -	    SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
> +	    SET_FLOAT_WORD(z,n&~(0x007fffff>>k));
>  	    n = ((n&0x007fffff)|0x00800000)>>(23-k);
>  	    if(j<0) n = -n;
> -	    p_h -= t;
> +	    d1 -= z;
>  	}
> -	t = p_l+p_h;
> -	GET_FLOAT_WORD(is,t);
> -	SET_FLOAT_WORD(t,is&0xfffff000);
> -	u = t*lg2_h;
> -	v = (p_l-(t-p_h))*lg2+t*lg2_l;
> -	z = u+v;
> -	w = v-(z-u);
> -	t  = z*z;
> -	t1  = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
> -	r  = (z*t1)/(t1-two)-(w+z*w);
> -	z  = one-(r-z);
> +	d1 = d1 * ((double)lg2_h + lg2_l);
> +	d2 = d1*d1;
> +	d2 = d1 - d2*(P1+d2*(P2+d2*(P3+d2*(P4+d2*P5))));
> +	d2 = (d1*d2)/(d2-two);
> +	z = one - (d2-d1);
>  	GET_FLOAT_WORD(j,z);
>  	j += (n<<23);
>  	if((j>>23)<=0)	/* subnormal output */
>

Steven Munroe May 27, 2017, 12:37 a.m. UTC | #2

On Fri, 2017-05-26 at 16:55 -0300, Adhemerval Zanella wrote:
> 
> On 25/05/2017 14:47, Paul Clarke wrote:
> > This implementation is heavily based on sysdeps/ieee754/flt-32/e_powf.c.
> > Most significant changes are code simplification and use of doubles for
> > intermediate values.  Also, some rearrangement to move early
> > non-dependent code later, out of the faster paths.
> > 
> > 2017-05-25  Paul A. Clarke  <pc@us.ibm.com>
> > 
> > 	* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
> > 	[$(subdir) = math] (libm-sysdep_routines): Add e_powf-power8 and
> > 	e_powf-ppc64.
> > 	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c: New file.
> > 	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c: Likewise.
> > 	* sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c: Likewise.
> > 	* sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c: Likewise.
> 
> This changes seems to be arch independent and I would like to avoid adding
> even more arch specific.  Is there any reason why this can't be used as
> the default implementation?  Do you have number on different architecture
> for it? 
> 
If other platform maintainer what to try this implementation and report
that would be OK. 

But I don't this it is correct or fair to ask Paul to prove a negative.
These quests tend to be very labor intensive and usually don't work out
(as really common) in the end.

diff mbox

Patch

diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
index 317a988..dc7422e 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
@@ -27,7 +27,8 @@  libm-sysdep_routines += s_isnan-power7 s_isnan-power6x s_isnan-power6 \
 			s_llrint-power8 s_llround-power8 \
 			e_expf-power8 e_expf-ppc64 \
 			s_sinf-ppc64 s_sinf-power8 \
-			s_cosf-ppc64 s_cosf-power8
+			s_cosf-ppc64 s_cosf-power8 \
+			e_powf-ppc64 e_powf-power8
 
 CFLAGS-s_logbf-power7.c = -mcpu=power7
 CFLAGS-s_logbl-power7.c = -mcpu=power7
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c
similarity index 80%
copy from sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c
copy to sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c
index 635624c..5dfd969 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-ppc64.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-power8.c
@@ -1,4 +1,4 @@ 
-/* cosf function.  PowerPC64 default version.
+/* __ieee754_powf() POWER8 version.
    Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,11 +16,9 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
+#undef strong_alias
+#define strong_alias(a, b)
 
-#undef weak_alias
-#define weak_alias(a, b)
+#define __ieee754_powf __ieee754_powf_power8
 
-#define __cosf __cosf_ppc64
-
-#include <sysdeps/powerpc/fpu/s_cosf.c>
+#include <sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c>
diff --git a/sysdeps/i386/symbol-hacks.h b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c
similarity index 82%
copy from sysdeps/i386/symbol-hacks.h
copy to sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c
index 36a13c8..43b466d 100644
--- a/sysdeps/i386/symbol-hacks.h
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf-ppc64.c
@@ -1,4 +1,4 @@ 
-/* Hacks needed for symbol manipulation.  i386 version.
+/* __ieee_powf() PowerPC64 version.
    Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,6 +16,9 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdeps/wordsize-32/divdi3-symbol-hacks.h>
+#undef strong_alias
+#define strong_alias(a, b)
 
-#include_next "symbol-hacks.h"
+#define __ieee754_powf __ieee754_powf_ppc64
+
+#include <sysdeps/ieee754/flt-32/e_powf.c>
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c
similarity index 71%
copy from sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c
copy to sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c
index acf2a59..325d166 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf.c
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_powf.c
@@ -1,4 +1,4 @@ 
-/* Multiple versions of cosf.
+/* Multiple versions of ieee754_powf.
    Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,15 +17,15 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <math.h>
-#include <shlib-compat.h>
+#include <math_private.h>
 #include "init-arch.h"
 
-extern __typeof (__cosf) __cosf_ppc64 attribute_hidden;
-extern __typeof (__cosf) __cosf_power8 attribute_hidden;
+extern __typeof (__ieee754_powf) __ieee754_powf_ppc64 attribute_hidden;
+extern __typeof (__ieee754_powf) __ieee754_powf_power8 attribute_hidden;
 
-libc_ifunc (__cosf,
+libc_ifunc (__ieee754_powf,
 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-	    ? __cosf_power8
-	    : __cosf_ppc64);
+	    ? __ieee754_powf_power8
+	    : __ieee754_powf_ppc64);
 
-weak_alias (__cosf, cosf)
+strong_alias (__ieee754_powf, __powf_finite)
diff --git a/sysdeps/ieee754/flt-32/e_powf.c b/sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c
similarity index 69%
copy from sysdeps/ieee754/flt-32/e_powf.c
copy to sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c
index 13b49de..628232a 100644
--- a/sysdeps/ieee754/flt-32/e_powf.c
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/e_powf.c
@@ -4,7 +4,7 @@ 
 
 /*
  * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ * Copyright (C) 1993-2017 by Sun Microsystems, Inc. All rights reserved.
  *
  * Developed at SunPro, a Sun Microsystems, Inc. business.
  * Permission to use, copy, modify, and distribute this
@@ -38,11 +38,9 @@  P2   = -2.7777778450e-03, /* 0xbb360b61 */
 P3   =  6.6137559770e-05, /* 0x388ab355 */
 P4   = -1.6533901999e-06, /* 0xb5ddea0e */
 P5   =  4.1381369442e-08, /* 0x3331bb4c */
-lg2  =  6.9314718246e-01, /* 0x3f317218 */
 lg2_h  =  6.93145752e-01, /* 0x3f317200 */
 lg2_l  =  1.42860654e-06, /* 0x35bfbe8c */
 ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
-cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
 cp_h  =  0xf.64p-4, /* cp high 12 bits.  */
 cp_l  =  -0x7.b11e3p-16, /* 2/(3ln2) - cp_h.  */
 ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
@@ -52,14 +50,13 @@  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
 float
 __ieee754_powf(float x, float y)
 {
-	float z,ax,z_h,z_l,p_h,p_l;
-	float y1,t1,t2,r,s,t,u,v,w;
+	float z, ax, s;
+	double d1, d2;
 	int32_t i,j,k,yisint,n;
-	int32_t hx,hy,ix,iy,is;
+	int32_t hx,hy,ix,iy;
 
-	GET_FLOAT_WORD(hx,x);
 	GET_FLOAT_WORD(hy,y);
-	ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
+	iy = hy&0x7fffffff;
 
     /* y==zero: x**0 = 1 */
 	if(iy==0 && !issignaling (x)) return one;
@@ -68,26 +65,14 @@  __ieee754_powf(float x, float y)
 	if(x == 1.0 && !issignaling (y)) return one;
 	if(x == -1.0 && isinf(y)) return one;
 
+	GET_FLOAT_WORD(hx,x);
+	ix = hx&0x7fffffff;
+
     /* +-NaN return x+y */
 	if(__builtin_expect(ix > 0x7f800000 ||
 			    iy > 0x7f800000, 0))
 		return x+y;
 
-    /* determine if y is an odd int when x < 0
-     * yisint = 0	... y is not an integer
-     * yisint = 1	... y is an odd int
-     * yisint = 2	... y is an even int
-     */
-	yisint  = 0;
-	if(hx<0) {
-	    if(iy>=0x4b800000) yisint = 2; /* even integer y */
-	    else if(iy>=0x3f800000) {
-		k = (iy>>23)-0x7f;	   /* exponent */
-		j = iy>>(23-k);
-		if((j<<(23-k))==iy) yisint = 2-(j&1);
-	    }
-	}
-
     /* special value of y */
 	if (__builtin_expect(iy==0x7f800000, 0)) {	/* y is +-inf */
 	    if (ix==0x3f800000)
@@ -106,6 +91,21 @@  __ieee754_powf(float x, float y)
 	    return __ieee754_sqrtf(x);
 	}
 
+    /* determine if y is an odd int when x < 0
+     * yisint = 0	... y is not an integer
+     * yisint = 1	... y is an odd int
+     * yisint = 2	... y is an even int
+     */
+	yisint  = 0;
+	if(hx<0) {
+	    if(iy>=0x4b800000) yisint = 2; /* even integer y */
+	    else if(iy>=0x3f800000) {
+		k = (iy>>23)-0x7f;	   /* exponent */
+		j = iy>>(23-k);
+		if((j<<(23-k))==iy) yisint = 2-(j&1);
+	    }
+	}
+
 	ax   = fabsf(x);
     /* special value of x */
 	if(__builtin_expect(ix==0x7f800000||ix==0||ix==0x3f800000, 0)){
@@ -131,16 +131,10 @@  __ieee754_powf(float x, float y)
 	    if(ix>0x3f800007) return (hy>0)? huge*huge:tiny*tiny;
 	/* now |1-x| is tiny <= 2**-20, suffice to compute
 	   log(x) by x-x^2/2+x^3/3-x^4/4 */
-	    t = ax-1;		/* t has 20 trailing zeros */
-	    w = (t*t)*((float)0.5-t*((float)0.333333333333-t*(float)0.25));
-	    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
-	    v = t*ivln2_l-w*ivln2;
-	    t1 = u+v;
-	    GET_FLOAT_WORD(is,t1);
-	    SET_FLOAT_WORD(t1,is&0xfffff000);
-	    t2 = v-(t1-u);
+	    d2 = ax-1;		/* d2 has 20 trailing zeros.  */
+	    d2 = d2 * (ivln2_l + (double)ivln2_h) -
+		 (d2 * d2) * (0.5 - d2 * (0.333333333333 - d2 * 0.25)) * ivln2;
 	} else {
-	    float s2,s_h,s_l,t_h,t_l;
 	    /* Avoid internal underflow for tiny y.  The exact value
 	       of y does not matter if |y| <= 2**-32.  */
 	    if (iy < 0x2f800000)
@@ -158,69 +152,37 @@  __ieee754_powf(float x, float y)
 	    else {k=0;n+=1;ix -= 0x00800000;}
 	    SET_FLOAT_WORD(ax,ix);
 
-	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
-	    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
-	    v = one/(ax+bp[k]);
-	    s = u*v;
-	    s_h = s;
-	    GET_FLOAT_WORD(is,s_h);
-	    SET_FLOAT_WORD(s_h,is&0xfffff000);
-	/* t_h=ax+bp[k] High */
-	    SET_FLOAT_WORD (t_h,
-			    ((((ix>>1)|0x20000000)+0x00400000+(k<<21))
-			     & 0xfffff000));
-	    t_l = ax - (t_h-bp[k]);
-	    s_l = v*((u-s_h*t_h)-s_h*t_l);
-	/* compute log(ax) */
-	    s2 = s*s;
-	    r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
-	    r += s_l*(s_h+s);
-	    s2  = s_h*s_h;
-	    t_h = (float)3.0+s2+r;
-	    GET_FLOAT_WORD(is,t_h);
-	    SET_FLOAT_WORD(t_h,is&0xfffff000);
-	    t_l = r-((t_h-(float)3.0)-s2);
-	/* u+v = s*(1+...) */
-	    u = s_h*t_h;
-	    v = s_l*t_h+t_l*s;
-	/* 2/(3log2)*(s+...) */
-	    p_h = u+v;
-	    GET_FLOAT_WORD(is,p_h);
-	    SET_FLOAT_WORD(p_h,is&0xfffff000);
-	    p_l = v-(p_h-u);
-	    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
-	    z_l = cp_l*p_h+p_l*cp+dp_l[k];
-	/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
-	    t = (float)n;
-	    t1 = (((z_h+z_l)+dp_h[k])+t);
-	    GET_FLOAT_WORD(is,t1);
-	    SET_FLOAT_WORD(t1,is&0xfffff000);
-	    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+	/* compute d1 = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+	    d1 = (ax-(double)bp[k])/(ax+(double)bp[k]);
+	/* compute d2 = log(ax) */
+	    d2 = d1 * d1;
+	    d2 = 3.0 + d2 + d2*d2*(L1+d2*(L2+d2*(L3+d2*(L4+d2*(L5+d2*L6)))));
+	/* 2/(3log2)*(d2+...) */
+	    d2 = d1*d2*((double)cp_h + cp_l);
+	/* log2(ax) = (d2+..)*2/(3*log2) */
+	    d2 = d2+((double)dp_h[k]+dp_l[k])+(double)n;
 	}
 
 	s = one; /* s (sign of result -ve**odd) = -1 else = 1 */
 	if(((((u_int32_t)hx>>31)-1)|(yisint-1))==0)
 	    s = -one;	/* (-ve)**(odd int) */
 
-    /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
-	GET_FLOAT_WORD(is,y);
-	SET_FLOAT_WORD(y1,is&0xfffff000);
-	p_l = (y-y1)*t1+y*t2;
-	p_h = y1*t1;
-	z = p_l+p_h;
+    /* compute y * d2 */
+	d1 = y * d2;
+	z = d1;
 	GET_FLOAT_WORD(j,z);
 	if (__builtin_expect(j>0x43000000, 0))		/* if z > 128 */
 	    return s*huge*huge;				/* overflow */
 	else if (__builtin_expect(j==0x43000000, 0)) {	/* if z == 128 */
-	    if(p_l+ovt>z-p_h) return s*huge*huge;	/* overflow */
+	    if(ovt>(z-d1)) return s*huge*huge;	/* overflow */
 	}
 	else if (__builtin_expect((j&0x7fffffff)>0x43160000, 0))/* z <= -150 */
 	    return s*tiny*tiny;				/* underflow */
 	else if (__builtin_expect((u_int32_t) j==0xc3160000, 0)){/* z == -150*/
-	    if(p_l<=z-p_h) return s*tiny*tiny;		/* underflow */
+	    if(0.0<=(z-d1)) return s*tiny*tiny;		/* underflow */
 	}
     /*
-     * compute 2**(p_h+p_l)
+     * compute 2**d1
      */
 	i = j&0x7fffffff;
 	k = (i>>23)-0x7f;
@@ -228,22 +190,16 @@  __ieee754_powf(float x, float y)
 	if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
 	    n = j+(0x00800000>>(k+1));
 	    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
-	    SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+	    SET_FLOAT_WORD(z,n&~(0x007fffff>>k));
 	    n = ((n&0x007fffff)|0x00800000)>>(23-k);
 	    if(j<0) n = -n;
-	    p_h -= t;
+	    d1 -= z;
 	}
-	t = p_l+p_h;
-	GET_FLOAT_WORD(is,t);
-	SET_FLOAT_WORD(t,is&0xfffff000);
-	u = t*lg2_h;
-	v = (p_l-(t-p_h))*lg2+t*lg2_l;
-	z = u+v;
-	w = v-(z-u);
-	t  = z*z;
-	t1  = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
-	r  = (z*t1)/(t1-two)-(w+z*w);
-	z  = one-(r-z);
+	d1 = d1 * ((double)lg2_h + lg2_l);
+	d2 = d1*d1;
+	d2 = d1 - d2*(P1+d2*(P2+d2*(P3+d2*(P4+d2*P5))));
+	d2 = (d1*d2)/(d2-two);
+	z = one - (d2-d1);
 	GET_FLOAT_WORD(j,z);
 	j += (n<<23);
 	if((j>>23)<=0)	/* subnormal output */