From patchwork Tue Jun 28 19:27:53 2016
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Tulio Magno Quites Machado Filho
 <tuliom@linux.vnet.ibm.com>
X-Patchwork-Id: 13451
Received: (qmail 14259 invoked by alias); 28 Jun 2016 19:29:22 -0000
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <libc-alpha.sourceware.org>
List-Unsubscribe: <mailto:libc-alpha-unsubscribe-##L=##H@sourceware.org>
List-Subscribe: <mailto:libc-alpha-subscribe@sourceware.org>
List-Archive: <http://sourceware.org/ml/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-help@sourceware.org>,
	<http://sourceware.org/ml/#faqs>
Sender: libc-alpha-owner@sourceware.org
Delivered-To: mailing list libc-alpha@sourceware.org
Received: (qmail 14237 invoked by uid 89); 28 Jun 2016 19:29:21 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-1.1 required=5.0 tests=AWL, BAYES_00,
	KAM_LAZY_DOMAIN_SECURITY,
	RCVD_IN_DNSWL_LOW autolearn=no version=3.3.2 spammy=iz, VSX,
	pS2, ps2
X-HELO: mx0a-001b2d01.pphosted.com
X-IBM-Helo: d24dlp01.br.ibm.com
X-IBM-MailFrom: tuliom@linux.vnet.ibm.com
X-IBM-RcptTo: libc-alpha@sourceware.org
From: "Tulio Magno Quites Machado Filho" <tuliom@linux.vnet.ibm.com>
To: libc-alpha@sourceware.org
Cc: munroesj@linux.vnet.ibm.com, joseph@codesourcery.com
Subject: [PATCH] Add a new macro to mask a float
Date: Tue, 28 Jun 2016 16:27:53 -0300
X-TM-AS-MML: disable
X-Content-Scanned: Fidelis XPS MAILER
x-cbid: 16062819-0028-0000-0000-0000011CDAB8
X-IBM-AV-DETECTION: SAVI=unused REMOTE=unused XFE=unused
x-cbparentid: 16062819-0029-0000-0000-000013C140AF
Message-Id: <1467142073-13886-1-git-send-email-tuliom@linux.vnet.ibm.com>
X-Proofpoint-Virus-Version: vendor=fsecure engine=2.50.10432:, ,
	definitions=2016-06-28_12:, , signatures=0
X-Proofpoint-Spam-Details: rule=outbound_notspam policy=outbound score=0
	spamscore=0 suspectscore=4
	malwarescore=0 phishscore=0 adultscore=0 bulkscore=0 classifier=spam
	adjust=0 reason=mlx scancount=1 engine=8.0.1-1604210000
	definitions=main-1606280176

Defining a new macro allows architectures to provide more efficient
implementations than using a GET_FLOAT_WORD/SET_FLOAT_WORD pair.
As an example, POWER8 is able to mask the float directly in the VSX
without copying the data to a GPR and copying it back.

This patch introduces the new macro MASK_FLOAT.  The generic
implementation remains unchanged.

Tested on x86_64, ppc, ppc64, ppc64le and s390x.

2016-06-28  Tulio Magno Quites Machado Filho  <tuliom@linux.vnet.ibm.com>

	* sysdeps/generic/math_private.h (MASK_FLOAT): New macro.
	* sysdeps/ieee754/flt-32/e_acosf.c (__ieee754_acosf): Replace
	SET_FLOAT_WORD and GET_FLOAT_WORD sequence by MASK_FLOAT.
	* sysdeps/ieee754/flt-32/e_asinf.c (__ieee754_asinf): Likewise.
	* sysdeps/ieee754/flt-32/e_powf.c (__ieee754_powf): Likewise.
	* sysdeps/ieee754/flt-32/k_tanf.c (__kernel_tanf): Likewise.
	* sysdeps/ieee754/flt-32/s_modff.c (__modff): Likewise.
	* sysdeps/powerpc/powerpc64/power8/fpu/math_private.h: New file.
---
 sysdeps/generic/math_private.h                     | 14 +++++++
 sysdeps/ieee754/flt-32/e_acosf.c                   |  4 +-
 sysdeps/ieee754/flt-32/e_asinf.c                   |  4 +-
 sysdeps/ieee754/flt-32/e_powf.c                    | 18 +++------
 sysdeps/ieee754/flt-32/k_tanf.c                    |  7 +---
 sysdeps/ieee754/flt-32/s_modff.c                   |  4 +-
 .../powerpc/powerpc64/power8/fpu/math_private.h    | 47 ++++++++++++++++++++++
 7 files changed, 72 insertions(+), 26 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/power8/fpu/math_private.h

diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h
index cf1865d..21e272c 100644
--- a/sysdeps/generic/math_private.h
+++ b/sysdeps/generic/math_private.h
@@ -181,6 +181,20 @@ do {								\
 } while (0)
 #endif
 
+/* Apply an integer mask on a float.
+
+   The default implementation invokes the GET_FLOAT_WORD/SET_FLOAT_WORD
+   macro pair.  Note that this macro can only be used to apply an AND
+   mask supplied directly as a parameter.  */
+#ifndef MASK_FLOAT
+# define MASK_FLOAT(f,mask)		\
+do {					\
+  u_int32_t __tmp;			\
+  GET_FLOAT_WORD(__tmp, f);		\
+  SET_FLOAT_WORD(f, __tmp&mask);	\
+} while (0)
+#endif
+
 /* Get long double macros from a separate header.  */
 #include <math_ldbl.h>
 
diff --git a/sysdeps/ieee754/flt-32/e_acosf.c b/sysdeps/ieee754/flt-32/e_acosf.c
index 6f792f6..8b29e53 100644
--- a/sysdeps/ieee754/flt-32/e_acosf.c
+++ b/sysdeps/ieee754/flt-32/e_acosf.c
@@ -61,12 +61,10 @@ __ieee754_acosf(float x)
 	    w = r*s-pio2_lo;
 	    return pi - (float)2.0*(s+w);
 	} else {			/* x > 0.5 */
-	    int32_t idf;
 	    z = (one-x)*(float)0.5;
 	    s = __ieee754_sqrtf(z);
 	    df = s;
-	    GET_FLOAT_WORD(idf,df);
-	    SET_FLOAT_WORD(df,idf&0xfffff000);
+	    MASK_FLOAT(df,0xfffff000);
 	    c  = (z-df*df)/(s+df);
 	    p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
 	    q = one+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
diff --git a/sysdeps/ieee754/flt-32/e_asinf.c b/sysdeps/ieee754/flt-32/e_asinf.c
index 2ca2dbc..95e0a79 100644
--- a/sysdeps/ieee754/flt-32/e_asinf.c
+++ b/sysdeps/ieee754/flt-32/e_asinf.c
@@ -89,10 +89,8 @@ float __ieee754_asinf(float x)
 	if(ix>=0x3F79999A) {	/* if |x| > 0.975 */
 	    t = pio2_hi-(2.0f*(s+s*p)-pio2_lo);
 	} else {
-	    int32_t iw;
 	    w  = s;
-	    GET_FLOAT_WORD(iw,w);
-	    SET_FLOAT_WORD(w,iw&0xfffff000);
+	    MASK_FLOAT(w,0xfffff000);
 	    c  = (t-w*w)/(s+w);
 	    r  = p;
 	    p  = 2.0f*s*r-(pio2_lo-2.0f*c);
diff --git a/sysdeps/ieee754/flt-32/e_powf.c b/sysdeps/ieee754/flt-32/e_powf.c
index c72fe37..d62e877 100644
--- a/sysdeps/ieee754/flt-32/e_powf.c
+++ b/sysdeps/ieee754/flt-32/e_powf.c
@@ -136,8 +136,7 @@ __ieee754_powf(float x, float y)
 	    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
 	    v = t*ivln2_l-w*ivln2;
 	    t1 = u+v;
-	    GET_FLOAT_WORD(is,t1);
-	    SET_FLOAT_WORD(t1,is&0xfffff000);
+	    MASK_FLOAT(t1,0xfffff000);
 	    t2 = v-(t1-u);
 	} else {
 	    float s2,s_h,s_l,t_h,t_l;
@@ -163,8 +162,7 @@ __ieee754_powf(float x, float y)
 	    v = one/(ax+bp[k]);
 	    s = u*v;
 	    s_h = s;
-	    GET_FLOAT_WORD(is,s_h);
-	    SET_FLOAT_WORD(s_h,is&0xfffff000);
+	    MASK_FLOAT(s_h,0xfffff000);
 	/* t_h=ax+bp[k] High */
 	    SET_FLOAT_WORD (t_h,
 			    ((((ix>>1)|0x20000000)+0x00400000+(k<<21))
@@ -177,24 +175,21 @@ __ieee754_powf(float x, float y)
 	    r += s_l*(s_h+s);
 	    s2  = s_h*s_h;
 	    t_h = (float)3.0+s2+r;
-	    GET_FLOAT_WORD(is,t_h);
-	    SET_FLOAT_WORD(t_h,is&0xfffff000);
+	    MASK_FLOAT(t_h,0xfffff000);
 	    t_l = r-((t_h-(float)3.0)-s2);
 	/* u+v = s*(1+...) */
 	    u = s_h*t_h;
 	    v = s_l*t_h+t_l*s;
 	/* 2/(3log2)*(s+...) */
 	    p_h = u+v;
-	    GET_FLOAT_WORD(is,p_h);
-	    SET_FLOAT_WORD(p_h,is&0xfffff000);
+	    MASK_FLOAT(p_h,0xfffff000);
 	    p_l = v-(p_h-u);
 	    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
 	    z_l = cp_l*p_h+p_l*cp+dp_l[k];
 	/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
 	    t = (float)n;
 	    t1 = (((z_h+z_l)+dp_h[k])+t);
-	    GET_FLOAT_WORD(is,t1);
-	    SET_FLOAT_WORD(t1,is&0xfffff000);
+	    MASK_FLOAT(t1,0xfffff000);
 	    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
 	}
 
@@ -234,8 +229,7 @@ __ieee754_powf(float x, float y)
 	    p_h -= t;
 	}
 	t = p_l+p_h;
-	GET_FLOAT_WORD(is,t);
-	SET_FLOAT_WORD(t,is&0xfffff000);
+	MASK_FLOAT(t,0xfffff000);
 	u = t*lg2_h;
 	v = (p_l-(t-p_h))*lg2+t*lg2_l;
 	z = u+v;
diff --git a/sysdeps/ieee754/flt-32/k_tanf.c b/sysdeps/ieee754/flt-32/k_tanf.c
index 9f0e558..d805816 100644
--- a/sysdeps/ieee754/flt-32/k_tanf.c
+++ b/sysdeps/ieee754/flt-32/k_tanf.c
@@ -87,14 +87,11 @@ float __kernel_tanf(float x, float y, int iy)
 			   simply return -1.0/(x+r) here */
      /*  compute -1.0/(x+r) accurately */
 	    float a,t;
-	    int32_t i;
 	    z  = w;
-	    GET_FLOAT_WORD(i,z);
-	    SET_FLOAT_WORD(z,i&0xfffff000);
+	    MASK_FLOAT(z, 0xfffff000);
 	    v  = r-(z - x); 	/* z+v = r+x */
 	    t = a  = -(float)1.0/w;	/* a = -1.0/w */
-	    GET_FLOAT_WORD(i,t);
-	    SET_FLOAT_WORD(t,i&0xfffff000);
+	    MASK_FLOAT(t, 0xfffff000);
 	    s  = (float)1.0+t*z;
 	    return t+a*(s+t*v);
 	}
diff --git a/sysdeps/ieee754/flt-32/s_modff.c b/sysdeps/ieee754/flt-32/s_modff.c
index 23f6a90..491f50f 100644
--- a/sysdeps/ieee754/flt-32/s_modff.c
+++ b/sysdeps/ieee754/flt-32/s_modff.c
@@ -32,10 +32,8 @@ __modff(float x, float *iptr)
 	    } else {
 		i = (0x007fffff)>>j0;
 		if((i0&i)==0) {			/* x is integral */
-		    u_int32_t ix;
 		    *iptr = x;
-		    GET_FLOAT_WORD(ix,x);
-		    SET_FLOAT_WORD(x,ix&0x80000000);	/* return +-0 */
+		    MASK_FLOAT(x,0x80000000);	/* return +-0 */
 		    return x;
 		} else {
 		    SET_FLOAT_WORD(*iptr,i0&(~i));
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h b/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h
new file mode 100644
index 0000000..700e410
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/math_private.h
@@ -0,0 +1,47 @@
+/* Private inline math functions for POWER8.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Faster to do an in-place masking of the float number in the VSR
+   than move to GPR for the masking and back.  maskl, maskr, and maski
+   are used to convert the 32-bit "mask" parameter to a 64-bit mask
+   suitable for the internal representation of a scalar
+   single-precision floating point number in the Power8 processor.
+   Note: before applying the mask, xvmovdp is used to ensure f is
+   normalized.  */
+#define MASK_FLOAT(f, mask)					\
+  do {								\
+    long tmpmask = mask;					\
+    float tmpf = f;						\
+    long maskl = 0xc000000000000000;				\
+    long maskr = 0x3fffffffffffffff;				\
+    long maski = 0x3800000000000000;				\
+    tmpmask = tmpmask << 32;					\
+    tmpmask = ((tmpmask&maskl) | ((tmpmask&maskr)>>3) | maski);	\
+    union {							\
+      long l;							\
+      double d;							\
+    } umask = {.l = tmpmask};					\
+    __asm__ ("xvmovdp %x2, %x2\n\t"				\
+	     "xxland %x0, %x2, %1\n\t"				\
+	     : "=wa" (tmpf)					\
+	     : "d" (umask.d),					\
+	       "wa" (tmpf) );					\
+    f = tmpf;							\
+  } while(0)
+
+#include_next <math_private.h>