From patchwork Mon Jul 31 08:49:34 2017
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Rajalakshmi S <raji@linux.vnet.ibm.com>
X-Patchwork-Id: 21817
X-Patchwork-Delegate: tuliom@linux.vnet.ibm.com
Received: (qmail 23315 invoked by alias); 31 Jul 2017 08:50:00 -0000
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <libc-alpha.sourceware.org>
List-Unsubscribe: <mailto:libc-alpha-unsubscribe-##L=##H@sourceware.org>
List-Subscribe: <mailto:libc-alpha-subscribe@sourceware.org>
List-Archive: <http://sourceware.org/ml/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-help@sourceware.org>,
	<http://sourceware.org/ml/#faqs>
Sender: libc-alpha-owner@sourceware.org
Delivered-To: mailing list libc-alpha@sourceware.org
Received: (qmail 23196 invoked by uid 89); 31 Jul 2017 08:49:58 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-24.0 required=5.0 tests=AWL, BAYES_00,
	GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3,
	KAM_LAZY_DOMAIN_SECURITY,
	RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.2 spammy=sine, cc0
X-HELO: mx0a-001b2d01.pphosted.com
From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
To: libc-alpha@sourceware.org
Cc: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Subject: [PATCH 1/3] powerpc: Add a POWER8-optimized version of sincosf()
Date: Mon, 31 Jul 2017 14:19:34 +0530
In-Reply-To: <1501490976-25746-1-git-send-email-raji@linux.vnet.ibm.com>
References: <1501490976-25746-1-git-send-email-raji@linux.vnet.ibm.com>
X-TM-AS-MML: disable
x-cbid: 17073108-0040-0000-0000-0000034BFB45
X-IBM-AV-DETECTION: SAVI=unused REMOTE=unused XFE=unused
x-cbparentid: 17073108-0041-0000-0000-00000CC87D92
Message-Id: <1501490976-25746-2-git-send-email-raji@linux.vnet.ibm.com>
X-Proofpoint-Virus-Version: vendor=fsecure engine=2.50.10432:, ,
	definitions=2017-07-31_03:, , signatures=0
X-Proofpoint-Spam-Details: rule=outbound_notspam policy=outbound score=0
	spamscore=0 suspectscore=4
	malwarescore=0 phishscore=0 adultscore=0 bulkscore=0 classifier=spam
	adjust=0 reason=mlx scancount=1 engine=8.0.1-1706020000
	definitions=main-1707310153

This implementation is based on
sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf-power8.S
sysdeps/powerpc/powerpc64/fpu/multiarch/s_cosf-power8.S.

2017-07-31  Paul Clarke <pc@us.ibm.com>
            Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
	[$(subdir) = math] (libm-sysdep_routines): Add s_sincosf-power8
	and s_sincosf-ppc64.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-power8.c:
	New file.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-ppc64.c:
	Likewise.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf.c: Likewise.
	* sysdeps/powerpc/powerpc64/power8/fpu/s_sincosf.c: Likewise.
---
 sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile   |   3 +-
 .../powerpc64/fpu/multiarch/s_sincosf-power8.c     |  26 +++
 .../powerpc64/fpu/multiarch/s_sincosf-ppc64.c      |  26 +++
 .../powerpc/powerpc64/fpu/multiarch/s_sincosf.c    |  31 +++
 sysdeps/powerpc/powerpc64/power8/fpu/s_sincosf.c   | 260 +++++++++++++++++++++
 5 files changed, 345 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-power8.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-ppc64.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf.c
 create mode 100644 sysdeps/powerpc/powerpc64/power8/fpu/s_sincosf.c

diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
index d6f14f360a..e570f087e8 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
@@ -27,7 +27,8 @@ libm-sysdep_routines += s_isnan-power7 s_isnan-power6x s_isnan-power6 \
 			s_llrint-power8 s_llround-power8 s_llroundf-ppc64 \
 			e_expf-power8 e_expf-ppc64 \
 			s_sinf-ppc64 s_sinf-power8 \
-			s_cosf-ppc64 s_cosf-power8
+			s_cosf-ppc64 s_cosf-power8 \
+			s_sincosf-ppc64 s_sincosf-power8
 
 CFLAGS-s_logbf-power7.c = -mcpu=power7
 CFLAGS-s_logbl-power7.c = -mcpu=power7
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-power8.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-power8.c
new file mode 100644
index 0000000000..7ebcdbd6a4
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-power8.c
@@ -0,0 +1,26 @@
+/* sincosf function.  PowerPC64/power8 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+
+#undef weak_alias
+#define weak_alias(a,b)
+
+#define __sincosf __sincosf_power8
+
+#include <sysdeps/powerpc/powerpc64/power8/fpu/s_sincosf.c>
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-ppc64.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-ppc64.c
new file mode 100644
index 0000000000..e261bdc464
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-ppc64.c
@@ -0,0 +1,26 @@
+/* sincosf function.  PowerPC64 default version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef weak_alias
+#define weak_alias(a, b)
+
+#define __sincosf __sincosf_ppc64
+
+#include <sysdeps/ieee754/flt-32/s_sincosf.c>
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf.c
new file mode 100644
index 0000000000..ef71ea443f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf.c
@@ -0,0 +1,31 @@
+/* Multiple versions of sincosf.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+#include <shlib-compat.h>
+#include "init-arch.h"
+
+extern __typeof (__sincosf) __sincosf_ppc64 attribute_hidden;
+extern __typeof (__sincosf) __sincosf_power8 attribute_hidden;
+
+libc_ifunc (__sincosf,
+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+	    ? __sincosf_power8
+	    : __sincosf_ppc64);
+
+weak_alias (__sincosf, sincosf)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_sincosf.c b/sysdeps/powerpc/powerpc64/power8/fpu/s_sincosf.c
new file mode 100644
index 0000000000..b61db3aac2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_sincosf.c
@@ -0,0 +1,260 @@
+/* Compute sine and cosine of argument.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+#include <errno.h>
+#include <math_private.h>
+
+/* Chebyshev constants for sin & cos, range -PI/4 - PI/4.  */
+static const double C0 = -0x1.ffffffffe98ae000p-2;
+static const double C1 = 0x1.55555545c50c7000p-5;
+static const double C2 = -0x1.6c16b348b6874000p-10;
+static const double C3 = 0x1.a00eb9ac43cc0000p-16;
+static const double C4 = -0x1.23c97dd8844d7000p-22;
+static const double CC0 = -0x1.fffffff5cc6fd000p-2;
+static const double CC1 = 0x1.55514b178dac5000p-5;
+static const double S0 = -0x1.5555555551cd9000p-3;
+static const double S1 = 0x1.1111110c2688b000p-7;
+static const double S2 = -0x1.a019f8b4bd1f9000p-13;
+static const double S3 = 0x1.71d7264e6b5b4000p-19;
+static const double S4 = -0x1.a947e1674b58a000p-26;
+static const double SS0 = -0x1.555555543d49d000p-3;
+static const double SS1 = 0x1.110f475cec8c5000p-7;
+static const double SMALL = 0x1.0000000000000000p-50;
+static const double inv_PI_4 = 0x1.45f306dc9c883000p+0;
+static const double PI_2_hi = -0x1.921fb54400000000p+0;
+static const double PI_2_lo = -0x1.0b4611a626332000p-34;
+
+#define FLOAT_EXPONENT_SHIFT 23
+#define FLOAT_EXPONENT_BIAS 127
+
+static const double pio2_table[] = {
+  0 * M_PI_2,
+  1 * M_PI_2,
+  2 * M_PI_2,
+  3 * M_PI_2,
+  4 * M_PI_2,
+  5 * M_PI_2,
+  6 * M_PI_2,
+  7 * M_PI_2,
+  8 * M_PI_2,
+  9 * M_PI_2,
+  10 * M_PI_2
+};
+
+static const double invpio4_table[] = {
+  0x0.0000000000000000p+0,
+  0x1.45f306c000000000p+0,
+  0x1.c9c882a000000000p-28,
+  0x1.4fe13a8000000000p-58,
+  0x1.f47d4d0000000000p-85,
+  0x1.bb81b6c000000000p-112,
+  0x1.4acc9e0000000000p-142,
+  0x1.0e4107c000000000p-169,
+  0x1.ca2c756000000000p-196,
+  0x1.bd778ac000000000p-224,
+  0x1.b7246e0000000000p-255,
+  0x1.d2126e8000000000p-282,
+  0x1.7003248000000000p-310,
+  0x1.77504e8000000000p-338,
+  0x1.921cfe0000000000p-367,
+  0x1.deb1cb0000000000p-395,
+  0x1.29a73e0000000000p-423,
+  0x1.d1046be000000000p-448,
+  0x1.4baed10000000000p-477,
+  0x1.09d338e000000000p-504,
+  0x1.35a2f80000000000p-538,
+  0x1.f904e64000000000p-561,
+  0x1.d639830000000000p-591,
+  0x1.4ce7d24000000000p-617,
+  0x1.908bf16000000000p-644
+};
+
+static const int ones[] = { +1, -1 };
+
+static inline void
+reduced (const double theta, const unsigned long n, float *const sinx,
+	 float *const cosx, const unsigned long signbit)
+{
+  double sx, cx, theta2;
+  /* We are operating on |x|, so we need to add back the original
+   * signbit for sinf.  */
+  int sign_sin, sign_cos;
+  sign_sin = ones[((n >> 2) & 1) ^ signbit];
+  sign_cos = ones[((n + 2) >> 2) & 1];
+  theta2 = theta * theta;
+  /* Chebyshev polynomial of the form for sin and cos:
+   * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).
+   * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+  sx = S3 + theta2 * S4;	/* S3+x^2*S4.  */
+  sx = S2 + theta2 * sx;	/* S2+x^2*(S3+x^2*S4).  */
+  sx = S1 + theta2 * sx;	/* S1+x^2*(S2+x^2*(S3+x^2*S4)).  */
+  sx = S0 + theta2 * sx;	/* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))).  */
+  /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
+  sx = theta + theta * theta2 * sx;
+
+  cx = C3 + theta2 * C4;	/* C3+x^2*C4.  */
+  cx = C2 + theta2 * cx;	/* C2+x^2*(C3+x^2*C4).  */
+  cx = C1 + theta2 * cx;	/* C1+x^2*(C2+x^2*(C3+x^2*C4)).  */
+  cx = C0 + theta2 * cx;	/* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))).  */
+  /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+  cx = 1.0 + theta2 * cx;
+
+  /* Add in the signbit and assign the result.  */
+  if ((n & 2) == 0)
+    {
+      *sinx = sign_sin * sx;
+      *cosx = sign_cos * cx;
+    }
+  else
+    {
+      *sinx = sign_sin * cx;
+      *cosx = sign_cos * sx;
+    }
+}
+
+void
+__sincosf (float x, float *sinx, float *cosx)
+{
+  double cx;
+  double theta = x;
+  double abstheta = fabs (theta);
+  /*  if |x|< Pi/4.  */
+  if (abstheta < M_PI_4)
+    {
+      if (abstheta >= +0.03125)	/* |x| >= 2^-5.  */
+	{
+	  double theta2 = theta * theta;
+	  /* Chebyshev polynomial of the form for sin and cos.  */
+	  cx = C3 + theta2 * C4;
+	  cx = C2 + theta2 * cx;
+	  cx = C1 + theta2 * cx;
+	  cx = C0 + theta2 * cx;
+	  cx = 1.0 + theta2 * cx;
+	  *cosx = cx;
+	  cx = S3 + theta2 * S4;
+	  cx = S2 + theta2 * cx;
+	  cx = S1 + theta2 * cx;
+	  cx = S0 + theta2 * cx;
+	  cx = theta + theta * theta2 * cx;
+	  *sinx = cx;
+	}
+      else if (abstheta >= 0x1p-27)	/* |x| >= 2^-27.  */
+	{
+	  /* A simpler Chebyshev approximation is close enough for this range:
+	   * for sin: x+x^3*(SS0+x^2*SS1)
+	   * for cos: 1.0+x^2*(CC0+x^3*CC1).  */
+	  double theta2 = theta * theta;
+	  cx = CC0 + theta * theta2 * CC1;
+	  cx = 1.0 + theta2 * cx;
+	  *cosx = cx;
+	  cx = SS0 + theta2 * SS1;
+	  cx = theta + theta * theta2 * cx;
+	  *sinx = cx;
+	}
+      else
+	{
+	  /* Handle some special cases.  */
+	  if (theta)
+	    *sinx = theta - (theta * SMALL);
+	  else
+	    *sinx = theta;
+	  *cosx = 1.0 - abstheta;
+	}
+    }
+  else				/* |x| >= Pi/4.  */
+    {
+      unsigned long signbit = (x < 0);
+      if (abstheta < 9 * M_PI_4)	/* |x| < 9*Pi/4.  */
+	{
+	  unsigned long n = (abstheta * inv_PI_4) + 1;
+	  theta = abstheta - pio2_table[n / 2];
+	  reduced (theta, n, sinx, cosx, signbit);
+	}
+      else if (abstheta < INFINITY)
+	{
+	  if (abstheta < 8388608.0)	/* |x| < 2^23.  */
+	    {
+	      unsigned long n = floor (abstheta * inv_PI_4) + 1.0;
+	      double x = floor (n / 2.0);
+	      theta = x * PI_2_lo + (x * PI_2_hi + abstheta);
+	      /* Argument reduction needed.  */
+	      reduced (theta, n, sinx, cosx, signbit);
+	    }
+	  else			/* |x| >= 2^23.  */
+	    {
+	      x = fabs (x);
+	      int32_t exponent;
+	      GET_FLOAT_WORD (exponent, x);
+	      exponent =
+		(exponent >> FLOAT_EXPONENT_SHIFT) - FLOAT_EXPONENT_BIAS;
+	      exponent += 3;
+	      exponent = (exponent * (0x100000000 / 28 + 1)) >> 32;
+	      double a = invpio4_table[exponent] * x;
+	      double b = invpio4_table[exponent + 1] * x;
+	      double c = invpio4_table[exponent + 2] * x;
+	      double d = invpio4_table[exponent + 3] * x;
+	      unsigned long l = a;
+	      l &= ~0x7;
+	      a -= l;
+	      double e = a + b;
+	      l = e;
+	      e = a - l;
+	      if (l & 1)
+		{
+		  e -= 1.0;
+		  e += b;
+		  e += c;
+		  e += d;
+		  e *= M_PI_4;
+		  reduced (e, l + 1, sinx, cosx, signbit);
+		}
+	      else
+		{
+		  e += b;
+		  e += c;
+		  e += d;
+		  if (e <= 1.0)
+		    {
+		      e *= M_PI_4;
+		      reduced (e, l + 1, sinx, cosx, signbit);
+		    }
+		  else
+		    {
+		      l++;
+		      e -= 2.0;
+		      e *= M_PI_4;
+		      reduced (e, l + 1, sinx, cosx, signbit);
+		    }
+		}
+	    }
+	}
+      else
+	{
+	  int32_t ix;
+	  /* High word of x.  */
+	  GET_FLOAT_WORD (ix, abstheta);
+	  /* sin/cos(Inf or NaN) is NaN.  */
+	  *sinx = *cosx = x - x;
+	  if (ix == 0x7f800000)
+	    __set_errno (EDOM);
+	}
+    }
+}
+
+weak_alias (__sincosf, sincosf)