[4/8] math: Use sinf from CORE-MATH

Message ID 20260325192357.1284741-5-adhemerval.zanella@linaro.org (mailing list archive)
State Changes Requested
Headers
Series Add sinf/cosf/sincosf CORE-MATH implementations |

Commit Message

Adhemerval Zanella Netto March 25, 2026, 7:22 p.m. UTC
  The CORE-MATH implementation is correctly rounded (in any rounding mode)
and performs similarly.

* latency
- input: random (-2*pi, 2*pi)
               master        patched    improvement
x86_64        39.8085        49.1343        -23.43%
x86_64v2      40.0526        35.4431         11.51%
x86_64v3      35.3150        35.4092         -0.27%
aarch64       14.1862        13.2657          6.49%
power10        9.2147         7.9437         13.79%

- input: large (-64, 64)
               master        patched    improvement
x86_64        41.8170        50.5771        -20.95%
x86_64v2      42.0707        37.7077         10.37%
x86_64v3      37.1012        37.6129         -1.38%
aarch64       14.9308        15.7713         -5.63%
power10        9.7280         8.7833          9.71%

-input: huge (-1024, 1024)
               master        patched    improvement
x86_64        45.4243        50.4141        -10.98%
x86_64v2      45.6948        38.2110         16.38%
x86_64v3      40.9413        37.3504          8.77%
aarch64       17.8339        14.9379         16.24%
power10       10.8863         8.1787         24.87%

* reciprocal-throughput
- input: random (-2*pi, 2*pi)
               master        patched    improvement
x86_64        11.9412        18.2488        -52.82%
x86_64v2      12.0076        10.9426          8.87%
x86_64v3      11.4248        10.4730          8.33%
aarch64        7.6716         6.9755          9.07%
power10        4.5249         3.6755         18.77%

- input: large (-64, 64)
               master        patched    improvement
x86_64        11.8827        20.4191        -71.84%
x86_64v2      11.8933        12.5825         -5.79%
x86_64v3      11.3743        12.8875        -13.30%
aarch64        8.4026         8.1294          3.25%
power10        5.1247         4.2635         16.81%

- input :huge (-1024, 1024)
               master        patched    improvement
x86_64        15.2653        20.3141        -33.07%
x86_64v2      15.3004        12.5485         17.99%
x86_64v3      13.9678        13.0890         6.29%
aarch64        9.6422         7.2273        25.04%
power10        5.6171         3.7890        32.54%

The benchmarks were run on x64_64 (Ryzen 9 5900X, gcc 15.2.1)
with --disable-multi-arch disable; aarch64 (Neoverse-N1, gcc 15.2.1),
and powerpc (POWER10, gcc 15.2.1).

The code is adapted to glibc style and to use the math_config
definition.h (to handle errno, overflow, and underflow).

The x86_64-v1 shows some performance regression because the CORE-MATH
implementation relies on roundeven. The x86_64v2 and forward provide
a specific instruction, and x86_64 already provides an FMA ifunc variant.

Checked on x86_64-linux-gnu, i686-linux-gnu, aarch64-linux-gnu,
and powerpc64le-linux-gnu.
---
 SHARED-FILES                                  |   2 +
 sysdeps/ieee754/flt-32/libm-test-ulps         |  12 +
 sysdeps/ieee754/flt-32/s_sincosf_data.c       |   2 +
 sysdeps/ieee754/flt-32/s_sincosf_data.h       |  43 ++++
 .../ieee754/flt-32/s_sincosf_data_generic.c   |  71 ++++++
 sysdeps/ieee754/flt-32/s_sinf.c               | 235 +++++++++++++-----
 sysdeps/x86/fpu/s_sincosf_data.c              |   2 +
 7 files changed, 300 insertions(+), 67 deletions(-)
 create mode 100644 sysdeps/ieee754/flt-32/s_sincosf_data.h
 create mode 100644 sysdeps/ieee754/flt-32/s_sincosf_data_generic.c
  

Patch

diff --git a/SHARED-FILES b/SHARED-FILES
index 5a676e1a48..d4b5b30ae4 100644
--- a/SHARED-FILES
+++ b/SHARED-FILES
@@ -306,6 +306,8 @@  core-math:
   sysdeps/ieee754/flt-32/s_log1pf.c
   # src/binary32/log2p1/log2p1f.c revision 3fbe16be
   sysdeps/ieee754/flt-32/s_log2p1f.c
+  # src/binary32/sin/sinf.c revision 8ea8ea35
+  sysdeps/ieee754/flt-32/s_sinf.c
   # src/binary32/sinpi/sinpif.c, revision bbfabd99d
   sysdeps/ieee754/flt-32/s_sinpif.c
   # src/binary32/tan/tanf.c, revision 59d21d7
diff --git a/sysdeps/ieee754/flt-32/libm-test-ulps b/sysdeps/ieee754/flt-32/libm-test-ulps
index 15a7248d8b..2516f54a55 100644
--- a/sysdeps/ieee754/flt-32/libm-test-ulps
+++ b/sysdeps/ieee754/flt-32/libm-test-ulps
@@ -251,6 +251,18 @@  float: 0
 Function: "log2p1_upward":
 float: 0
 
+Function: "sin":
+float: 0
+
+Function: "sin_downward":
+float: 0
+
+Function: "sin_towardzero":
+float: 0
+
+Function: "sin_upward":
+float: 0
+
 Function: "sinh":
 float: 0
 
diff --git a/sysdeps/ieee754/flt-32/s_sincosf_data.c b/sysdeps/ieee754/flt-32/s_sincosf_data.c
index 2ece6db708..8c914a8bf6 100644
--- a/sysdeps/ieee754/flt-32/s_sincosf_data.c
+++ b/sysdeps/ieee754/flt-32/s_sincosf_data.c
@@ -72,3 +72,5 @@  const uint32_t __inv_pio4[24] =
   0x34ddc0db, 0xddc0db62, 0xc0db6295, 0xdb629599,
   0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041
 };
+
+#include <sysdeps/ieee754/flt-32/s_sincosf_data_generic.c>
diff --git a/sysdeps/ieee754/flt-32/s_sincosf_data.h b/sysdeps/ieee754/flt-32/s_sincosf_data.h
new file mode 100644
index 0000000000..02465ee931
--- /dev/null
+++ b/sysdeps/ieee754/flt-32/s_sincosf_data.h
@@ -0,0 +1,43 @@ 
+/* Compute sine and cosine of argument.
+   Copyright (C) 2018-2026 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _S_SINCOSF_DATA_H
+#define _S_SINCOSF_DATA_H
+
+extern const uint64_t __sinf_ipi[] attribute_hidden;
+#define IPI __sinf_ipi
+
+extern const double __sinf_b[] attribute_hidden;
+#define B __sinf_b
+extern const double __sinf_a[] attribute_hidden;
+#define A __sinf_a
+extern const double __sinf_tb[] attribute_hidden;
+#define TB __sinf_tb
+
+extern const struct
+{
+  union
+  {
+    float arg;
+    uint32_t uarg;
+  };
+  float rh, rl;
+} __sinf_st[4] attribute_hidden;
+#define ST __sinf_st
+
+#endif
diff --git a/sysdeps/ieee754/flt-32/s_sincosf_data_generic.c b/sysdeps/ieee754/flt-32/s_sincosf_data_generic.c
new file mode 100644
index 0000000000..f53251d3d8
--- /dev/null
+++ b/sysdeps/ieee754/flt-32/s_sincosf_data_generic.c
@@ -0,0 +1,71 @@ 
+/* Correctly-rounded sine of binary32 value.
+ 
+Copyright (c) 2022-2026 Alexei Sibidanov.
+ 
+The original version of this file was copied from the CORE-MATH
+project (file src/binary32/cosh/coshf.c, revision 8ea8ea35.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+const uint64_t __sinf_ipi[] =
+{
+  0xfe5163abdebbc562, 0xdb6295993c439041, 0xfc2757d1f534ddc0,
+  0xa2f9836e4e441529
+};
+
+const double __sinf_b[] =
+{
+   0x1.3bd3cc9be45dcp-6, -0x1.03c1f081b0833p-14,  0x1.55d3c6fc9ac1fp-24,
+  -0x1.e1d3ff281b40dp-35
+};
+const double __sinf_a[] =
+{
+   0x1.921fb54442d17p-3, -0x1.4abbce6256a39p-10,   0x1.466bc5a518c16p-19,
+  -0x1.32bdc61074ff6p-29
+};
+const double __sinf_tb[] =
+{
+   0x0p+0,                0x1.8f8b83c69a60bp-3,  0x1.87de2a6aea963p-2,
+   0x1.1c73b39ae68c8p-1,  0x1.6a09e667f3bcdp-1,  0x1.a9b66290ea1a3p-1,
+   0x1.d906bcf328d46p-1,  0x1.f6297cff75cbp-1,   0x1p+0,
+   0x1.f6297cff75cbp-1,   0x1.d906bcf328d46p-1,  0x1.a9b66290ea1a3p-1,
+   0x1.6a09e667f3bcdp-1,  0x1.1c73b39ae68c8p-1,  0x1.87de2a6aea963p-2,
+   0x1.8f8b83c69a60bp-3,  0x0p+0,               -0x1.8f8b83c69a60bp-3,
+  -0x1.87de2a6aea963p-2, -0x1.1c73b39ae68c8p-1, -0x1.6a09e667f3bcdp-1,
+  -0x1.a9b66290ea1a3p-1, -0x1.d906bcf328d46p-1, -0x1.f6297cff75cbp-1,
+  -0x1p+0,               -0x1.f6297cff75cbp-1,  -0x1.d906bcf328d46p-1,
+  -0x1.a9b66290ea1a3p-1, -0x1.6a09e667f3bcdp-1, -0x1.1c73b39ae68c8p-1,
+  -0x1.87de2a6aea963p-2, -0x1.8f8b83c69a60bp-3
+};
+
+const struct
+{
+  union
+  {
+    float arg;
+    uint32_t uarg;
+  };
+  float rh, rl;
+} __sinf_st[] = {
+  { { 0x1.33333p+13 }, -0x1.63f4bap-2, -0x1p-27 },
+  { { 0x1.75b8a2p-1 }, 0x1.55688ap-1, -0x1p-26 },
+  { { 0x1.4f0654p+0 }, 0x1.ee836cp-1, -0x1p-26 },
+  { { 0x1.2d97c8p+3 }, -0x1.99bc5ap-26, -0x1p-51 },
+};
diff --git a/sysdeps/ieee754/flt-32/s_sinf.c b/sysdeps/ieee754/flt-32/s_sinf.c
index 98845a3084..45d8a62a44 100644
--- a/sysdeps/ieee754/flt-32/s_sinf.c
+++ b/sysdeps/ieee754/flt-32/s_sinf.c
@@ -1,27 +1,37 @@ 
-/* Compute sine of argument.
-   Copyright (C) 2018-2026 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
+/* Correctly-rounded sine of binary32 value.
+ 
+Copyright (c) 2022-2026 Alexei Sibidanov.
+ 
+The original version of this file was copied from the CORE-MATH
+project (file src/binary32/cosh/coshf.c, revision 8ea8ea35.
 
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+ 
+#include <array_length.h>
 #include <stdint.h>
 #include <math.h>
 #include <math-barriers.h>
 #include <libm-alias-float.h>
 #include "math_config.h"
-#include "s_sincosf.h"
+#include <math_uint128.h>
+#include <s_sincosf_data.h>
 
 #ifndef SECTION
 # define SECTION
@@ -33,63 +43,154 @@ 
 # define SINF_FUNC SINF
 #endif
 
-/* Fast sinf implementation.  Worst-case ULP is 0.5607, maximum relative
-   error is 0.5303 * 2^-23.  A single-step range reduction is used for
-   small values.  Large inputs have their range reduced using fast integer
-   arithmetic.
-*/
-float
-SECTION
-SINF_FUNC (float y)
+static double __attribute__ ((noinline))
+rbig (uint32_t u, int *q)
 {
-  double x = y;
-  double s;
-  int n;
-  const sincos_t *p = &__sincosf_table[0];
-
-  if (abstop12 (y) < abstop12 (pio4))
+  int e = (u >> 23) & 0xff, i;
+  uint64_t m = (u & (~0u >> 9)) | 1 << 23;
+  u128 p0 = u128_mul (u128_from_u64 (m), u128_from_u64 (IPI[0]));
+  u128 p1 = u128_mul (u128_from_u64 (m), u128_from_u64 (IPI[1]));
+  p1 = u128_add (p1, u128_rshift (p0, 64));
+  u128 p2 = u128_mul (u128_from_u64 (m), u128_from_u64 (IPI[2]));
+  p2 = u128_add (p2, u128_rshift (p1, 64));
+  u128 p3 = u128_mul (u128_from_u64 (m), u128_from_u64 (IPI[3]));
+  p3 = u128_add (p3, u128_rshift (p2, 64));
+  uint64_t p3h = u128_high (p3), p3l = u128_low (p3), p2l = u128_low (p2),
+	   p1l = u128_low (p1);
+  int64_t a;
+  int k = e - 124, s = k - 23;
+  /* in cr_sinf(), rbig() is called in the case 127+28 <= e < 0xff
+     thus 155 <= e <= 254, which yields 28 <= k <= 127 and 5 <= s <= 104 */
+  if (s < 64)
     {
-      s = x * x;
-
-      if (__glibc_unlikely (abstop12 (y) < abstop12 (0x1p-12f)))
-      {
-	/* Force underflow for tiny y.  */
-	if (__glibc_unlikely (abstop12 (y) < abstop12 (0x1p-126f)))
-	  math_force_eval ((float)s);
-	return y;
-      }
-
-      return sinf_poly (x, s, p, 0);
+      i = p3h << s | p3l >> (64 - s);
+      a = p3l << s | p2l >> (64 - s);
     }
-  else if (__glibc_likely (abstop12 (y) < abstop12 (120.0f)))
+  else if (s == 64)
     {
-      x = reduce_fast (x, p, &n);
-
-      /* Setup the signs for sin and cos.  */
-      s = p->sign[n & 3];
-
-      if (n & 2)
-	p = &__sincosf_table[1];
-
-      return sinf_poly (x * s, x * x, p, n);
-    }
-  else if (abstop12 (y) < abstop12 (INFINITY))
-    {
-      uint32_t xi = asuint (y);
-      int sign = xi >> 31;
-
-      x = reduce_large (xi, &n);
-
-      /* Setup signs for sin and cos - include original sign.  */
-      s = p->sign[(n + sign) & 3];
-
-      if ((n + sign) & 2)
-	p = &__sincosf_table[1];
-
-      return sinf_poly (x * s, x * x, p, n);
+      i = p3l;
+      a = p2l;
     }
   else
-    return __math_invalidf (y);
+    { /* s > 64 */
+      i = p3l << (s - 64) | p2l >> (128 - s);
+      a = p2l << (s - 64) | p1l >> (128 - s);
+    }
+  int sgn = u;
+  sgn >>= 31;
+  int64_t sm = a >> 63;
+  i -= sm;
+  double z = (a ^ sgn) * 0x1p-64;
+  i = (i ^ sgn) - sgn;
+  *q = i;
+  return z;
+}
+
+static inline double
+rltl (float z, int *q)
+{
+  double x = z;
+  double idl = -0x1.b1bbead603d8bp-29 * x, idh = 0x1.45f306ep+2 * x,
+	 id = roundeven_finite (idh);
+  *q = asuint64 (0x1.8p52 + id);
+  return (idh - id) + idl;
+}
+
+static inline double
+rltl0 (double x, int *q)
+{
+  double idh = 0x1.45f306dc9c883p+2 * x, id = roundeven_finite (idh);
+  *q = asuint64 (0x1.8p52 + id);
+  return idh - id;
+}
+
+static inline float
+add_sign (float x, float rh, float rl)
+{
+  float sgn = copysignf (1.0f, x);
+  return sgn * rh + sgn * rl;
+}
+
+static float __attribute__ ((noinline))
+as_sinf_database (float x, double r)
+{
+  uint32_t ax = asuint (x) & (~0u >> 1);
+  for (unsigned i = 0; i < array_length (ST); i++)
+    if (__glibc_unlikely (ST[i].uarg == ax))
+      return add_sign (x, ST[i].rh, ST[i].rl);
+  return r;
+}
+
+static float __attribute__ ((noinline))
+as_sinf_big (float x)
+{
+  uint32_t t = asuint (x);
+  uint32_t ax = t << 1;
+  if (__glibc_unlikely (ax >= 0xffu << 24))
+    { // nan or +-inf
+      if (ax << 8)
+	return x + x; // nan
+      return __math_invalidf (x);
+    }
+  int ia;
+  double z = rbig (t, &ia);
+  double z2 = z * z, z4 = z2 * z2;
+  double aa = (A[0] + z2 * A[1]) + z4 * (A[2] + z2 * A[3]);
+  double bb = (B[0] + z2 * B[1]) + z4 * (B[2] + z2 * B[3]);
+  double s0 = TB[ia & 31], c0 = TB[(ia + 8u) & 31];
+  double r = s0 + z * (aa * c0 - bb * (z * s0));
+  return r;
+}
+
+float SECTION
+SINF_FUNC (float x)
+{
+  uint32_t ax = asuint (x) << 1;
+  int ia;
+  double z0 = x, z;
+  if (__glibc_unlikely (ax > 0x99000000u || ax < 0x73000000u))
+    {
+      // |x| > 0x1p+26 or |x| < 0x1p-12
+      if (__glibc_likely (ax < 0x73000000u))
+	{ // |x| < 0x1p-12
+	  if (__glibc_unlikely (ax < 0x66000000u))
+	    { // |x| < 0x1p-25
+	      if (__glibc_unlikely (ax == 0u))
+		return x;
+	      float res = fmaf (-x, fabsf (x), x);
+	      /* The Taylor expansion of sin(x) at x=0 is x - x^3/6 + o(x^3).
+		 For |x| > 2^-126 we have no underflow, whatever the rounding
+		 mode. For |x| < 2^-126, since |sin(x)| < |x|, we always have
+		 underflow. For |x| = 2^-126, we have underflow for rounding
+		 towards zero, i.e., when sin(x) rounds to nextbelow(2^-126).
+		 In summary, we have underflow whenever |x|<2^-126 or
+		 |res|<2^-126. */
+	      if (fabsf (x) < 0x1p-126f || fabsf (res) < 0x1p-126f)
+		return __math_erangef (res); // underflow
+	      return res;
+	    }
+	  return (-0x1.555556p-3f * x) * (x * x) + x;
+	}
+      return as_sinf_big (x);
+    }
+  if (__glibc_likely (ax < 0x822d97c8u))
+    {
+      if (__glibc_unlikely (ax == 0x7e75b8a2u || ax == 0x7f4f0654u))
+	return as_sinf_database (x, 0.0);
+      z = rltl0 (z0, &ia);
+    }
+  else
+    {
+      if (__glibc_unlikely (ax == 0x8c333330u))
+	return as_sinf_database (x, 0.0);
+      z = rltl (z0, &ia);
+    }
+  double z2 = z * z, z4 = z2 * z2;
+  double aa = (A[0] + z2 * A[1]) + z4 * (A[2] + z2 * A[3]);
+  double bb = (B[0] + z2 * B[1]) + z4 * (B[2] + z2 * B[3]);
+  double s0 = TB[ia & 31], c0 = TB[(ia + 8) & 31];
+  double r = s0 + aa * (z * c0) - bb * (z2 * s0);
+  return r;
 }
 
 #ifndef SINF
diff --git a/sysdeps/x86/fpu/s_sincosf_data.c b/sysdeps/x86/fpu/s_sincosf_data.c
index 2a62c7a4eb..8e96ecb86e 100644
--- a/sysdeps/x86/fpu/s_sincosf_data.c
+++ b/sysdeps/x86/fpu/s_sincosf_data.c
@@ -66,3 +66,5 @@  const uint32_t __inv_pio4[24] =
   0x34ddc0db, 0xddc0db62, 0xc0db6295, 0xdb629599,
   0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041
 };
+
+#include <sysdeps/ieee754/flt-32/s_sincosf_data_generic.c>