[5/8] math: Use cosf from CORE-MATH

Message ID 20260325192357.1284741-6-adhemerval.zanella@linaro.org (mailing list archive)
State Changes Requested
Headers
Series Add sinf/cosf/sincosf CORE-MATH implementations |

Commit Message

Adhemerval Zanella Netto March 25, 2026, 7:22 p.m. UTC
  The CORE-MATH implementation is correctly rounded (in any rounding mode)
and performs similarly:

* latency
- input: random (-2*pi, 2*pi)
latency        master        patched   improvement
x86_64        39.9757        48.0301       -20.15%
x86_64v2      38.7156        35.6435         7.94%
x86_64v3      35.3454        34.5288         2.31%
aarch64       14.2015        13.7825         2.95%
power10        9.1381         7.8624        13.96%

- input: large (-64, 64)
latency        master        patched   improvement
x86_64        41.8186        48.9696       -17.10%
x86_64v2      40.4777        38.0614         5.97%
x86_64v3      37.0052        37.4971        -1.33%
aarch64       14.7948        15.7752        -6.63%
power10        9.7240         8.9764         7.69%

- input: huge (-1024, 1024)
latency        master        patched   improvement
x86_64        45.3777        49.6575        -9.43%
x86_64v2      43.6480        39.6503         9.16%
x86_64v3      40.9541        37.5966         8.20%
aarch64       17.9758        14.9358        16.91%
power10       10.6854         8.1665        23.57%

* reciprocal-throughput
- input: random (-2*pi, 2*pi)
               master        patched   improvement
x86_64        11.8433        17.9805       -51.82%
x86_64v2      12.6771        10.9479        13.64%
x86_64v3      11.1440        10.9951         1.34%
aarch64        7.5800        6.8004         10.29%
power10        4.4510         3.5792        19.59%

- input: rlarge (-64, 64)
               master        patched    improvement
x86_64        11.9478        18.5297       -55.09%
x86_64v2      12.7393        12.8142        -0.59%
x86_64v3      11.1613        12.7013       -13.80%
aarch64        8.3857        8.1288          3.06%
power10        5.2212         4.3630        16.44%

- input: huge (-1024, 1024)
               master        patched   improvement
x86_64        15.2741        18.3418       -20.08%
x86_64v2      15.1064        13.0792        13.42%
x86_64v3      14.6651        12.6774        13.55%
aarch64        9.6304        7.2278         24.95%
power10        5.0207         3.6809        26.69%

The benchmarks were run on x64_64 (Ryzen 9 5900X, gcc 15.2.1)
with --disable-multi-arch disable; aarch64 (Neoverse-N1, gcc 15.2.1),
and powerpc (POWER10, gcc 15.2.1).

The code is adapted to glibc style and to use the math_config
definition.h (to handle errno, overflow, and underflow).

The x86_64-v1 shows some performance regression because the CORE-MATH
implementation relies on roundeven. The x86_64v2 and forward provide
a specific instruction, and x86_64 already provides an FMA ifunc variant.

Checked on x86_64-linux-gnu, i686-linux-gnu, aarch64-linux-gnu,
and powerpc64le-linux-gnu.
---
 SHARED-FILES                                  |   2 +
 sysdeps/ieee754/flt-32/libm-test-ulps         |  12 +
 sysdeps/ieee754/flt-32/s_cosf.c               | 220 +++++++++++++-----
 sysdeps/ieee754/flt-32/s_sincosf_data.h       |  10 +-
 .../ieee754/flt-32/s_sincosf_data_generic.c   |  35 ++-
 5 files changed, 205 insertions(+), 74 deletions(-)
  

Patch

diff --git a/SHARED-FILES b/SHARED-FILES
index d4b5b30ae4..751b31fae4 100644
--- a/SHARED-FILES
+++ b/SHARED-FILES
@@ -288,6 +288,8 @@  core-math:
   sysdeps/ieee754/flt-32/s_atanpif.c
   # src/binary32/cbrt/cbrtf.c, revision f7c7408d
   sysdeps/ieee754/flt-32/s_cbrtf.c
+  # src/binary32/cos/cosf.c revision 8ea8ea35
+  sysdeps/ieee754/flt-32/s_cosf.c
   # src/binary32/cospi/cospif.c, revision bbfabd99
   sysdeps/ieee754/flt-32/s_cospif.c
   # src/binary32/erfc/erfcf.c revision d0a2be20
diff --git a/sysdeps/ieee754/flt-32/libm-test-ulps b/sysdeps/ieee754/flt-32/libm-test-ulps
index 2516f54a55..0179223018 100644
--- a/sysdeps/ieee754/flt-32/libm-test-ulps
+++ b/sysdeps/ieee754/flt-32/libm-test-ulps
@@ -95,6 +95,18 @@  float: 0
 Function: "cbrt_upward":
 float: 0
 
+Function: "cos":
+float: 0
+
+Function: "cos_downward":
+float: 0
+
+Function: "cos_towardzero":
+float: 0
+
+Function: "cos_upward":
+float: 0
+
 Function: "cosh":
 float: 0
 
diff --git a/sysdeps/ieee754/flt-32/s_cosf.c b/sysdeps/ieee754/flt-32/s_cosf.c
index be98f8bb3b..2bef3b3036 100644
--- a/sysdeps/ieee754/flt-32/s_cosf.c
+++ b/sysdeps/ieee754/flt-32/s_cosf.c
@@ -1,90 +1,182 @@ 
-/* Compute cosine of argument.
-   Copyright (C) 2018-2026 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
+/* Correctly-rounded cosine of binary32 value.
 
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
+Copyright (c) 2022-2026 Alexei Sibidanov.
 
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
+The original version of this file was copied from the CORE-MATH
+project (file src/binary32/cosh/coshf.c, revision 8ea8ea35.
 
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
+Permission is hereby granted, free of charge, to any person obtaining A copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include <array_length.h>
 #include <stdint.h>
 #include <math.h>
 #include <math-barriers.h>
 #include <libm-alias-float.h>
 #include "math_config.h"
-#include "s_sincosf.h"
+#include <math_uint128.h>
+#include <s_sincosf_data.h>
+
+static double __attribute__ ((noinline))
+rbig (uint32_t u, int *q)
+{
+  int e = (u >> 23) & 0xff, i;
+  uint64_t m = (u & (~0u >> 9)) | 1 << 23;
+  u128 p0 = u128_mul (u128_from_u64 (m), u128_from_u64 (IPI[0]));
+  u128 p1 = u128_mul (u128_from_u64 (m), u128_from_u64 (IPI[1]));
+  p1 = u128_add (p1, u128_rshift (p0, 64));
+  u128 p2 = u128_mul (u128_from_u64 (m), u128_from_u64 (IPI[2]));
+  p2 = u128_add (p2, u128_rshift (p1, 64));
+  u128 p3 = u128_mul (u128_from_u64 (m), u128_from_u64 (IPI[3]));
+  p3 = u128_add (p3, u128_rshift (p2, 64));
+  uint64_t p3h = u128_high (p3), p3l = u128_low (p3), p2l = u128_low (p2),
+	   p1l = u128_low (p1);
+  int64_t A;
+  int k = e - 124, s = k - 23;
+  /* in cr_sinf(), rbig() is called in the case 127+28 <= e < 0xff
+     thus 155 <= e <= 254, which yields 28 <= k <= 127 and 5 <= s <= 104 */
+  if (s < 64)
+    {
+      i = p3h << s | p3l >> (64 - s);
+      A = p3l << s | p2l >> (64 - s);
+    }
+  else if (s == 64)
+    {
+      i = p3l;
+      A = p2l;
+    }
+  else
+    { /* s > 64 */
+      i = p3l << (s - 64) | p2l >> (128 - s);
+      A = p2l << (s - 64) | p1l >> (128 - s);
+    }
+  int sgn = u;
+  sgn >>= 31;
+  int64_t sm = A >> 63;
+  i -= sm;
+  double z = (A ^ sgn) * 0x1p-64;
+  i = (i ^ sgn) - sgn;
+  *q = i;
+  return z;
+}
+
+static inline double
+rltl (float z, int *q)
+{
+  double x = z;
+  double idl = -0x1.b1bbead603d8bp-29 * x, idh = 0x1.45f306ep+2 * x,
+	 id = roundeven_finite (idh);
+  *q = asuint64 (0x1.8p52 + id);
+  return (idh - id) + idl;
+}
+
+static inline double
+rltl0 (double x, int *q)
+{
+  double idh = 0x1.45f306dc9c883p+2 * x, id = roundeven_finite (idh);
+  *q = asuint64 (0x1.8p52 + id);
+  return idh - id;
+}
+
+static float __attribute__ ((noinline))
+as_cosf_database (float x, double r)
+{
+  uint32_t t = asuint (x);
+  uint32_t ax = t & (~0u >> 1);
+  for (unsigned i = 0; i < array_length (ST_COSF); i++)
+    if (__glibc_unlikely (ST_COSF[i].uarg == ax))
+      return ST_COSF[i].rh + ST_COSF[i].rl;
+  return r;
+}
+
+static float __attribute__ ((noinline))
+as_cosf_big (float x)
+{
+  uint32_t t = asuint (x);
+  uint32_t ax = t << 1;
+  if (__glibc_unlikely (ax >= 0xffu << 24))
+    { // nan or +-inf
+      if (ax << 8)
+	return x + x; // nan
+      return __math_invalidf (x);
+    }
+  int ia;
+  double z = rbig (t, &ia);
+  double z2 = z * z, z4 = z2 * z2;
+  double aa = (A[0] + z2 * A[1]) + z4 * (A[2] + z2 * A[3]);
+  double bb = (B[0] + z2 * B[1]) + z4 * (B[2] + z2 * B[3]);
+  double s0 = TB_COSF[(ia + 8u) & 31], c0 = TB_COSF[ia & 31];
+  double r = c0 + z * (aa * s0 - bb * (z * c0));
+  uint32_t tr = asuint64 (r);
+  uint64_t tail = (tr + 6) & (~UINT64_C(0) >> 36);
+  if (__glibc_unlikely (tail <= 12))
+    return as_cosf_database (x, r);
+  return r;
+}
 
 #ifndef SECTION
-# define SECTION
+#  define SECTION
 #endif
 
 #ifndef COSF
-# define COSF_FUNC __cosf
+#  define COSF_FUNC __cosf
 #else
-# define COSF_FUNC COSF
+#  define COSF_FUNC COSF
 #endif
 
-/* Fast cosf implementation.  Worst-case ULP is 0.5607, maximum relative
-   error is 0.5303 * 2^-23.  A single-step range reduction is used for
-   small values.  Large inputs have their range reduced using fast integer
-   arithmetic.
-*/
-float
-SECTION
-COSF_FUNC (float y)
+float SECTION
+COSF_FUNC (float x)
 {
-  double x = y;
-  double s;
-  int n;
-  const sincos_t *p = &__sincosf_table[0];
-
-  if (abstop12 (y) < abstop12 (pio4))
+  uint32_t t = asuint (x);
+  uint32_t ax = t << 1;
+  int ia;
+  double z0 = x, z;
+  if (__glibc_unlikely (ax > 0x99000000u || ax < 0x73000000))
     {
-      double x2 = x * x;
-
-      if (__glibc_unlikely (abstop12 (y) < abstop12 (0x1p-12f)))
-	return 1.0f;
-
-      return sinf_poly (x, x2, p, 1);
+      if (__glibc_likely (ax < 0x73000000))
+	{
+	  if (__glibc_unlikely (ax < 0x66000000u))
+	    {
+	      if (__glibc_unlikely (ax == 0u))
+		return 1.0f;
+	      return 1.0f - 0x1p-25f;
+	    }
+	  return -0x1p-1f * x * x + 1.0f;
+	}
+      return as_cosf_big (x);
     }
-  else if (__glibc_likely (abstop12 (y) < abstop12 (120.0f)))
+  if (__glibc_likely (ax < 0x82a41896u))
     {
-      x = reduce_fast (x, p, &n);
-
-      /* Setup the signs for sin and cos.  */
-      s = p->sign[n & 3];
-
-      if (n & 2)
-	p = &__sincosf_table[1];
-
-      return sinf_poly (x * s, x * x, p, n ^ 1);
-    }
-  else if (abstop12 (y) < abstop12 (INFINITY))
-    {
-      uint32_t xi = asuint (y);
-      int sign = xi >> 31;
-
-      x = reduce_large (xi, &n);
-
-      /* Setup signs for sin and cos - include original sign.  */
-      s = p->sign[(n + sign) & 3];
-
-      if ((n + sign) & 2)
-	p = &__sincosf_table[1];
-
-      return sinf_poly (x * s, x * x, p, n ^ 1);
+      if (__glibc_unlikely (ax == 0x812d97c8u))
+	return as_cosf_database (x, 0.0);
+      z = rltl0 (z0, &ia);
     }
   else
-    return __math_invalidf (y);
+    {
+      z = rltl (z0, &ia);
+    }
+  double z2 = z * z, z4 = z2 * z2;
+  double aa = (A[0] + z2 * A[1]) + z4 * (A[2] + z2 * A[3]);
+  double bb = (B[0] + z2 * B[1]) + z4 * (B[2] + z2 * B[3]);
+  double c0 = TB_COSF[ia & 31], s0 = TB_COSF[(ia + 8) & 31];
+  double r = c0 + aa * (z * s0) - bb * (z2 * c0);
+  return r;
 }
 
 #ifndef COSF
diff --git a/sysdeps/ieee754/flt-32/s_sincosf_data.h b/sysdeps/ieee754/flt-32/s_sincosf_data.h
index 02465ee931..8bea0042f3 100644
--- a/sysdeps/ieee754/flt-32/s_sincosf_data.h
+++ b/sysdeps/ieee754/flt-32/s_sincosf_data.h
@@ -29,7 +29,10 @@  extern const double __sinf_a[] attribute_hidden;
 extern const double __sinf_tb[] attribute_hidden;
 #define TB __sinf_tb
 
-extern const struct
+extern const double __cosf_tb[] attribute_hidden;
+#define TB_COSF __cosf_tb
+
+typedef struct
 {
   union
   {
@@ -37,7 +40,10 @@  extern const struct
     uint32_t uarg;
   };
   float rh, rl;
-} __sinf_st[4] attribute_hidden;
+} sincosf_database_t;
+extern const sincosf_database_t __sinf_st[4] attribute_hidden;
 #define ST __sinf_st
+extern const sincosf_database_t __cosf_st[5] attribute_hidden;
+#define ST_COSF __cosf_st
 
 #endif
diff --git a/sysdeps/ieee754/flt-32/s_sincosf_data_generic.c b/sysdeps/ieee754/flt-32/s_sincosf_data_generic.c
index f53251d3d8..2e58f0b423 100644
--- a/sysdeps/ieee754/flt-32/s_sincosf_data_generic.c
+++ b/sysdeps/ieee754/flt-32/s_sincosf_data_generic.c
@@ -24,6 +24,8 @@  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
 
+#include <s_sincosf_data.h>
+
 const uint64_t __sinf_ipi[] =
 {
   0xfe5163abdebbc562, 0xdb6295993c439041, 0xfc2757d1f534ddc0,
@@ -55,17 +57,34 @@  const double __sinf_tb[] =
   -0x1.87de2a6aea963p-2, -0x1.8f8b83c69a60bp-3
 };
 
-const struct
+const double __cosf_tb[] =
+{
+   0x1p+0,                0x1.f6297cff75cbp-1,   0x1.d906bcf328d46p-1,
+   0x1.a9b66290ea1a3p-1,  0x1.6a09e667f3bcdp-1,  0x1.1c73b39ae68c8p-1,
+   0x1.87de2a6aea963p-2,  0x1.8f8b83c69a60bp-3,  0x0p+0,
+  -0x1.8f8b83c69a60bp-3, -0x1.87de2a6aea963p-2, -0x1.1c73b39ae68c8p-1,
+  -0x1.6a09e667f3bcdp-1, -0x1.a9b66290ea1a3p-1, -0x1.d906bcf328d46p-1,
+  -0x1.f6297cff75cbp-1,  -0x1p+0,               -0x1.f6297cff75cbp-1,
+  -0x1.d906bcf328d46p-1, -0x1.a9b66290ea1a3p-1, -0x1.6a09e667f3bcdp-1,
+  -0x1.1c73b39ae68c8p-1, -0x1.87de2a6aea963p-2, -0x1.8f8b83c69a60bp-3,
+   0x0p+0,                0x1.8f8b83c69a60bp-3,  0x1.87de2a6aea963p-2,
+   0x1.1c73b39ae68c8p-1,  0x1.6a09e667f3bcdp-1,  0x1.a9b66290ea1a3p-1,
+   0x1.d906bcf328d46p-1,  0x1.f6297cff75cbp-1
+};
+
+const sincosf_database_t __sinf_st[] =
 {
-  union
-  {
-    float arg;
-    uint32_t uarg;
-  };
-  float rh, rl;
-} __sinf_st[] = {
   { { 0x1.33333p+13 }, -0x1.63f4bap-2, -0x1p-27 },
   { { 0x1.75b8a2p-1 }, 0x1.55688ap-1, -0x1p-26 },
   { { 0x1.4f0654p+0 }, 0x1.ee836cp-1, -0x1p-26 },
   { { 0x1.2d97c8p+3 }, -0x1.99bc5ap-26, -0x1p-51 },
 };
+
+const sincosf_database_t __cosf_st[] =
+{
+  { { 0x1.2d97c8p+2 }, 0x1.99bc5cp-27, -0x1p-52 },
+  { { 0x1.4555p+51 }, 0x1.115d7ep-1, -0x1p-26 },
+  { { 0x1.48a858p+54 }, 0x1.f48148p-2, 0x1p-27 },
+  { { 0x1.3170fp+63 }, 0x1.fe2976p-1, 0x1p-26 },
+  { { 0x1.2b9622p+67 }, 0x1.f0285ep-1, -0x1p-26 },
+};