[v2] math: Add new exp10 implementation

Message ID 20231201094945.49035-1-Joe.Ramsay@arm.com
State Committed
Commit 63d0a35d5f223a3f4b68190567b7d4d44545bce5
Headers
Series [v2] math: Add new exp10 implementation |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed

Commit Message

Joe Ramsay Dec. 1, 2023, 9:49 a.m. UTC
  New implementation is based on the existing exp/exp2, with different
reduction constants and polynomial. Worst-case error in round-to-
nearest is 0.513 ULP.

The exp/exp2 shared table is reused for exp10 - .rodata size of
e_exp_data increases by 64 bytes.

As for exp/exp2, targets with single-instruction rounding/conversion
intrinsics can use them by toggling TOINT_INTRINSICS=1 and adding the
necessary code to their math_private.h.

Improvements on Neoverse V1 compared to current GLIBC master:
exp10 thruput: 3.3x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]
exp10 latency: 1.8x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]

Tested on:
    aarch64-linux-gnu (TOINT_INTRINSICS, fma contraction) and
    x86_64-linux-gnu (!TOINT_INTRINSICS, no fma contraction)
---
Differences from v1:
* Stop preventing inlining of special-case - no good reason for this,
  in fact performance is slightly better if it is inlined
* Remove configurable wide poly
* Update max ULP based on several runs of the subdivide program with
  threshold=1000000
* Define float constant in hex format
* No benchtest added, as there is already one for exp10 (I couldn't
  see a way to have multiple intervals in exp10-inputs?). Anyway the
  intervals in the previous version's commit message were chosen
  fairly arbitrarily - added speedup measurement for the interval used
  in exp10-inputs
Thanks,
Joe
 sysdeps/ieee754/dbl-64/e_exp10.c     | 144 ++++++++++++++++++++++-----
 sysdeps/ieee754/dbl-64/e_exp_data.c  |  11 ++
 sysdeps/ieee754/dbl-64/math_config.h |   4 +
 3 files changed, 135 insertions(+), 24 deletions(-)
  

Comments

Paul Zimmermann Dec. 1, 2023, 10:51 a.m. UTC | #1
Hi Joe,

looks good to me (precision-wise). I didn't check the speed improvement.

Paul

> From: Joe Ramsay <Joe.Ramsay@arm.com>
> CC: Joe Ramsay <Joe.Ramsay@arm.com>
> Date: Fri, 1 Dec 2023 09:49:45 +0000
> NoDisclaimer: true
> 
> New implementation is based on the existing exp/exp2, with different
> reduction constants and polynomial. Worst-case error in round-to-
> nearest is 0.513 ULP.
> 
> The exp/exp2 shared table is reused for exp10 - .rodata size of
> e_exp_data increases by 64 bytes.
> 
> As for exp/exp2, targets with single-instruction rounding/conversion
> intrinsics can use them by toggling TOINT_INTRINSICS=1 and adding the
> necessary code to their math_private.h.
> 
> Improvements on Neoverse V1 compared to current GLIBC master:
> exp10 thruput: 3.3x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]
> exp10 latency: 1.8x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]
> 
> Tested on:
>     aarch64-linux-gnu (TOINT_INTRINSICS, fma contraction) and
>     x86_64-linux-gnu (!TOINT_INTRINSICS, no fma contraction)
> ---
> Differences from v1:
> * Stop preventing inlining of special-case - no good reason for this,
>   in fact performance is slightly better if it is inlined
> * Remove configurable wide poly
> * Update max ULP based on several runs of the subdivide program with
>   threshold=1000000
> * Define float constant in hex format
> * No benchtest added, as there is already one for exp10 (I couldn't
>   see a way to have multiple intervals in exp10-inputs?). Anyway the
>   intervals in the previous version's commit message were chosen
>   fairly arbitrarily - added speedup measurement for the interval used
>   in exp10-inputs
> Thanks,
> Joe
>  sysdeps/ieee754/dbl-64/e_exp10.c     | 144 ++++++++++++++++++++++-----
>  sysdeps/ieee754/dbl-64/e_exp_data.c  |  11 ++
>  sysdeps/ieee754/dbl-64/math_config.h |   4 +
>  3 files changed, 135 insertions(+), 24 deletions(-)
> 
> diff --git a/sysdeps/ieee754/dbl-64/e_exp10.c b/sysdeps/ieee754/dbl-64/e_exp10.c
> index fa47f4f922..08069140c0 100644
> --- a/sysdeps/ieee754/dbl-64/e_exp10.c
> +++ b/sysdeps/ieee754/dbl-64/e_exp10.c
> @@ -16,36 +16,132 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <math.h>
> +#include <math-barriers.h>
> +#include <math-narrow-eval.h>
>  #include <math_private.h>
>  #include <float.h>
>  #include <libm-alias-finite.h>
> +#include "math_config.h"
>  
> -static const double log10_high = 0x2.4d7637p0;
> -static const double log10_low = 0x7.6aaa2b05ba95cp-28;
> +#define N (1 << EXP_TABLE_BITS)
> +#define IndexMask (N - 1)
> +#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX).  */
> +#define UFlowBound -0x1.5ep+8 /* -350.  */
> +#define SmallTop 0x3c6 /* top12(0x1p-57).  */
> +#define BigTop 0x407   /* top12(0x1p8).  */
> +#define Thresh 0x41    /* BigTop - SmallTop.  */
> +#define Shift __exp_data.shift
> +#define C(i) __exp_data.exp10_poly[i]
>  
> +static double
> +special_case (uint64_t sbits, double_t tmp, uint64_t ki)
> +{
> +  double_t scale, y;
> +
> +  if (ki - (1ull << 16) < 0x80000000)
> +    {
> +      /* The exponent of scale might have overflowed by 1.  */
> +      sbits -= 1ull << 52;
> +      scale = asdouble (sbits);
> +      y = 2 * (scale + scale * tmp);
> +      return check_oflow (y);
> +    }
> +
> +  /* n < 0, need special care in the subnormal range.  */
> +  sbits += 1022ull << 52;
> +  scale = asdouble (sbits);
> +  y = scale + scale * tmp;
> +
> +  if (y < 1.0)
> +    {
> +      /* Round y to the right precision before scaling it into the subnormal
> +	 range to avoid double rounding that can cause 0.5+E/2 ulp error where
> +	 E is the worst-case ulp error outside the subnormal range.  So this
> +	 is only useful if the goal is better than 1 ulp worst-case error.  */
> +      double_t lo = scale - y + scale * tmp;
> +      double_t hi = 1.0 + y;
> +      lo = 1.0 - hi + y + lo;
> +      y = math_narrow_eval (hi + lo) - 1.0;
> +      /* Avoid -0.0 with downward rounding.  */
> +      if (WANT_ROUNDING && y == 0.0)
> +	y = 0.0;
> +      /* The underflow exception needs to be signaled explicitly.  */
> +      math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
> +    }
> +  y = 0x1p-1022 * y;
> +
> +  return check_uflow (y);
> +}
> +
> +/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP.  */
>  double
> -__ieee754_exp10 (double arg)
> +__ieee754_exp10 (double x)
>  {
> -  int32_t lx;
> -  double arg_high, arg_low;
> -  double exp_high, exp_low;
> -
> -  if (!isfinite (arg))
> -    return __ieee754_exp (arg);
> -  if (arg < DBL_MIN_10_EXP - DBL_DIG - 10)
> -    return DBL_MIN * DBL_MIN;
> -  else if (arg > DBL_MAX_10_EXP + 1)
> -    return DBL_MAX * DBL_MAX;
> -  else if (fabs (arg) < 0x1p-56)
> -    return 1.0;
> -
> -  GET_LOW_WORD (lx, arg);
> -  lx &= 0xf8000000;
> -  arg_high = arg;
> -  SET_LOW_WORD (arg_high, lx);
> -  arg_low = arg - arg_high;
> -  exp_high = arg_high * log10_high;
> -  exp_low = arg_high * log10_low + arg_low * M_LN10;
> -  return __ieee754_exp (exp_high) * __ieee754_exp (exp_low);
> +  uint64_t ix = asuint64 (x);
> +  uint32_t abstop = (ix >> 52) & 0x7ff;
> +
> +  if (__glibc_unlikely (abstop - SmallTop >= Thresh))
> +    {
> +      if (abstop - SmallTop >= 0x80000000)
> +	/* Avoid spurious underflow for tiny x.
> +	   Note: 0 is common input.  */
> +	return x + 1;
> +      if (abstop == 0x7ff)
> +	return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
> +      if (x >= OFlowBound)
> +	return __math_oflow (0);
> +      if (x < UFlowBound)
> +	return __math_uflow (0);
> +
> +      /* Large x is special-cased below.  */
> +      abstop = 0;
> +    }
> +
> +  /* Reduce x: z = x * N / log10(2), k = round(z).  */
> +  double_t z = __exp_data.invlog10_2N * x;
> +  double_t kd;
> +  int64_t ki;
> +#if TOINT_INTRINSICS
> +  kd = roundtoint (z);
> +  ki = converttoint (z);
> +#else
> +  kd = math_narrow_eval (z + Shift);
> +  kd -= Shift;
> +  ki = kd;
> +#endif
> +
> +  /* r = x - k * log10(2), r in [-0.5, 0.5].  */
> +  double_t r = x;
> +  r = __exp_data.neglog10_2hiN * kd + r;
> +  r = __exp_data.neglog10_2loN * kd + r;
> +
> +  /* exp10(x) = 2^(k/N) * 2^(r/N).
> +     Approximate the two components separately.  */
> +
> +  /* s = 2^(k/N), using lookup table.  */
> +  uint64_t e = ki << (52 - EXP_TABLE_BITS);
> +  uint64_t i = (ki & IndexMask) * 2;
> +  uint64_t u = __exp_data.tab[i + 1];
> +  uint64_t sbits = u + e;
> +
> +  double_t tail = asdouble (__exp_data.tab[i]);
> +
> +  /* 2^(r/N) ~= 1 + r * Poly(r).  */
> +  double_t r2 = r * r;
> +  double_t p = C (0) + r * C (1);
> +  double_t y = C (2) + r * C (3);
> +  y = y + r2 * C (4);
> +  y = p + r2 * y;
> +  y = tail + y * r;
> +
> +  if (__glibc_unlikely (abstop == 0))
> +    return special_case (sbits, y, ki);
> +
> +  /* Assemble components:
> +     y  = 2^(r/N) * 2^(k/N)
> +       ~= (y + 1) * s.  */
> +  double_t s = asdouble (sbits);
> +  return s * y + s;
>  }
> +
>  libm_alias_finite (__ieee754_exp10, __exp10)
> diff --git a/sysdeps/ieee754/dbl-64/e_exp_data.c b/sysdeps/ieee754/dbl-64/e_exp_data.c
> index d498b8bc3b..5c774afbcb 100644
> --- a/sysdeps/ieee754/dbl-64/e_exp_data.c
> +++ b/sysdeps/ieee754/dbl-64/e_exp_data.c
> @@ -58,6 +58,17 @@ const struct exp_data __exp_data = {
>  0x1.5d7e09b4e3a84p-10,
>  #endif
>  },
> +.invlog10_2N = 0x1.a934f0979a371p1 * N,
> +.neglog10_2hiN = -0x1.3441350ap-2 / N,
> +.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
> +.exp10_poly = {
> +/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ].  */
> +0x1.26bb1bbb55516p1,
> +0x1.53524c73ce9fep1,
> +0x1.0470591ce4b26p1,
> +0x1.2bd76577fe684p0,
> +0x1.1446eeccd0efbp-1
> +},
>  // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
>  // tab[2*k] = asuint64(T[k])
>  // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
> diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
> index 19af33fd86..d617addfbe 100644
> --- a/sysdeps/ieee754/dbl-64/math_config.h
> +++ b/sysdeps/ieee754/dbl-64/math_config.h
> @@ -201,6 +201,10 @@ extern const struct exp_data
>    double poly[4]; /* Last four coefficients.  */
>    double exp2_shift;
>    double exp2_poly[EXP2_POLY_ORDER];
> +  double invlog10_2N;
> +  double neglog10_2hiN;
> +  double neglog10_2loN;
> +  double exp10_poly[5];
>    uint64_t tab[2*(1 << EXP_TABLE_BITS)];
>  } __exp_data attribute_hidden;
>  
> -- 
> 2.27.0
> 
>
  
Szabolcs Nagy Dec. 4, 2023, 3:09 p.m. UTC | #2
The 12/01/2023 09:49, Joe Ramsay wrote:
> New implementation is based on the existing exp/exp2, with different
> reduction constants and polynomial. Worst-case error in round-to-
> nearest is 0.513 ULP.
> 
> The exp/exp2 shared table is reused for exp10 - .rodata size of
> e_exp_data increases by 64 bytes.
> 
> As for exp/exp2, targets with single-instruction rounding/conversion
> intrinsics can use them by toggling TOINT_INTRINSICS=1 and adding the
> necessary code to their math_private.h.
> 
> Improvements on Neoverse V1 compared to current GLIBC master:
> exp10 thruput: 3.3x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]
> exp10 latency: 1.8x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]
> 
> Tested on:
>     aarch64-linux-gnu (TOINT_INTRINSICS, fma contraction) and
>     x86_64-linux-gnu (!TOINT_INTRINSICS, no fma contraction)

thanks, this looks good.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

i will commit it for you in a bit.

> ---
> Differences from v1:
> * Stop preventing inlining of special-case - no good reason for this,
>   in fact performance is slightly better if it is inlined
> * Remove configurable wide poly
> * Update max ULP based on several runs of the subdivide program with
>   threshold=1000000
> * Define float constant in hex format
> * No benchtest added, as there is already one for exp10 (I couldn't
>   see a way to have multiple intervals in exp10-inputs?). Anyway the
>   intervals in the previous version's commit message were chosen
>   fairly arbitrarily - added speedup measurement for the interval used
>   in exp10-inputs
> Thanks,
> Joe
>  sysdeps/ieee754/dbl-64/e_exp10.c     | 144 ++++++++++++++++++++++-----
>  sysdeps/ieee754/dbl-64/e_exp_data.c  |  11 ++
>  sysdeps/ieee754/dbl-64/math_config.h |   4 +
>  3 files changed, 135 insertions(+), 24 deletions(-)
> 
> diff --git a/sysdeps/ieee754/dbl-64/e_exp10.c b/sysdeps/ieee754/dbl-64/e_exp10.c
> index fa47f4f922..08069140c0 100644
> --- a/sysdeps/ieee754/dbl-64/e_exp10.c
> +++ b/sysdeps/ieee754/dbl-64/e_exp10.c
> @@ -16,36 +16,132 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <math.h>
> +#include <math-barriers.h>
> +#include <math-narrow-eval.h>
>  #include <math_private.h>
>  #include <float.h>
>  #include <libm-alias-finite.h>
> +#include "math_config.h"
>  
> -static const double log10_high = 0x2.4d7637p0;
> -static const double log10_low = 0x7.6aaa2b05ba95cp-28;
> +#define N (1 << EXP_TABLE_BITS)
> +#define IndexMask (N - 1)
> +#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX).  */
> +#define UFlowBound -0x1.5ep+8 /* -350.  */
> +#define SmallTop 0x3c6 /* top12(0x1p-57).  */
> +#define BigTop 0x407   /* top12(0x1p8).  */
> +#define Thresh 0x41    /* BigTop - SmallTop.  */
> +#define Shift __exp_data.shift
> +#define C(i) __exp_data.exp10_poly[i]
>  
> +static double
> +special_case (uint64_t sbits, double_t tmp, uint64_t ki)
> +{
> +  double_t scale, y;
> +
> +  if (ki - (1ull << 16) < 0x80000000)
> +    {
> +      /* The exponent of scale might have overflowed by 1.  */
> +      sbits -= 1ull << 52;
> +      scale = asdouble (sbits);
> +      y = 2 * (scale + scale * tmp);
> +      return check_oflow (y);
> +    }
> +
> +  /* n < 0, need special care in the subnormal range.  */
> +  sbits += 1022ull << 52;
> +  scale = asdouble (sbits);
> +  y = scale + scale * tmp;
> +
> +  if (y < 1.0)
> +    {
> +      /* Round y to the right precision before scaling it into the subnormal
> +	 range to avoid double rounding that can cause 0.5+E/2 ulp error where
> +	 E is the worst-case ulp error outside the subnormal range.  So this
> +	 is only useful if the goal is better than 1 ulp worst-case error.  */
> +      double_t lo = scale - y + scale * tmp;
> +      double_t hi = 1.0 + y;
> +      lo = 1.0 - hi + y + lo;
> +      y = math_narrow_eval (hi + lo) - 1.0;
> +      /* Avoid -0.0 with downward rounding.  */
> +      if (WANT_ROUNDING && y == 0.0)
> +	y = 0.0;
> +      /* The underflow exception needs to be signaled explicitly.  */
> +      math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
> +    }
> +  y = 0x1p-1022 * y;
> +
> +  return check_uflow (y);
> +}
> +
> +/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP.  */
>  double
> -__ieee754_exp10 (double arg)
> +__ieee754_exp10 (double x)
>  {
> -  int32_t lx;
> -  double arg_high, arg_low;
> -  double exp_high, exp_low;
> -
> -  if (!isfinite (arg))
> -    return __ieee754_exp (arg);
> -  if (arg < DBL_MIN_10_EXP - DBL_DIG - 10)
> -    return DBL_MIN * DBL_MIN;
> -  else if (arg > DBL_MAX_10_EXP + 1)
> -    return DBL_MAX * DBL_MAX;
> -  else if (fabs (arg) < 0x1p-56)
> -    return 1.0;
> -
> -  GET_LOW_WORD (lx, arg);
> -  lx &= 0xf8000000;
> -  arg_high = arg;
> -  SET_LOW_WORD (arg_high, lx);
> -  arg_low = arg - arg_high;
> -  exp_high = arg_high * log10_high;
> -  exp_low = arg_high * log10_low + arg_low * M_LN10;
> -  return __ieee754_exp (exp_high) * __ieee754_exp (exp_low);
> +  uint64_t ix = asuint64 (x);
> +  uint32_t abstop = (ix >> 52) & 0x7ff;
> +
> +  if (__glibc_unlikely (abstop - SmallTop >= Thresh))
> +    {
> +      if (abstop - SmallTop >= 0x80000000)
> +	/* Avoid spurious underflow for tiny x.
> +	   Note: 0 is common input.  */
> +	return x + 1;
> +      if (abstop == 0x7ff)
> +	return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
> +      if (x >= OFlowBound)
> +	return __math_oflow (0);
> +      if (x < UFlowBound)
> +	return __math_uflow (0);
> +
> +      /* Large x is special-cased below.  */
> +      abstop = 0;
> +    }
> +
> +  /* Reduce x: z = x * N / log10(2), k = round(z).  */
> +  double_t z = __exp_data.invlog10_2N * x;
> +  double_t kd;
> +  int64_t ki;
> +#if TOINT_INTRINSICS
> +  kd = roundtoint (z);
> +  ki = converttoint (z);
> +#else
> +  kd = math_narrow_eval (z + Shift);
> +  kd -= Shift;
> +  ki = kd;
> +#endif
> +
> +  /* r = x - k * log10(2), r in [-0.5, 0.5].  */
> +  double_t r = x;
> +  r = __exp_data.neglog10_2hiN * kd + r;
> +  r = __exp_data.neglog10_2loN * kd + r;
> +
> +  /* exp10(x) = 2^(k/N) * 2^(r/N).
> +     Approximate the two components separately.  */
> +
> +  /* s = 2^(k/N), using lookup table.  */
> +  uint64_t e = ki << (52 - EXP_TABLE_BITS);
> +  uint64_t i = (ki & IndexMask) * 2;
> +  uint64_t u = __exp_data.tab[i + 1];
> +  uint64_t sbits = u + e;
> +
> +  double_t tail = asdouble (__exp_data.tab[i]);
> +
> +  /* 2^(r/N) ~= 1 + r * Poly(r).  */
> +  double_t r2 = r * r;
> +  double_t p = C (0) + r * C (1);
> +  double_t y = C (2) + r * C (3);
> +  y = y + r2 * C (4);
> +  y = p + r2 * y;
> +  y = tail + y * r;
> +
> +  if (__glibc_unlikely (abstop == 0))
> +    return special_case (sbits, y, ki);
> +
> +  /* Assemble components:
> +     y  = 2^(r/N) * 2^(k/N)
> +       ~= (y + 1) * s.  */
> +  double_t s = asdouble (sbits);
> +  return s * y + s;
>  }
> +
>  libm_alias_finite (__ieee754_exp10, __exp10)
> diff --git a/sysdeps/ieee754/dbl-64/e_exp_data.c b/sysdeps/ieee754/dbl-64/e_exp_data.c
> index d498b8bc3b..5c774afbcb 100644
> --- a/sysdeps/ieee754/dbl-64/e_exp_data.c
> +++ b/sysdeps/ieee754/dbl-64/e_exp_data.c
> @@ -58,6 +58,17 @@ const struct exp_data __exp_data = {
>  0x1.5d7e09b4e3a84p-10,
>  #endif
>  },
> +.invlog10_2N = 0x1.a934f0979a371p1 * N,
> +.neglog10_2hiN = -0x1.3441350ap-2 / N,
> +.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
> +.exp10_poly = {
> +/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ].  */
> +0x1.26bb1bbb55516p1,
> +0x1.53524c73ce9fep1,
> +0x1.0470591ce4b26p1,
> +0x1.2bd76577fe684p0,
> +0x1.1446eeccd0efbp-1
> +},
>  // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
>  // tab[2*k] = asuint64(T[k])
>  // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
> diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
> index 19af33fd86..d617addfbe 100644
> --- a/sysdeps/ieee754/dbl-64/math_config.h
> +++ b/sysdeps/ieee754/dbl-64/math_config.h
> @@ -201,6 +201,10 @@ extern const struct exp_data
>    double poly[4]; /* Last four coefficients.  */
>    double exp2_shift;
>    double exp2_poly[EXP2_POLY_ORDER];
> +  double invlog10_2N;
> +  double neglog10_2hiN;
> +  double neglog10_2loN;
> +  double exp10_poly[5];
>    uint64_t tab[2*(1 << EXP_TABLE_BITS)];
>  } __exp_data attribute_hidden;
>  
> -- 
> 2.27.0
>
  
Adhemerval Zanella Dec. 4, 2023, 3:15 p.m. UTC | #3
On 04/12/23 12:09, Szabolcs Nagy wrote:
> The 12/01/2023 09:49, Joe Ramsay wrote:
>> New implementation is based on the existing exp/exp2, with different
>> reduction constants and polynomial. Worst-case error in round-to-
>> nearest is 0.513 ULP.
>>
>> The exp/exp2 shared table is reused for exp10 - .rodata size of
>> e_exp_data increases by 64 bytes.
>>
>> As for exp/exp2, targets with single-instruction rounding/conversion
>> intrinsics can use them by toggling TOINT_INTRINSICS=1 and adding the
>> necessary code to their math_private.h.
>>
>> Improvements on Neoverse V1 compared to current GLIBC master:
>> exp10 thruput: 3.3x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]
>> exp10 latency: 1.8x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8]
>>
>> Tested on:
>>     aarch64-linux-gnu (TOINT_INTRINSICS, fma contraction) and
>>     x86_64-linux-gnu (!TOINT_INTRINSICS, no fma contraction)
> 
> thanks, this looks good.
> 
> Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> 
> i will commit it for you in a bit.

I still this we should provide a benchtest so it would easy to verify
the performance numbers and evaluate whether some architecture might
benefit from providing ifunc variants (as x86 would most likely want
for enable fma).

> 
>> ---
>> Differences from v1:
>> * Stop preventing inlining of special-case - no good reason for this,
>>   in fact performance is slightly better if it is inlined
>> * Remove configurable wide poly
>> * Update max ULP based on several runs of the subdivide program with
>>   threshold=1000000
>> * Define float constant in hex format
>> * No benchtest added, as there is already one for exp10 (I couldn't
>>   see a way to have multiple intervals in exp10-inputs?). Anyway the
>>   intervals in the previous version's commit message were chosen
>>   fairly arbitrarily - added speedup measurement for the interval used
>>   in exp10-inputs
>> Thanks,
>> Joe
>>  sysdeps/ieee754/dbl-64/e_exp10.c     | 144 ++++++++++++++++++++++-----
>>  sysdeps/ieee754/dbl-64/e_exp_data.c  |  11 ++
>>  sysdeps/ieee754/dbl-64/math_config.h |   4 +
>>  3 files changed, 135 insertions(+), 24 deletions(-)
>>
>> diff --git a/sysdeps/ieee754/dbl-64/e_exp10.c b/sysdeps/ieee754/dbl-64/e_exp10.c
>> index fa47f4f922..08069140c0 100644
>> --- a/sysdeps/ieee754/dbl-64/e_exp10.c
>> +++ b/sysdeps/ieee754/dbl-64/e_exp10.c
>> @@ -16,36 +16,132 @@
>>     <https://www.gnu.org/licenses/>.  */
>>  
>>  #include <math.h>
>> +#include <math-barriers.h>
>> +#include <math-narrow-eval.h>
>>  #include <math_private.h>
>>  #include <float.h>
>>  #include <libm-alias-finite.h>
>> +#include "math_config.h"
>>  
>> -static const double log10_high = 0x2.4d7637p0;
>> -static const double log10_low = 0x7.6aaa2b05ba95cp-28;
>> +#define N (1 << EXP_TABLE_BITS)
>> +#define IndexMask (N - 1)
>> +#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX).  */
>> +#define UFlowBound -0x1.5ep+8 /* -350.  */
>> +#define SmallTop 0x3c6 /* top12(0x1p-57).  */
>> +#define BigTop 0x407   /* top12(0x1p8).  */
>> +#define Thresh 0x41    /* BigTop - SmallTop.  */
>> +#define Shift __exp_data.shift
>> +#define C(i) __exp_data.exp10_poly[i]
>>  
>> +static double
>> +special_case (uint64_t sbits, double_t tmp, uint64_t ki)
>> +{
>> +  double_t scale, y;
>> +
>> +  if (ki - (1ull << 16) < 0x80000000)
>> +    {
>> +      /* The exponent of scale might have overflowed by 1.  */
>> +      sbits -= 1ull << 52;
>> +      scale = asdouble (sbits);
>> +      y = 2 * (scale + scale * tmp);
>> +      return check_oflow (y);
>> +    }
>> +
>> +  /* n < 0, need special care in the subnormal range.  */
>> +  sbits += 1022ull << 52;
>> +  scale = asdouble (sbits);
>> +  y = scale + scale * tmp;
>> +
>> +  if (y < 1.0)
>> +    {
>> +      /* Round y to the right precision before scaling it into the subnormal
>> +	 range to avoid double rounding that can cause 0.5+E/2 ulp error where
>> +	 E is the worst-case ulp error outside the subnormal range.  So this
>> +	 is only useful if the goal is better than 1 ulp worst-case error.  */
>> +      double_t lo = scale - y + scale * tmp;
>> +      double_t hi = 1.0 + y;
>> +      lo = 1.0 - hi + y + lo;
>> +      y = math_narrow_eval (hi + lo) - 1.0;
>> +      /* Avoid -0.0 with downward rounding.  */
>> +      if (WANT_ROUNDING && y == 0.0)
>> +	y = 0.0;
>> +      /* The underflow exception needs to be signaled explicitly.  */
>> +      math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
>> +    }
>> +  y = 0x1p-1022 * y;
>> +
>> +  return check_uflow (y);
>> +}
>> +
>> +/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP.  */
>>  double
>> -__ieee754_exp10 (double arg)
>> +__ieee754_exp10 (double x)
>>  {
>> -  int32_t lx;
>> -  double arg_high, arg_low;
>> -  double exp_high, exp_low;
>> -
>> -  if (!isfinite (arg))
>> -    return __ieee754_exp (arg);
>> -  if (arg < DBL_MIN_10_EXP - DBL_DIG - 10)
>> -    return DBL_MIN * DBL_MIN;
>> -  else if (arg > DBL_MAX_10_EXP + 1)
>> -    return DBL_MAX * DBL_MAX;
>> -  else if (fabs (arg) < 0x1p-56)
>> -    return 1.0;
>> -
>> -  GET_LOW_WORD (lx, arg);
>> -  lx &= 0xf8000000;
>> -  arg_high = arg;
>> -  SET_LOW_WORD (arg_high, lx);
>> -  arg_low = arg - arg_high;
>> -  exp_high = arg_high * log10_high;
>> -  exp_low = arg_high * log10_low + arg_low * M_LN10;
>> -  return __ieee754_exp (exp_high) * __ieee754_exp (exp_low);
>> +  uint64_t ix = asuint64 (x);
>> +  uint32_t abstop = (ix >> 52) & 0x7ff;
>> +
>> +  if (__glibc_unlikely (abstop - SmallTop >= Thresh))
>> +    {
>> +      if (abstop - SmallTop >= 0x80000000)
>> +	/* Avoid spurious underflow for tiny x.
>> +	   Note: 0 is common input.  */
>> +	return x + 1;
>> +      if (abstop == 0x7ff)
>> +	return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
>> +      if (x >= OFlowBound)
>> +	return __math_oflow (0);
>> +      if (x < UFlowBound)
>> +	return __math_uflow (0);
>> +
>> +      /* Large x is special-cased below.  */
>> +      abstop = 0;
>> +    }
>> +
>> +  /* Reduce x: z = x * N / log10(2), k = round(z).  */
>> +  double_t z = __exp_data.invlog10_2N * x;
>> +  double_t kd;
>> +  int64_t ki;
>> +#if TOINT_INTRINSICS
>> +  kd = roundtoint (z);
>> +  ki = converttoint (z);
>> +#else
>> +  kd = math_narrow_eval (z + Shift);
>> +  kd -= Shift;
>> +  ki = kd;
>> +#endif
>> +
>> +  /* r = x - k * log10(2), r in [-0.5, 0.5].  */
>> +  double_t r = x;
>> +  r = __exp_data.neglog10_2hiN * kd + r;
>> +  r = __exp_data.neglog10_2loN * kd + r;
>> +
>> +  /* exp10(x) = 2^(k/N) * 2^(r/N).
>> +     Approximate the two components separately.  */
>> +
>> +  /* s = 2^(k/N), using lookup table.  */
>> +  uint64_t e = ki << (52 - EXP_TABLE_BITS);
>> +  uint64_t i = (ki & IndexMask) * 2;
>> +  uint64_t u = __exp_data.tab[i + 1];
>> +  uint64_t sbits = u + e;
>> +
>> +  double_t tail = asdouble (__exp_data.tab[i]);
>> +
>> +  /* 2^(r/N) ~= 1 + r * Poly(r).  */
>> +  double_t r2 = r * r;
>> +  double_t p = C (0) + r * C (1);
>> +  double_t y = C (2) + r * C (3);
>> +  y = y + r2 * C (4);
>> +  y = p + r2 * y;
>> +  y = tail + y * r;
>> +
>> +  if (__glibc_unlikely (abstop == 0))
>> +    return special_case (sbits, y, ki);
>> +
>> +  /* Assemble components:
>> +     y  = 2^(r/N) * 2^(k/N)
>> +       ~= (y + 1) * s.  */
>> +  double_t s = asdouble (sbits);
>> +  return s * y + s;
>>  }
>> +
>>  libm_alias_finite (__ieee754_exp10, __exp10)
>> diff --git a/sysdeps/ieee754/dbl-64/e_exp_data.c b/sysdeps/ieee754/dbl-64/e_exp_data.c
>> index d498b8bc3b..5c774afbcb 100644
>> --- a/sysdeps/ieee754/dbl-64/e_exp_data.c
>> +++ b/sysdeps/ieee754/dbl-64/e_exp_data.c
>> @@ -58,6 +58,17 @@ const struct exp_data __exp_data = {
>>  0x1.5d7e09b4e3a84p-10,
>>  #endif
>>  },
>> +.invlog10_2N = 0x1.a934f0979a371p1 * N,
>> +.neglog10_2hiN = -0x1.3441350ap-2 / N,
>> +.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
>> +.exp10_poly = {
>> +/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ].  */
>> +0x1.26bb1bbb55516p1,
>> +0x1.53524c73ce9fep1,
>> +0x1.0470591ce4b26p1,
>> +0x1.2bd76577fe684p0,
>> +0x1.1446eeccd0efbp-1
>> +},
>>  // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
>>  // tab[2*k] = asuint64(T[k])
>>  // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
>> diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
>> index 19af33fd86..d617addfbe 100644
>> --- a/sysdeps/ieee754/dbl-64/math_config.h
>> +++ b/sysdeps/ieee754/dbl-64/math_config.h
>> @@ -201,6 +201,10 @@ extern const struct exp_data
>>    double poly[4]; /* Last four coefficients.  */
>>    double exp2_shift;
>>    double exp2_poly[EXP2_POLY_ORDER];
>> +  double invlog10_2N;
>> +  double neglog10_2hiN;
>> +  double neglog10_2loN;
>> +  double exp10_poly[5];
>>    uint64_t tab[2*(1 << EXP_TABLE_BITS)];
>>  } __exp_data attribute_hidden;
>>  
>> -- 
>> 2.27.0
>>
  
Szabolcs Nagy Dec. 4, 2023, 3:55 p.m. UTC | #4
The 12/04/2023 12:15, Adhemerval Zanella Netto wrote:
> I still this we should provide a benchtest so it would easy to verify
> the performance numbers and evaluate whether some architecture might
> benefit from providing ifunc variants (as x86 would most likely want
> for enable fma).

what is wrong with the existing benchtests/exp10-inputs ?
  
Adhemerval Zanella Dec. 4, 2023, 4:37 p.m. UTC | #5
> On 4 Dec 2023, at 12:55, Szabolcs Nagy <szabolcs.nagy@arm.com> wrote:
> The 12/04/2023 12:15, Adhemerval Zanella Netto wrote:
>> I still this we should provide a benchtest so it would easy to verify
>> the performance numbers and evaluate whether some architecture might
>> benefit from providing ifunc variants (as x86 would most likely want
>> for enable fma).
> 
> what is wrong with the existing benchtests/exp10-inputs ?
> 

In fact nothing, I had the impression the commit message has a range different from benchtest. Sorry about the noite, patch LGTM.
  

Patch

diff --git a/sysdeps/ieee754/dbl-64/e_exp10.c b/sysdeps/ieee754/dbl-64/e_exp10.c
index fa47f4f922..08069140c0 100644
--- a/sysdeps/ieee754/dbl-64/e_exp10.c
+++ b/sysdeps/ieee754/dbl-64/e_exp10.c
@@ -16,36 +16,132 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <math.h>
+#include <math-barriers.h>
+#include <math-narrow-eval.h>
 #include <math_private.h>
 #include <float.h>
 #include <libm-alias-finite.h>
+#include "math_config.h"
 
-static const double log10_high = 0x2.4d7637p0;
-static const double log10_low = 0x7.6aaa2b05ba95cp-28;
+#define N (1 << EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX).  */
+#define UFlowBound -0x1.5ep+8 /* -350.  */
+#define SmallTop 0x3c6 /* top12(0x1p-57).  */
+#define BigTop 0x407   /* top12(0x1p8).  */
+#define Thresh 0x41    /* BigTop - SmallTop.  */
+#define Shift __exp_data.shift
+#define C(i) __exp_data.exp10_poly[i]
 
+static double
+special_case (uint64_t sbits, double_t tmp, uint64_t ki)
+{
+  double_t scale, y;
+
+  if (ki - (1ull << 16) < 0x80000000)
+    {
+      /* The exponent of scale might have overflowed by 1.  */
+      sbits -= 1ull << 52;
+      scale = asdouble (sbits);
+      y = 2 * (scale + scale * tmp);
+      return check_oflow (y);
+    }
+
+  /* n < 0, need special care in the subnormal range.  */
+  sbits += 1022ull << 52;
+  scale = asdouble (sbits);
+  y = scale + scale * tmp;
+
+  if (y < 1.0)
+    {
+      /* Round y to the right precision before scaling it into the subnormal
+	 range to avoid double rounding that can cause 0.5+E/2 ulp error where
+	 E is the worst-case ulp error outside the subnormal range.  So this
+	 is only useful if the goal is better than 1 ulp worst-case error.  */
+      double_t lo = scale - y + scale * tmp;
+      double_t hi = 1.0 + y;
+      lo = 1.0 - hi + y + lo;
+      y = math_narrow_eval (hi + lo) - 1.0;
+      /* Avoid -0.0 with downward rounding.  */
+      if (WANT_ROUNDING && y == 0.0)
+	y = 0.0;
+      /* The underflow exception needs to be signaled explicitly.  */
+      math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
+    }
+  y = 0x1p-1022 * y;
+
+  return check_uflow (y);
+}
+
+/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP.  */
 double
-__ieee754_exp10 (double arg)
+__ieee754_exp10 (double x)
 {
-  int32_t lx;
-  double arg_high, arg_low;
-  double exp_high, exp_low;
-
-  if (!isfinite (arg))
-    return __ieee754_exp (arg);
-  if (arg < DBL_MIN_10_EXP - DBL_DIG - 10)
-    return DBL_MIN * DBL_MIN;
-  else if (arg > DBL_MAX_10_EXP + 1)
-    return DBL_MAX * DBL_MAX;
-  else if (fabs (arg) < 0x1p-56)
-    return 1.0;
-
-  GET_LOW_WORD (lx, arg);
-  lx &= 0xf8000000;
-  arg_high = arg;
-  SET_LOW_WORD (arg_high, lx);
-  arg_low = arg - arg_high;
-  exp_high = arg_high * log10_high;
-  exp_low = arg_high * log10_low + arg_low * M_LN10;
-  return __ieee754_exp (exp_high) * __ieee754_exp (exp_low);
+  uint64_t ix = asuint64 (x);
+  uint32_t abstop = (ix >> 52) & 0x7ff;
+
+  if (__glibc_unlikely (abstop - SmallTop >= Thresh))
+    {
+      if (abstop - SmallTop >= 0x80000000)
+	/* Avoid spurious underflow for tiny x.
+	   Note: 0 is common input.  */
+	return x + 1;
+      if (abstop == 0x7ff)
+	return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
+      if (x >= OFlowBound)
+	return __math_oflow (0);
+      if (x < UFlowBound)
+	return __math_uflow (0);
+
+      /* Large x is special-cased below.  */
+      abstop = 0;
+    }
+
+  /* Reduce x: z = x * N / log10(2), k = round(z).  */
+  double_t z = __exp_data.invlog10_2N * x;
+  double_t kd;
+  int64_t ki;
+#if TOINT_INTRINSICS
+  kd = roundtoint (z);
+  ki = converttoint (z);
+#else
+  kd = math_narrow_eval (z + Shift);
+  kd -= Shift;
+  ki = kd;
+#endif
+
+  /* r = x - k * log10(2), r in [-0.5, 0.5].  */
+  double_t r = x;
+  r = __exp_data.neglog10_2hiN * kd + r;
+  r = __exp_data.neglog10_2loN * kd + r;
+
+  /* exp10(x) = 2^(k/N) * 2^(r/N).
+     Approximate the two components separately.  */
+
+  /* s = 2^(k/N), using lookup table.  */
+  uint64_t e = ki << (52 - EXP_TABLE_BITS);
+  uint64_t i = (ki & IndexMask) * 2;
+  uint64_t u = __exp_data.tab[i + 1];
+  uint64_t sbits = u + e;
+
+  double_t tail = asdouble (__exp_data.tab[i]);
+
+  /* 2^(r/N) ~= 1 + r * Poly(r).  */
+  double_t r2 = r * r;
+  double_t p = C (0) + r * C (1);
+  double_t y = C (2) + r * C (3);
+  y = y + r2 * C (4);
+  y = p + r2 * y;
+  y = tail + y * r;
+
+  if (__glibc_unlikely (abstop == 0))
+    return special_case (sbits, y, ki);
+
+  /* Assemble components:
+     y  = 2^(r/N) * 2^(k/N)
+       ~= (y + 1) * s.  */
+  double_t s = asdouble (sbits);
+  return s * y + s;
 }
+
 libm_alias_finite (__ieee754_exp10, __exp10)
diff --git a/sysdeps/ieee754/dbl-64/e_exp_data.c b/sysdeps/ieee754/dbl-64/e_exp_data.c
index d498b8bc3b..5c774afbcb 100644
--- a/sysdeps/ieee754/dbl-64/e_exp_data.c
+++ b/sysdeps/ieee754/dbl-64/e_exp_data.c
@@ -58,6 +58,17 @@  const struct exp_data __exp_data = {
 0x1.5d7e09b4e3a84p-10,
 #endif
 },
+.invlog10_2N = 0x1.a934f0979a371p1 * N,
+.neglog10_2hiN = -0x1.3441350ap-2 / N,
+.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
+.exp10_poly = {
+/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ].  */
+0x1.26bb1bbb55516p1,
+0x1.53524c73ce9fep1,
+0x1.0470591ce4b26p1,
+0x1.2bd76577fe684p0,
+0x1.1446eeccd0efbp-1
+},
 // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
 // tab[2*k] = asuint64(T[k])
 // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
index 19af33fd86..d617addfbe 100644
--- a/sysdeps/ieee754/dbl-64/math_config.h
+++ b/sysdeps/ieee754/dbl-64/math_config.h
@@ -201,6 +201,10 @@  extern const struct exp_data
   double poly[4]; /* Last four coefficients.  */
   double exp2_shift;
   double exp2_poly[EXP2_POLY_ORDER];
+  double invlog10_2N;
+  double neglog10_2hiN;
+  double neglog10_2loN;
+  double exp10_poly[5];
   uint64_t tab[2*(1 << EXP_TABLE_BITS)];
 } __exp_data attribute_hidden;