RFC: Improve hypot performance

Message ID VE1PR08MB5599C92B47C5EE081422D0DE839A9@VE1PR08MB5599.eurprd08.prod.outlook.com
State Not applicable
Headers
Series RFC: Improve hypot performance |

Checks

Context Check Description
dj/TryBot-apply_patch fail Patch failed to apply to master at the time it was sent
dj/TryBot-32bit fail Patch series failed to apply

Commit Message

Wilco Dijkstra Nov. 17, 2021, 3:58 p.m. UTC
  Hi Adhemerval,

Here is an early version of a much faster hypot implementation. It uses fma
to significantly outperform the powerpc version both in throughput and
latency. It has a worst-case ULP of ~0.949 and passes the testsuite. The
powerpc version has a worst-case ULP of ~1.21 and several test failures.

It applies on top of your hypot patch series. I didn't optimize the non-fma
case since modern targets have fma. It'll be interesting to compare it on
Power. You'll need to correctly set FAST_FMINMAX to indicate support for
inlined fmin/fmax instructions (this will be added to math_private.h for
targets that have it), without it the code tries to use a conditional move
since using a branch here is really bad for performance.

Cheers,
Wilco

---
  

Comments

Paul A. Clarke Nov. 17, 2021, 7:02 p.m. UTC | #1
On Wed, Nov 17, 2021 at 03:58:53PM +0000, Wilco Dijkstra via Libc-alpha wrote:
> Hi Adhemerval,
> 
> Here is an early version of a much faster hypot implementation. It uses fma
> to significantly outperform the powerpc version both in throughput and
> latency. It has a worst-case ULP of ~0.949 and passes the testsuite. The
> powerpc version has a worst-case ULP of ~1.21 and several test failures.
> 
> It applies on top of your hypot patch series. I didn't optimize the non-fma
> case since modern targets have fma. It'll be interesting to compare it on
> Power. You'll need to correctly set FAST_FMINMAX to indicate support for
> inlined fmin/fmax instructions (this will be added to math_private.h for
> targets that have it), without it the code tries to use a conditional move
> since using a branch here is really bad for performance.

On Power10, this implementation is still has a large delta compared to the
current implementation:

current on Power10:
  "hypot": {
   "workload-random": {
    "duration": 5.27306e+08,
    "iterations": 4.8e+07,
    "reciprocal-throughput": 8.26834,
    "latency": 13.7027,
    "max-throughput": 1.20943e+08,
    "min-throughput": 7.29781e+07
   }
  }

with Adhemerval's patches and this patch:
  "hypot": {
   "workload-random": {
    "duration": 5.34629e+08,
    "iterations": 3.6e+07,
    "reciprocal-throughput": 12.5006,
    "latency": 17.201,
    "max-throughput": 7.99959e+07,
    "min-throughput": 5.81362e+07
   }
  }

PC

> ---
> 
> diff --git a/sysdeps/ieee754/dbl-64/e_hypot.c b/sysdeps/ieee754/dbl-64/e_hypot.c
> index d20bc3e3657e350a1103a8f8477db35ee60399e0..3906711788cff5d66725e5879bb4d6c36dd24dc7 100644
> --- a/sysdeps/ieee754/dbl-64/e_hypot.c
> +++ b/sysdeps/ieee754/dbl-64/e_hypot.c
> @@ -37,73 +37,40 @@
>  #include <math-svid-compat.h>
>  #include <errno.h>
>  
> +#define FAST_FMINMAX 1
> +//#undef __FP_FAST_FMA
> +
> +#define SCALE     0x1p-600
> +#define LARGE_VAL 0x1p+511
> +#define TINY_VAL  0x1p-459
> +#define EPS       0x1p-54
> +
> +
>  static inline double
>  handle_errno (double r)
>  {
> +  r = math_narrow_eval (r);
>    if (isinf (r))
>      __set_errno (ERANGE);
>    return r;
>  }
>  
> -/* sqrt (DBL_EPSILON / 2.0)  */
> -#define SQRT_EPS_DIV_2     0x1.6a09e667f3bcdp-27
> -/* DBL_MIN / (sqrt (DBL_EPSILON / 2.0))   */
> -#define DBL_MIN_THRESHOLD  0x1.6a09e667f3bcdp-996
> -/* eps (double) * sqrt (DBL_MIN))  */
> -#define SCALE              0x1p-563
> -/* 1 / eps (sqrt (DBL_MIN)  */
> -#define INV_SCALE          0x1p+563
> -/* sqrt (DBL_MAX)  */
> -#define SQRT_DBL_MAX       0x1.6a09e667f3bccp+511
> -/* sqrt (DBL_MIN)  */
> -#define SQRT_DBL_MIN       0x1p-511
> -
> -double
> -__hypot (double x, double y)
> +static inline double
> +kernel (double ax, double ay)
>  {
> -  if ((isinf (x) || isinf (y))
> -      && !issignaling (x) && !issignaling (y))
> -    return INFINITY;
> -  if (isnan (x) || isnan (y))
> -    return x + y;
> -
> -  double ax = fabs (x);
> -  double ay = fabs (y);
> -  if (ay > ax)
> -    {
> -      double tmp = ax;
> -      ax = ay;
> -      ay = tmp;
> -    }
> -
> -  /* Widely varying operands.  The DBL_MIN_THRESHOLD check is used to avoid
> -     a spurious underflow from the multiplication.  */
> -  if (ax >= DBL_MIN_THRESHOLD && ay <= ax * SQRT_EPS_DIV_2)
> -    return (ay == 0.0)
> -	   ? ax
> -	   : handle_errno (math_narrow_eval (ax + DBL_TRUE_MIN));
> +  double t1, t2;
> +#ifdef __FP_FAST_FMA
> +  t1 = ay + ay;
> +  t2 = ax - ay;
>  
> -  double scale = SCALE;
> -  if (ax > SQRT_DBL_MAX)
> -    {
> -      ax *= scale;
> -      ay *= scale;
> -      scale = INV_SCALE;
> -    }
> -  else if (ay < SQRT_DBL_MIN)
> -    {
> -      ax /= scale;
> -      ay /= scale;
> -    }
> +  if (t1 >= ax)
> +    return sqrt (fma (t1, ax, t2 * t2));
>    else
> -    scale = 1.0;
> -
> +    return sqrt (fma (ax, ax, ay * ay));
> +#else
>    double h = sqrt (ax * ax + ay * ay);
>  
> -  double t1, t2;
> -  if (h == 0.0)
> -    return h;
> -  else if (h <= 2.0 * ay)
> +  if (h <= 2.0 * ay)
>      {
>        double delta = h - ay;
>        t1 = ax * (2.0 * delta - ax);
> @@ -112,14 +79,57 @@ __hypot (double x, double y)
>    else
>      {
>        double delta = h - ax;
> -      t1 = 2.0 * delta * (ax - 2 * ay);
> +      t1 = 2.0 * delta * (ax - 2.0 * ay);
>        t2 = (4.0 * delta - ay) * ay + delta * delta;
>      }
>    h -= (t1 + t2) / (2.0 * h);
> -  h = math_narrow_eval (h * scale);
> -  math_check_force_underflow_nonneg (h);
> -  return handle_errno (h);
> +  return h;
> +#endif
> +}
> +
> +
> +double
> +__hypot (double x, double y)
> +{
> +  if (!isfinite (x) || !isfinite (y))
> +    {
> +      if ((isinf (x) || isinf (y))
> +	  && !issignaling_inline (x) && !issignaling_inline (y))
> +	return INFINITY;
> +      return x + y;
> +    }
> +
> +  x = fabs (x);
> +  y = fabs (y);
> +
> +  double ax = FAST_FMINMAX ? fmax (x, y) : (x < y ? y : x);
> +  double ay = FAST_FMINMAX ? fmin (x, y) : (x < y ? x : y);
> +
> +  if (__glibc_unlikely (ax > LARGE_VAL))
> +    {
> +      if (__glibc_unlikely (ay <= ax * EPS))
> +	return handle_errno (ax + ay);
> +
> +      return handle_errno (kernel (ax * SCALE, ay * SCALE) / SCALE);
> +    }
> +
> +  if (__glibc_unlikely (ay < TINY_VAL))
> +    {
> +      if (__glibc_unlikely (ax >= ay / EPS))
> +	return math_narrow_eval (ax + ay);
> +
> +      ax = math_narrow_eval (kernel (ax / SCALE, ay / SCALE) * SCALE);
> +      math_check_force_underflow_nonneg (ax);
> +      return ax;
> +    }
> +
> +  /* Common case: ax is not huge and ay is not tiny.  */
> +  if (__glibc_unlikely (ay <= ax * EPS))
> +    return math_narrow_eval (ax + ay);
> +
> +  return math_narrow_eval (kernel (ax, ay));
>  }
> +
>  strong_alias (__hypot, __ieee754_hypot)
>  libm_alias_finite (__ieee754_hypot, __hypot)
>  #if LIBM_SVID_COMPAT
  
Wilco Dijkstra Nov. 18, 2021, 12:37 p.m. UTC | #2
Hi Paul,

> On Power10, this implementation is still has a large delta compared to the
> current implementation:

It's obvious something went wrong here since it is slower than Adhemerval's
version - did you look at the generated code?

It should use fma, and no division or function calls (if you see calls to fmin/fmax
you need to set FAST_FMINMAX to 0).

Cheers,
Wilco
  
Paul A. Clarke Nov. 18, 2021, 3:43 p.m. UTC | #3
On Thu, Nov 18, 2021 at 12:37:24PM +0000, Wilco Dijkstra wrote:
> > On Power10, this implementation is still has a large delta compared to the
> > current implementation:
> 
> It's obvious something went wrong here since it is slower than Adhemerval's
> version - did you look at the generated code?
> 
> It should use fma, and no division or function calls (if you see calls to fmin/fmax
> you need to set FAST_FMINMAX to 0).

Ugh. That was it. I missed that subtlety. (Why would I not want "FAST" to be
true?  :-)

Anyway, with FAST_FMINMAX set to 1, the results are much better, and compare
favorably with the current implementation:

current:
  "hypot": {
   "workload-random": {
    "duration": 5.26088e+08,
    "iterations": 4.8e+07,
    "reciprocal-throughput": 8.2599,
    "latency": 13.6604,
    "max-throughput": 1.21067e+08,
    "min-throughput": 7.32042e+07
   }
  }

with Adhemerval's patch and this patch with FAST_FMINMAX=1:
  "hypot": {
   "workload-random": {
    "duration": 5.30541e+08,
    "iterations": 5.2e+07,
    "reciprocal-throughput": 7.22461,
    "latency": 13.1808,
    "max-throughput": 1.38416e+08,
    "min-throughput": 7.58679e+07
   }
  }

PC
  
Paul A. Clarke Nov. 18, 2021, 4:15 p.m. UTC | #4
On Thu, Nov 18, 2021 at 09:43:12AM -0600, Paul A. Clarke via Libc-alpha wrote:
> On Thu, Nov 18, 2021 at 12:37:24PM +0000, Wilco Dijkstra wrote:
> > > On Power10, this implementation is still has a large delta compared to the
> > > current implementation:
> > 
> > It's obvious something went wrong here since it is slower than Adhemerval's
> > version - did you look at the generated code?
> > 
> > It should use fma, and no division or function calls (if you see calls to fmin/fmax
> > you need to set FAST_FMINMAX to 0).
> 
> Ugh. That was it. I missed that subtlety. (Why would I not want "FAST" to be
> true?  :-)
> 
> Anyway, with FAST_FMINMAX set to 1, the results are much better, and compare

That should say "with FAST_FMINMAX set to *0*".

> favorably with the current implementation:
> 
> current:
>   "hypot": {
>    "workload-random": {
>     "duration": 5.26088e+08,
>     "iterations": 4.8e+07,
>     "reciprocal-throughput": 8.2599,
>     "latency": 13.6604,
>     "max-throughput": 1.21067e+08,
>     "min-throughput": 7.32042e+07
>    }
>   }
> 
> with Adhemerval's patch and this patch with FAST_FMINMAX=1:
>   "hypot": {
>    "workload-random": {
>     "duration": 5.30541e+08,
>     "iterations": 5.2e+07,
>     "reciprocal-throughput": 7.22461,
>     "latency": 13.1808,
>     "max-throughput": 1.38416e+08,
>     "min-throughput": 7.58679e+07
>    }
>   }
> 
> PC
  
Wilco Dijkstra Nov. 18, 2021, 5:39 p.m. UTC | #5
Hi Paul,

Great, that's about 14% higher throughput and 3.5% lower latency!

Looking in more detail, it appears AArch64 may be the only target
which inlines fmin/fmax with -O2, so the default is now off unless
math_private.h overrides it.

Cheers,
Wilco
  

Patch

diff --git a/sysdeps/ieee754/dbl-64/e_hypot.c b/sysdeps/ieee754/dbl-64/e_hypot.c
index d20bc3e3657e350a1103a8f8477db35ee60399e0..3906711788cff5d66725e5879bb4d6c36dd24dc7 100644
--- a/sysdeps/ieee754/dbl-64/e_hypot.c
+++ b/sysdeps/ieee754/dbl-64/e_hypot.c
@@ -37,73 +37,40 @@ 
 #include <math-svid-compat.h>
 #include <errno.h>
 
+#define FAST_FMINMAX 1
+//#undef __FP_FAST_FMA
+
+#define SCALE     0x1p-600
+#define LARGE_VAL 0x1p+511
+#define TINY_VAL  0x1p-459
+#define EPS       0x1p-54
+
+
 static inline double
 handle_errno (double r)
 {
+  r = math_narrow_eval (r);
   if (isinf (r))
     __set_errno (ERANGE);
   return r;
 }
 
-/* sqrt (DBL_EPSILON / 2.0)  */
-#define SQRT_EPS_DIV_2     0x1.6a09e667f3bcdp-27
-/* DBL_MIN / (sqrt (DBL_EPSILON / 2.0))   */
-#define DBL_MIN_THRESHOLD  0x1.6a09e667f3bcdp-996
-/* eps (double) * sqrt (DBL_MIN))  */
-#define SCALE              0x1p-563
-/* 1 / eps (sqrt (DBL_MIN)  */
-#define INV_SCALE          0x1p+563
-/* sqrt (DBL_MAX)  */
-#define SQRT_DBL_MAX       0x1.6a09e667f3bccp+511
-/* sqrt (DBL_MIN)  */
-#define SQRT_DBL_MIN       0x1p-511
-
-double
-__hypot (double x, double y)
+static inline double
+kernel (double ax, double ay)
 {
-  if ((isinf (x) || isinf (y))
-      && !issignaling (x) && !issignaling (y))
-    return INFINITY;
-  if (isnan (x) || isnan (y))
-    return x + y;
-
-  double ax = fabs (x);
-  double ay = fabs (y);
-  if (ay > ax)
-    {
-      double tmp = ax;
-      ax = ay;
-      ay = tmp;
-    }
-
-  /* Widely varying operands.  The DBL_MIN_THRESHOLD check is used to avoid
-     a spurious underflow from the multiplication.  */
-  if (ax >= DBL_MIN_THRESHOLD && ay <= ax * SQRT_EPS_DIV_2)
-    return (ay == 0.0)
-	   ? ax
-	   : handle_errno (math_narrow_eval (ax + DBL_TRUE_MIN));
+  double t1, t2;
+#ifdef __FP_FAST_FMA
+  t1 = ay + ay;
+  t2 = ax - ay;
 
-  double scale = SCALE;
-  if (ax > SQRT_DBL_MAX)
-    {
-      ax *= scale;
-      ay *= scale;
-      scale = INV_SCALE;
-    }
-  else if (ay < SQRT_DBL_MIN)
-    {
-      ax /= scale;
-      ay /= scale;
-    }
+  if (t1 >= ax)
+    return sqrt (fma (t1, ax, t2 * t2));
   else
-    scale = 1.0;
-
+    return sqrt (fma (ax, ax, ay * ay));
+#else
   double h = sqrt (ax * ax + ay * ay);
 
-  double t1, t2;
-  if (h == 0.0)
-    return h;
-  else if (h <= 2.0 * ay)
+  if (h <= 2.0 * ay)
     {
       double delta = h - ay;
       t1 = ax * (2.0 * delta - ax);
@@ -112,14 +79,57 @@  __hypot (double x, double y)
   else
     {
       double delta = h - ax;
-      t1 = 2.0 * delta * (ax - 2 * ay);
+      t1 = 2.0 * delta * (ax - 2.0 * ay);
       t2 = (4.0 * delta - ay) * ay + delta * delta;
     }
   h -= (t1 + t2) / (2.0 * h);
-  h = math_narrow_eval (h * scale);
-  math_check_force_underflow_nonneg (h);
-  return handle_errno (h);
+  return h;
+#endif
+}
+
+
+double
+__hypot (double x, double y)
+{
+  if (!isfinite (x) || !isfinite (y))
+    {
+      if ((isinf (x) || isinf (y))
+	  && !issignaling_inline (x) && !issignaling_inline (y))
+	return INFINITY;
+      return x + y;
+    }
+
+  x = fabs (x);
+  y = fabs (y);
+
+  double ax = FAST_FMINMAX ? fmax (x, y) : (x < y ? y : x);
+  double ay = FAST_FMINMAX ? fmin (x, y) : (x < y ? x : y);
+
+  if (__glibc_unlikely (ax > LARGE_VAL))
+    {
+      if (__glibc_unlikely (ay <= ax * EPS))
+	return handle_errno (ax + ay);
+
+      return handle_errno (kernel (ax * SCALE, ay * SCALE) / SCALE);
+    }
+
+  if (__glibc_unlikely (ay < TINY_VAL))
+    {
+      if (__glibc_unlikely (ax >= ay / EPS))
+	return math_narrow_eval (ax + ay);
+
+      ax = math_narrow_eval (kernel (ax / SCALE, ay / SCALE) * SCALE);
+      math_check_force_underflow_nonneg (ax);
+      return ax;
+    }
+
+  /* Common case: ax is not huge and ay is not tiny.  */
+  if (__glibc_unlikely (ay <= ax * EPS))
+    return math_narrow_eval (ax + ay);
+
+  return math_narrow_eval (kernel (ax, ay));
 }
+
 strong_alias (__hypot, __ieee754_hypot)
 libm_alias_finite (__ieee754_hypot, __hypot)
 #if LIBM_SVID_COMPAT