Patchwork [6/6] Remove slow paths from sin/cos

login
register
mail settings
Submitter Wilco Dijkstra
Date March 9, 2018, 3:52 p.m.
Message ID <DB6PR0801MB20536DE50F24E83BB7B6182383DE0@DB6PR0801MB2053.eurprd08.prod.outlook.com>
Download mbox | patch
Permalink /patch/26255/
State New
Headers show

Comments

Wilco Dijkstra - March 9, 2018, 3:52 p.m.
Restructure the sincos implementation - rather than rely on odd partial inlining
of preprocessed portions from sin and cos, explicitly write out the cases.

ChangeLog:
2018-03-09  Wilco Dijkstra  <wdijkstr@arm.com>

	* sysdeps/ieee754/dbl-64/s_sin.c (__sin): Cleanup ifdefs.
	(__cos): Likewise.
	* sysdeps/ieee754/dbl-64/s_sin.c (__sincos): Reimplement using the same
	logic as sin and cos.

--
Siddhesh Poyarekar - March 9, 2018, 4:02 p.m.
On Friday 09 March 2018 09:22 PM, Wilco Dijkstra wrote:
> Restructure the sincos implementation - rather than rely on odd partial inlining
> of preprocessed portions from sin and cos, explicitly write out the cases.

The intention of keeping the inlines was to avoid duplicating code
across files.  With this change one now must remember to make changes in
both files at all times, increasing the chances of an error.  Do you see
any gain from duplicating code?

Siddhesh
Wilco Dijkstra - March 12, 2018, 5:31 p.m.
Siddhesh Poyarekar wrote:

> The intention of keeping the inlines was to avoid duplicating code
> across files.  With this change one now must remember to make changes in
> both files at all times, increasing the chances of an error.  Do you see
> any gain from duplicating code?

I had lots of issues due to parts of functions being included in sincos.c, so I had to
disable sincos during development. The new version is much easier to maintain.

This patch on its own gives a 16-20% speedup between 0 and 8*PI (this speedup
continues up to 2^27). The overall speedup of sincos is 48% over this range, and
between 0 and PI it is 66% faster. So I'd say specializing the small cases in sincos
gives a significant gain.

Wilco
Siddhesh Poyarekar - March 13, 2018, 8:45 a.m.
On Monday 12 March 2018 11:01 PM, Wilco Dijkstra wrote:
> I had lots of issues due to parts of functions being included in sincos.c, so I had to
> disable sincos during development. The new version is much easier to maintain.

That's subjective; someone half a decade down the line might differ when
they see bits of code duplicated across files :)

> This patch on its own gives a 16-20% speedup between 0 and 8*PI (this speedup
> continues up to 2^27). The overall speedup of sincos is 48% over this range, and
> between 0 and PI it is 66% faster. So I'd say specializing the small cases in sincos
> gives a significant gain.

That's a good rationale, please include it in your commit log.

Siddhesh
Wilco Dijkstra - March 13, 2018, 3:29 p.m.
Siddhesh Poyarekar wrote:
> On Monday 12 March 2018 11:01 PM, Wilco Dijkstra wrote:
>> I had lots of issues due to parts of functions being included in sincos.c, so I had to
>> disable sincos during development. The new version is much easier to maintain.
>
> That's subjective; someone half a decade down the line might differ when
> they see bits of code duplicated across files :)

This code won't last half a decade - it should be further simplified and made much faster.

This pattern is repeated 6 times with slight variations. It may be feasible to
merge it all into do_sin:

          if (k < 0x3fd00000)
            {
              xx = x * x;
              t = POLYNOMIAL (xx) * (xx * x);
              *sinx = x + t;
            }
          else
            *sinx = __copysign (do_sin (x, 0), x);

The other is:

      /* |x| < 2.426265.  */
      y = hp0 - fabs (x);
      a = y + hp1;
      da = (y - a) + hp1;

That's tiny enough that I don't see an issue.

Wilco

Patch

diff --git a/sysdeps/ieee754/dbl-64/s_sin.c b/sysdeps/ieee754/dbl-64/s_sin.c
index 91b0abc9c3ac21dae0e673576940ef97bfd20c23..8f804a42e6d94652a62f81b2e0b053135cf9f03a 100644
--- a/sysdeps/ieee754/dbl-64/s_sin.c
+++ b/sysdeps/ieee754/dbl-64/s_sin.c
@@ -208,12 +208,9 @@  do_sincos (double a, double da, int4 n)
 /* An ultimate sin routine. Given an IEEE double machine number x   */
 /* it computes the correctly rounded (to nearest) value of sin(x)  */
 /*******************************************************************/
-#ifdef IN_SINCOS
-static double
-#else
+#ifndef IN_SINCOS
 double
 SECTION
-#endif
 __sin (double x)
 {
   double xx, t, a, da;
@@ -221,9 +218,7 @@  __sin (double x)
   int4 k, m, n;
   double retval = 0;
 
-#ifndef IN_SINCOS
   SET_RESTORE_ROUND_53BIT (FE_TONEAREST);
-#endif
 
   u.x = x;
   m = u.i[HIGH_HALF];
@@ -257,7 +252,6 @@  __sin (double x)
       retval = __copysign (do_cos (t, hp1), x);
     }				/*   else  if (k < 0x400368fd)    */
 
-#ifndef IN_SINCOS
 /*-------------------------- 2.426265<|x|< 105414350 ----------------------*/
   else if (k < 0x419921FB)
     {
@@ -278,12 +272,6 @@  __sin (double x)
 	__set_errno (EDOM);
       retval = x / x;
     }
-#else
-    /* Disable warning... */
-    n = 0, n = n;
-    a = 0, a = a;
-    da = 0, da = da;
-#endif
 
   return retval;
 }
@@ -294,12 +282,8 @@  __sin (double x)
 /* it computes the correctly rounded (to nearest) value of cos(x)  */
 /*******************************************************************/
 
-#ifdef IN_SINCOS
-static double
-#else
 double
 SECTION
-#endif
 __cos (double x)
 {
   double y, xx, a, da;
@@ -308,9 +292,7 @@  __cos (double x)
 
   double retval = 0;
 
-#ifndef IN_SINCOS
   SET_RESTORE_ROUND_53BIT (FE_TONEAREST);
-#endif
 
   u.x = x;
   m = u.i[HIGH_HALF];
@@ -340,8 +322,6 @@  __cos (double x)
 	retval = __copysign (do_sin (a, da), a);
     }				/*   else  if (k < 0x400368fd)    */
 
-
-#ifndef IN_SINCOS
   else if (k < 0x419921FB)
     {				/* 2.426265<|x|< 105414350 */
       n = reduce_sincos (x, &a, &da);
@@ -361,10 +341,6 @@  __cos (double x)
 	__set_errno (EDOM);
       retval = x / x;		/* |x| > 2^1024 */
     }
-#else
-    /* Disable warning... */
-    n = 0, n = n;
-#endif
 
   return retval;
 }
@@ -375,3 +351,5 @@  libm_alias_double (__cos, cos)
 #ifndef __sin
 libm_alias_double (__sin, sin)
 #endif
+
+#endif
diff --git a/sysdeps/ieee754/dbl-64/s_sincos.c b/sysdeps/ieee754/dbl-64/s_sincos.c
index 4335ecbba3c9894e61c087ac970b392fa73abfab..c04972707b284e37b15e82933a00250cda959985 100644
--- a/sysdeps/ieee754/dbl-64/s_sincos.c
+++ b/sysdeps/ieee754/dbl-64/s_sincos.c
@@ -23,9 +23,7 @@ 
 #include <math_private.h>
 #include <libm-alias-double.h>
 
-#define __sin __sin_local
-#define __cos __cos_local
-#define IN_SINCOS 1
+#define IN_SINCOS
 #include "s_sin.c"
 
 void
@@ -37,31 +35,79 @@  __sincos (double x, double *sinx, double *cosx)
   SET_RESTORE_ROUND_53BIT (FE_TONEAREST);
 
   u.x = x;
-  k = 0x7fffffff & u.i[HIGH_HALF];
+  k = u.i[HIGH_HALF] & 0x7fffffff;
 
   if (k < 0x400368fd)
     {
-      *sinx = __sin_local (x);
-      *cosx = __cos_local (x);
-      return;
-    }
-  if (k < 0x419921FB)
-    {
-      double a, da;
-      int4 n = reduce_sincos (x, &a, &da);
-
-      *sinx = do_sincos (a, da, n);
-      *cosx = do_sincos (a, da, n + 1);
+      double t, xx, a, da, y;
+      /* |x| < 2^-27 => cos (x) = 1, sin (x) = x.  */
+      if (k < 0x3e400000)
+	{
+	  if (k < 0x3e500000)
+	    math_check_force_underflow (x);
+	  *sinx = x;
+	  *cosx = 1.0;
+	  return;
+	}
+      /* |x| < 0.855469.  */
+      else if (k < 0x3feb6000)
+	{
+	  /* |x| < 0.25.  */
+	  if (k < 0x3fd00000)
+	    {
+	      xx = x * x;
+	      t = POLYNOMIAL (xx) * (xx * x);
+	      *sinx = x + t;
+	    }
+	  else
+	    *sinx = __copysign (do_sin (x, 0), x);
+	  *cosx = do_cos (x, 0);
+	  return;
+	}
 
+      /* |x| < 2.426265.  */
+      y = hp0 - fabs (x);
+      a = y + hp1;
+      da = (y - a) + hp1;
+      *sinx = __copysign (do_cos (a, da), x);
+      xx = a * a;
+      if (xx < 0.01588)
+	*cosx = TAYLOR_SIN (xx, a, da);
+      else
+	*cosx = __copysign (do_sin (a, da), a);
       return;
     }
+  /* |x| < 2^1024.  */
   if (k < 0x7ff00000)
     {
-      double a, da;
-      int4 n = __branred (x, &a, &da);
+      double a, da, xx;
+      unsigned int n;
 
-      *sinx = do_sincos (a, da, n);
-      *cosx = do_sincos (a, da, n + 1);
+      /* If |x| < 105414350 use simple range reduction.  */
+      n = k < 0x419921FB ? reduce_sincos (x, &a, &da) : __branred (x, &a, &da);
+      n = n & 3;
+
+      if (n == 1 || n == 2)
+	{
+	  a = -a;
+	  da = -da;
+	}
+
+      if (n & 1)
+	{
+	  double *temp = cosx;
+	  cosx = sinx;
+	  sinx = temp;
+	}
+
+      xx = a * a;
+      if (xx < 0.01588)
+	*sinx = TAYLOR_SIN (xx, a, da);
+      else
+	*sinx = __copysign (do_sin (a, da), a);
+      xx = do_cos (a, da);
+      *cosx = (n & 2) ? -xx : xx;
+      return;
     }
 
   if (isinf (x))