Patchwork Speedup tanf range reduction

login
register
mail settings
Submitter Wilco Dijkstra
Date Aug. 22, 2018, 12:29 p.m.
Message ID <HE1PR08MB10357845B62EA7A4DFCD927783300@HE1PR08MB1035.eurprd08.prod.outlook.com>
Download mbox | patch
Permalink /patch/29011/
State New
Headers show

Comments

Wilco Dijkstra - Aug. 22, 2018, 12:29 p.m.
Speed up tanf range reduction by using the new sincosf range
reduction algorithm.  Overall code quality is improved due to
inlining, so there is a speedup even if no range reduction is
required.

Passes GLIBC testsuite on AArch64.  Some files are no longer
required - they are removed in the next patch.

tanf throughput gains on Cortex-A72:
* |x| < M_PI_4 : 1.1x
* |x| < M_PI_2  : 1.2x
* |x| < 2 * M_PI: 1.5x
* |x| < 120.0   : 1.6x
* |x| < Inf     : 12.1x

ChangeLog:
2018-08-22  Wilco Dijkstra  <wdijkstr@arm.com>

	* sysdeps/ieee754/flt-32/s_tanf.c (__tanf): Use fast range reduction.

--
Joseph Myers - Aug. 22, 2018, 12:34 p.m.
On Wed, 22 Aug 2018, Wilco Dijkstra wrote:

> +
> +static inline int32_t
> +rem_pio2f (float x, float *y)

Please put a comment on this function documenting its semantics.
Adhemerval Zanella Netto - Aug. 22, 2018, 1:05 p.m.
On 22/08/2018 09:29, Wilco Dijkstra wrote:
> Speedup tanf range reduction by using the new sincosf range
> reduction algorithm.  Overall code quality is improved due to
> inlining, so there is a speedup even if no range reduction is
> required.
> 
> Passes GLIBC testsuite on AArch64.  Some files are no longer
> required - they are removed in the next patch.
> 
> tanf throughput gains on Cortex-A72:
> * |x| < M_PI_4 : 1.1x
> * |x| < M_PI_2  : 1.2x
> * |x| < 2 * M_PI: 1.5x
> * |x| < 120.0   : 1.6x
> * |x| < Inf     : 12.1x
> 
> ChangeLog:
> 2018-08-22  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* sysdeps/ieee754/flt-32/s_tanf.c (__tanf): Use fast range reduction.

tanf seems to be the only place where __ieee754_rem_pio2f is used, so we
can remove sysdeps/ieee754/flt-32/e_rem_pio2f.c and
sysdeps/powerpc/fpu/e_rem_pio2f.c.  Also, I would like to confirm that
it is faster than the powerpc version (I would expect so, but it would
at least be good to get some actual numbers).

> 
> --
> diff --git a/sysdeps/ieee754/flt-32/s_tanf.c b/sysdeps/ieee754/flt-32/s_tanf.c
> index ba3af54913669e4abdfd864307856ec44138f9b9..a397665c4bab7785049935ef526472afedf82e34 100644
> --- a/sysdeps/ieee754/flt-32/s_tanf.c
> +++ b/sysdeps/ieee754/flt-32/s_tanf.c
> @@ -21,6 +21,30 @@ static char rcsid[] = "$NetBSD: s_tanf.c,v 1.4 1995/05/10 20:48:20 jtc Exp $";
>  #include <math.h>
>  #include <math_private.h>
>  #include <libm-alias-float.h>
> +#include "s_sincosf.h"
> +
> +static inline int32_t
> +rem_pio2f (float x, float *y)
> +{
> +  double dx = x;
> +  int n;
> +  const sincos_t *p = &__sincosf_table[0];
> +
> +  if (__glibc_likely (abstop12 (x) < abstop12 (120.0f)))
> +    dx = reduce_fast (dx, p, &n);
> +  else
> +    {
> +      uint32_t xi = asuint (x);
> +      int sign = xi >> 31;
> +
> +      dx = reduce_large (xi, &n);
> +      dx = sign ? -dx : dx;
> +    }
> +
> +  y[0] = dx;
> +  y[1] = dx - y[0];
> +  return n;
> +}
>  
>  float __tanf(float x)
>  {
> @@ -42,7 +66,7 @@ float __tanf(float x)
>  
>      /* argument reduction needed */
>  	else {
> -	    n = __ieee754_rem_pio2f(x,y);
> +	    n = rem_pio2f(x,y);
>  	    return __kernel_tanf(y[0],y[1],1-((n&1)<<1)); /*   1 -- n even
>  							      -1 -- n odd */
>  	}
>

Patch

diff --git a/sysdeps/ieee754/flt-32/s_tanf.c b/sysdeps/ieee754/flt-32/s_tanf.c
index ba3af54913669e4abdfd864307856ec44138f9b9..a397665c4bab7785049935ef526472afedf82e34 100644
--- a/sysdeps/ieee754/flt-32/s_tanf.c
+++ b/sysdeps/ieee754/flt-32/s_tanf.c
@@ -21,6 +21,30 @@  static char rcsid[] = "$NetBSD: s_tanf.c,v 1.4 1995/05/10 20:48:20 jtc Exp $";
 #include <math.h>
 #include <math_private.h>
 #include <libm-alias-float.h>
+#include "s_sincosf.h"
+
+static inline int32_t
+rem_pio2f (float x, float *y)
+{
+  double dx = x;
+  int n;
+  const sincos_t *p = &__sincosf_table[0];
+
+  if (__glibc_likely (abstop12 (x) < abstop12 (120.0f)))
+    dx = reduce_fast (dx, p, &n);
+  else
+    {
+      uint32_t xi = asuint (x);
+      int sign = xi >> 31;
+
+      dx = reduce_large (xi, &n);
+      dx = sign ? -dx : dx;
+    }
+
+  y[0] = dx;
+  y[1] = dx - y[0];
+  return n;
+}
 
 float __tanf(float x)
 {
@@ -42,7 +66,7 @@  float __tanf(float x)
 
     /* argument reduction needed */
 	else {
-	    n = __ieee754_rem_pio2f(x,y);
+	    n = rem_pio2f(x,y);
 	    return __kernel_tanf(y[0],y[1],1-((n&1)<<1)); /*   1 -- n even
 							      -1 -- n odd */
 	}