On 06/07/17 10:55, Ashwin Sekhar T K wrote:
> The algorithm is based on the sinf and cosf implementations
> for powerpc in ./powerpc/powerpc64/power8/fpu and that for
> x86 in ./x86_64/fpu.
>
> * sysdeps/aarch64/fpu/chebyshev.h: New file.
> * sysdeps/aarch64/fpu/reduce.h: Likewise.
> * sysdeps/aarch64/fpu/s_cosf.c: Likewise.
> * sysdeps/aarch64/fpu/s_sinf.c: Likewise.
better than asm, but i think this should be the generic
implementation if all targets want it.
> +/* sin (x) = x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))),
> + where 2^-5<=|x|<Pi/4. */
> +static inline double
> +chebyshev_sin_polynomial_1 (double x)
> +{
> + double S0, S1, S2, S3, S4, y;
> +
> + S0 = -1.66666666666265311791406134034332e-01;
> + S1 = 8.33333332439092043519845987020744e-03;
> + S2 = -1.98412633515623692105969699817081e-04;
> + S3 = 2.75552591873811586009688362475245e-06;
> + S4 = -2.47545996176987174320511533257699e-08;
> + y = x * x;
> + return x * (1.0 + y * (S0 + y * (S1 + y * (S2 + y * (S3 + y * S4)))));
> +}
i think one less coefficient should be enough
(may be optimized for max norm instead of chebyshev)
and modern cores tend to have more than one fp
pipelines (most targets, not just aarch64) so
evaluating the polynomial differently would be
faster (more multiplications but less latency
since they can be done in parallel).
> +/* 4/Pi broken into sum of positive DP values. */
> +static double INVPIO4_TABLE[25] = {
> + 0.00000000000000000000000000000000e+00,
> + 1.27323953807353973388671875000000e+00,
> + 6.66162294771233121082332218065858e-09,
> + 4.55202009562027200395410188316081e-18,
> + 5.05365203780056430379174094616698e-26,
> + 3.33657353140390501092355175630980e-34,
> + 2.31774265657771014271759915567725e-43,
> + 1.41079183488085906188916890478151e-51,
> + 1.78201357714620429447917285584916e-59,
> + 6.45440934111020426845713721490652e-68,
> + 2.96289605657163538186678664280422e-77,
> + 2.34290278673081796193027756956770e-85,
> + 6.89165747744598758512920848666612e-94,
> + 2.61827895738527799147711877424548e-102,
> + 5.22516501694879285329083609946870e-111,
> + 2.31723558129677581012126324525784e-119,
> + 5.36762980505877895920512518100769e-128,
> + 2.49914273179058265651805723374739e-135,
> + 3.32028340088181692034775714274009e-144,
> + 1.98261407071607720791943444409774e-152,
> + 1.34423330943468258263688817309715e-162,
> + 2.61360711509865349546753597258351e-169,
> + 2.26640747502086603170125306310953e-178,
> + 2.39096791372273354372455558796085e-186,
> + 2.14336400443697767564443045577643e-194
> +};
> +
> +/* Range Reduction for Pi/4<=|x|<Inf
> +
> + Returns n and y, where
> + y = |x| - j * Pi/4,
> + j = (k + 1) & 0xfffffffe,
> + n = (k + 1) & 0x7,
> + k = trunc (|x| / (Pi / 4)).
> +
> + In other words,
> + n is the pi/4 octant that x falls into in a trignometric plane.
> + y is the difference between x and the multiple of pi/2 closest to x. */
> +
i guess this is fine, but it is possible to do arg reduction
in a simpler way with mostly int arithmetics and then less
data is needed (24bytes instead of 200).
> +float
> +__cosf (float x)
> +{
> + uint32_t ix;
> +
> + GET_FLOAT_WORD (ix, x);
> + ix &= I_SPABS_MASK;
> +
> + if (ix >= I_SPABS_PIO4)
> + goto large_args;
> +
> + if (ix < I_SPABS_2POWN5)
> + goto small_args;
> +
> + /* Here if 2^-5<=|x|<Pi/4. */
> + return chebyshev_cos_polynomial_1 (x);
> +
> +large_args:
> + if (ix < I_SPABS_INF)
> + {
> + /* Here if Pi/4<=|x|<Inf. */
> + double t, sign, val;
> + int n;
> +
> + n = reducef (x, &t);
> +
> + if (((n + 2) >> 2) & 0x1)
> + sign = -1.0;
> + else
> + sign = 1.0;
> +
> + if ((n + 2) & 0x2)
> + val = chebyshev_cos_polynomial_1 (t);
> + else
> + val = chebyshev_sin_polynomial_1 (t);
> +
> + return sign * val;
> + }
> + /* Here if x is Inf or Nan. */
> + if (ix == I_SPABS_INF)
> + __set_errno (EDOM);
> + return x - x;
i'd do the failure handling separately (tailcall)
so __sinf and __cosf are leaf functions.
new file mode 100644
@@ -0,0 +1,84 @@
+/* Functions for evaluating chebysev polynomials for sin and cos
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef __CHEBYSHEV_H__
+#define __CHEBYSHEV_H__
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+
+/* sin (x) = x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))),
+ where 2^-5<=|x|<Pi/4. */
+static inline double
+chebyshev_sin_polynomial_1 (double x)
+{
+ double S0, S1, S2, S3, S4, y;
+
+ S0 = -1.66666666666265311791406134034332e-01;
+ S1 = 8.33333332439092043519845987020744e-03;
+ S2 = -1.98412633515623692105969699817081e-04;
+ S3 = 2.75552591873811586009688362475245e-06;
+ S4 = -2.47545996176987174320511533257699e-08;
+ y = x * x;
+ return x * (1.0 + y * (S0 + y * (S1 + y * (S2 + y * (S3 + y * S4)))));
+}
+
+/* cos (x) = 1+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))),
+ where 2^-5<=|x|<Pi/4. */
+static inline double
+chebyshev_cos_polynomial_1 (double x)
+{
+ double C0, C1, C2, C3, C4, y;
+
+ C0 = -4.99999999994893751242841517523630e-01;
+ C1 = 4.16666665534264832326805105822132e-02;
+ C2 = -1.38888806593809050610177635576292e-03;
+ C3 = 2.47989607241011055654977823792251e-05;
+ C4 = -2.71747891329266278019784327732444e-07;
+ y = x * x;
+ return 1.0 + y * (C0 + y * (C1 + y * (C2 + y * (C3 + y * C4))));
+}
+
+/* sin (x) = x+x^3*(SS0+x^2*SS1),
+ where 2^-27<=|x|<2^-5. */
+static inline double
+chebyshev_sin_polynomial_2 (double x)
+{
+ double SS0, SS1, y;
+
+ SS0 = -1.66666666634829235826842364076583e-01;
+ SS1 = 8.33312019844746100505350483445000e-03;
+ y = x * x;
+ return x * (1.0 + y * (SS0 + y * SS1));
+}
+
+/* cos (x) = 1+x^2*(CC0+x^2*CC1),
+ where 2^-27<=|x|<2^-5. */
+static inline double
+chebyshev_cos_polynomial_2 (double x)
+{
+ double CC0, CC1, y;
+
+ CC0 = -4.99999999406199269191830580894020e-01;
+ CC1 = 4.16647402420742621331761768033175e-02;
+ y = x * x;
+ return 1.0 + y * (CC0 + y * CC1);
+}
+
+#endif
new file mode 100644
@@ -0,0 +1,142 @@
+/* Range reduction functions for sin and cos inputs
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef __REDUCE_H__
+#define __REDUCE_H__
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+
+#define I_SPABS_MASK 0x7fffffff
+#define I_SPABS_2POWN27 0x32000000
+#define I_SPABS_2POWN5 0x3d000000
+#define I_SPABS_PIO4 0x3f490fdb
+#define I_SPABS_9PIO4 0x40e231d6
+#define I_SPABS_2POW23 0x4b000000
+#define I_SPABS_INF 0x7f800000
+
+/* 4/Pi broken into sum of positive DP values. */
+static double INVPIO4_TABLE[25] = {
+ 0.00000000000000000000000000000000e+00,
+ 1.27323953807353973388671875000000e+00,
+ 6.66162294771233121082332218065858e-09,
+ 4.55202009562027200395410188316081e-18,
+ 5.05365203780056430379174094616698e-26,
+ 3.33657353140390501092355175630980e-34,
+ 2.31774265657771014271759915567725e-43,
+ 1.41079183488085906188916890478151e-51,
+ 1.78201357714620429447917285584916e-59,
+ 6.45440934111020426845713721490652e-68,
+ 2.96289605657163538186678664280422e-77,
+ 2.34290278673081796193027756956770e-85,
+ 6.89165747744598758512920848666612e-94,
+ 2.61827895738527799147711877424548e-102,
+ 5.22516501694879285329083609946870e-111,
+ 2.31723558129677581012126324525784e-119,
+ 5.36762980505877895920512518100769e-128,
+ 2.49914273179058265651805723374739e-135,
+ 3.32028340088181692034775714274009e-144,
+ 1.98261407071607720791943444409774e-152,
+ 1.34423330943468258263688817309715e-162,
+ 2.61360711509865349546753597258351e-169,
+ 2.26640747502086603170125306310953e-178,
+ 2.39096791372273354372455558796085e-186,
+ 2.14336400443697767564443045577643e-194
+};
+
+/* Range Reduction for Pi/4<=|x|<Inf
+
+ Returns n and y, where
+ y = |x| - j * Pi/4,
+ j = (k + 1) & 0xfffffffe,
+ n = (k + 1) & 0x7,
+ k = trunc (|x| / (Pi / 4)).
+
+ In other words,
+ n is the pi/4 octant that x falls into in a trignometric plane.
+ y is the difference between x and the multiple of pi/2 closest to x. */
+
+static int
+reducef (float x, double *y)
+{
+ double INVPIO4, PIO4, PIO4HI, PIO4LO;
+ uint64_t n, ix;
+ double t;
+
+ INVPIO4 = 1.27323954473516276486577680771006e+00;
+ PIO4 = 7.85398163397448278999490867136046e-01;
+ PIO4HI = -7.85398162901401519775390625000000e-01;
+ PIO4LO = -4.96046789840270212596747252887163e-10;
+
+ GET_FLOAT_WORD (ix, x);
+ ix = ix & I_SPABS_MASK;
+
+ if (ix < I_SPABS_9PIO4)
+ {
+ /* Here if Pi/4<=|x|<9*Pi/4. */
+ double j;
+
+ t = fabs (x);
+ n = t * INVPIO4 + 1.0;
+ j = n & 0x0e;
+ t = t - j * PIO4;
+ }
+ else if (ix < I_SPABS_2POW23)
+ {
+ /* Here if 9*Pi/4<=|x|<2^23. */
+ double j;
+
+ t = fabs (x);
+ n = t * INVPIO4 + 1.0;
+ j = n & 0xfffffffe;
+ t = t + j * PIO4HI + j * PIO4LO;
+ }
+ else
+ {
+ /* Here if 2^23<=|x|<=Inf. */
+ double tmp0, tmp1, tmp2, tmp3;
+ uint64_t bitpos, j;
+
+ bitpos = (ix >> 23) - 0x7f + 59;
+ t = fabs (x);
+ j = (bitpos * ((0x100000000 / 28) + 1)) >> 32;
+ tmp0 = INVPIO4_TABLE[j - 2] * t;
+ tmp1 = INVPIO4_TABLE[j - 1] * t;
+ tmp2 = INVPIO4_TABLE[j] * t;
+ tmp3 = INVPIO4_TABLE[j + 1] * t;
+ tmp0 = tmp0 - ((uint64_t)tmp0 & ~0x7);
+ n = tmp0 + tmp1;
+ tmp0 = tmp0 - n;
+ t = tmp0 + tmp1 + tmp2 + tmp3;
+ if (n & 0x1)
+ t = t - 1.0;
+ else if (t > 1.0)
+ {
+ t = t - 2.0;
+ n = n + 1;
+ }
+ n = n + 1;
+ t = t * PIO4;
+ }
+ *y = t;
+
+ return n & 0x7;
+}
+
+#endif
new file mode 100644
@@ -0,0 +1,110 @@
+/* Optimized version of cosf
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+#include "chebyshev.h"
+#include "reduce.h"
+
+/* Short algorithm description:
+
+ 1) if |x| < 2^-27: return 1.0-|x|.
+ 2) if |x| < 2^-5 : return 1.0+x^2*DP_COS2_0+x^5*DP_COS2_1.
+ 3) if |x| < Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
+ 4) if |x| < Inf:
+ 5.1) Range reduction: k=trunc (|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1,
+ t=|x|-j*Pi/4.
+ 5.2) Reconstruction:
+ s = (-1.0)^(((n+2)>>2)&1)
+ if ((n+2)&2 != 0)
+ {
+ using cos (t) polynomial for |t|<Pi/4, result is
+ s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ }
+ else
+ {
+ using sin (t) polynomial for |t|<Pi/4, result is
+ s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ }
+ 5) if x is Inf : return x-x, and set errno=EDOM.
+ 6) if x is NaN : return x-x.
+
+ Special cases:
+ cos (+-0) = 1 not raising inexact,
+ cos (subnormal) raises inexact,
+ cos (min_normalized) raises inexact,
+ cos (normalized) raises inexact,
+ cos (Inf) = NaN, raises invalid, sets errno to EDOM,
+ cos (NaN) = NaN. */
+
+float
+__cosf (float x)
+{
+ uint32_t ix;
+
+ GET_FLOAT_WORD (ix, x);
+ ix &= I_SPABS_MASK;
+
+ if (ix >= I_SPABS_PIO4)
+ goto large_args;
+
+ if (ix < I_SPABS_2POWN5)
+ goto small_args;
+
+ /* Here if 2^-5<=|x|<Pi/4. */
+ return chebyshev_cos_polynomial_1 (x);
+
+large_args:
+ if (ix < I_SPABS_INF)
+ {
+ /* Here if Pi/4<=|x|<Inf. */
+ double t, sign, val;
+ int n;
+
+ n = reducef (x, &t);
+
+ if (((n + 2) >> 2) & 0x1)
+ sign = -1.0;
+ else
+ sign = 1.0;
+
+ if ((n + 2) & 0x2)
+ val = chebyshev_cos_polynomial_1 (t);
+ else
+ val = chebyshev_sin_polynomial_1 (t);
+
+ return sign * val;
+ }
+ /* Here if x is Inf or Nan. */
+ if (ix == I_SPABS_INF)
+ __set_errno (EDOM);
+ return x - x;
+
+small_args:
+ if (ix >= I_SPABS_2POWN27)
+ {
+ /* Here if 2^-27<=|x|<2^-5. */
+ return chebyshev_cos_polynomial_2 (x);
+ }
+
+ /* Here if |x| < 2^-27. */
+ return 1.0 - fabsf (x);
+}
+
+weak_alias (__cosf, cosf);
new file mode 100644
@@ -0,0 +1,120 @@
+/* Optimized version of sinf
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <errno.h>
+#include <math.h>
+#include <math_private.h>
+#include "chebyshev.h"
+#include "reduce.h"
+
+/* Short algorithm description:
+
+ 1) if |x| == 0 : return x.
+ 2) if |x| < 2^-27: return x-x*DP_SMALL, raise underflow only when needed.
+ 3) if |x| < 2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1.
+ 4) if |x| < Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).
+ 5) if |x| < Inf:
+ 5.1) Range reduction: k=trunc (|x|/(Pi/4)), j=(k+1)&0x0ffffffe, n=k+1,
+ t=|x|-j*Pi/4.
+ 5.2) Reconstruction:
+ s = sign (x) * (-1.0)^((n>>2)&1)
+ if (n&2 != 0)
+ {
+ using cos (t) polynomial for |t|<Pi/4, result is
+ s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
+ }
+ else
+ {
+ using sin (t) polynomial for |t|<Pi/4, result is
+ s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
+ }
+ 6) if x is Inf : return x-x, and set errno=EDOM.
+ 7) if x is NaN : return x-x.
+
+ Special cases:
+ sin (+-0) = +-0 not raising inexact/underflow,
+ sin (subnormal) raises inexact/underflow,
+ sin (min_normalized) raises inexact/underflow,
+ sin (normalized) raises inexact,
+ sin (Inf) = NaN, raises invalid, sets errno to EDOM,
+ sin (NaN) = NaN. */
+
+float
+__sinf (float x)
+{
+ uint32_t ix, sign_x;
+
+ GET_FLOAT_WORD (ix, x);
+ sign_x = ix >> 31;
+ ix &= I_SPABS_MASK;
+
+ if (ix >= I_SPABS_PIO4)
+ goto large_args;
+
+ if (ix < I_SPABS_2POWN5)
+ goto small_args;
+
+ /* Here if 2^-5<=|x|<Pi/4. */
+ return chebyshev_sin_polynomial_1 (x);
+
+large_args:
+ if (ix < I_SPABS_INF)
+ {
+ /* Here if Pi/4<=|x|<Inf. */
+ double t, sign, val;
+ int n;
+
+ n = reducef (x, &t);
+
+ if ((sign_x ^ (n >> 2)) & 0x1)
+ sign = -1.0;
+ else
+ sign = 1.0;
+
+ if (n & 0x2)
+ val = chebyshev_cos_polynomial_1 (t);
+ else
+ val = chebyshev_sin_polynomial_1 (t);
+
+ return sign * val;
+ }
+
+ /* Here if x is Inf or Nan. */
+ if (ix == I_SPABS_INF)
+ __set_errno (EDOM);
+ return x - x;
+
+small_args:
+ if (ix >= I_SPABS_2POWN27)
+ {
+ /* Here if 2^-27<=|x|<2^-5. */
+ return chebyshev_sin_polynomial_2 (x);
+ }
+ else if (ix > 0)
+ {
+ /* Here if 0<=|x|<2^-27. */
+ double SMALL = 8.88178419700125232338905334472656e-16;
+
+ return x - x * SMALL;
+ }
+
+ /* Here if |x| == 0. */
+ return x;
+}
+
+weak_alias (__sinf, sinf);