[v6,1/4] Fix the inaccuracy of j0f and of y0f [BZ #14469 and #14471]
Commit Message
For both j0f and y0f, the largest error for all binary32
inputs is reduced to at most 9 ulps for all rounding modes.
The new code is enabled only when there is a cancellation at the very end of
the j0f/y0f computation, or for very large inputs, thus should not give any
visible slowdown on average. Two different algorithms are used:
* around the first 64 zeros of j0/y0, approximation polynomials of degree 3
are used, computed using the Sollya tool (https://www.sollya.org/)
* for large inputs, an asymptotic formula from [1] is used
[1] Fast and Accurate Bessel Function Computation,
John Harrison, Proceedings of Arith 19, 2009.
Tested on x86_64 with --disable-multi-arch and on powerpc64le-linux-gnu.
---
sysdeps/ieee754/flt-32/e_j0f.c | 418 +++++++++++++++++++++++++++++----
1 file changed, 378 insertions(+), 40 deletions(-)
Comments
On 01/03/2021 05:20, Paul Zimmermann wrote:
> For both j0f and y0f, the largest error for all binary32
> inputs is reduced to at most 9 ulps for all rounding modes.
>
> The new code is enabled only when there is a cancellation at the very end of
> the j0f/y0f computation, or for very large inputs, thus should not give any
> visible slowdown on average. Two different algorithms are used:
>
> * around the first 64 zeros of j0/y0, approximation polynomials of degree 3
> are used, computed using the Sollya tool (https://www.sollya.org/)
>
> * for large inputs, an asymptotic formula from [1] is used
>
> [1] Fast and Accurate Bessel Function Computation,
> John Harrison, Proceedings of Arith 19, 2009.
>
> Tested on x86_64 with --disable-multi-arch and on powerpc64le-linux-gnu.
> ---
> sysdeps/ieee754/flt-32/e_j0f.c | 418 +++++++++++++++++++++++++++++----
> 1 file changed, 378 insertions(+), 40 deletions(-)
>
> diff --git a/sysdeps/ieee754/flt-32/e_j0f.c b/sysdeps/ieee754/flt-32/e_j0f.c
> index 5d29611eb7..1a4e152139 100644
> --- a/sysdeps/ieee754/flt-32/e_j0f.c
> +++ b/sysdeps/ieee754/flt-32/e_j0f.c
> @@ -16,7 +16,9 @@
> #include <math.h>
> #include <math-barriers.h>
> #include <math_private.h>
> +#include <fenv_private.h>
> #include <libm-alias-finite.h>
> +#include <s_sincosf.h>
>
> static float pzerof(float), qzerof(float);
>
> @@ -37,6 +39,190 @@ S04 = 1.1661400734e-09; /* 0x30a045e8 */
>
> static const float zero = 0.0;
>
> +#define FIRST_ZERO_J0 2.404825f
For newer code we use hexadecimal floating-point constants. For this
specific one I think it is better to use the value directly and add a comment
explaining its meaning (it is used only once).
> +
> +#define SMALL_SIZE 64
> +
> +/* The following table contains successive zeros of j0 and degree-3 polynomial
> + approximations of j0 around these zeros: Pj[0] for the first zero (2.40482),
Line too long.
> + Pj[1] for the second one (5.520078), and so on. Each line contains:
> + {x0, xmid, x1, p0, p1, p2, p3}
> + where [x0,x1] is the interval around the zero, xmid is the binary32 number
> + closest to the zero, and p0+p1*x+p2*x^2+p3*x^3 is the approximation
> + polynomial. Each polynomial was generated using Sollya on the interval
> + [x0,x1] around the corresponding zero where the error is larger than 9 ulps
> + for the alternate code. Degree 3 is enough to get an error <= 9 ulps.
> +*/
> +static const float Pj[SMALL_SIZE][7] = {
> + /* The following polynomial was optimized by hand with respect to the one
> + generated by Sollya, to ensure the maximal error is at most 9 ulps,
> + both if the polynomial is evaluated with fma or not. */
> + { 0x1.31e5c4p+1, 0x1.33d152p+1, 0x1.3b58dep+1, 0xf.2623fp-28, -0x8.4e6d7p-4, 0x1.ba2aaap-4, 0xe.4b9ap-8 }, /* 0 */
> + { 0x1.60eafap+2, 0x1.6148f6p+2, 0x1.62955cp+2, 0x6.9205fp-28, 0x5.71b98p-4, -0x7.e3e798p-8, -0xd.87d1p-8 }, /* 1 */
> + { 0x1.14cde2p+3, 0x1.14eb56p+3, 0x1.1525c6p+3, 0x1.bcc1cap-24, -0x4.57de6p-4, 0x4.03e7cp-8, 0xb.39a37p-8 }, /* 2 */
> + { 0x1.7931d8p+3, 0x1.79544p+3, 0x1.7998d6p+3, -0xf.2976fp-32, 0x3.b827ccp-4, -0x2.8603ep-8, -0x9.bf49bp-8 }, /* 3 */
> + { 0x1.ddb6d4p+3, 0x1.ddca14p+3, 0x1.ddf0c8p+3, -0x1.bd67d8p-28, -0x3.4e03ap-4, 0x1.c562a2p-8, 0x8.90ec2p-8 }, /* 4 */
> + { 0x1.2118e4p+4, 0x1.212314p+4, 0x1.21375p+4, 0x1.62209cp-28, 0x3.00efecp-4, -0x1.5458dap-8, -0x8.10063p-8 }, /* 5 */
> + { 0x1.535d28p+4, 0x1.5362dep+4, 0x1.536e48p+4, -0x2.853f74p-24, -0x2.c5b274p-4, 0x1.0b9db4p-8, 0x7.8c3578p-8 }, /* 6 */
> + { 0x1.859ddp+4, 0x1.85a3bap+4, 0x1.85aff4p+4, 0x2.19ed1cp-24, 0x2.96545cp-4, -0xd.997e6p-12, -0x6.d9af28p-8 }, /* 7 */
> + { 0x1.b7decap+4, 0x1.b7e54ap+4, 0x1.b7f038p+4, 0xe.959aep-28, -0x2.6f5594p-4, 0xb.538dp-12, 0x7.003ea8p-8 }, /* 8 */
> + { 0x1.ea21c6p+4, 0x1.ea275ap+4, 0x1.ea337ap+4, 0x2.0c3964p-24, 0x2.4e80fcp-4, -0x9.a2216p-12, -0x6.61e0a8p-8 }, /* 9 */
> + { 0x1.0e3316p+5, 0x1.0e34e2p+5, 0x1.0e379ap+5, -0x3.642554p-24, -0x2.325e48p-4, 0x8.4f49cp-12, 0x7.d37c3p-8 }, /* 10 */
> + { 0x1.275456p+5, 0x1.275638p+5, 0x1.2759e2p+5, 0x1.6c015ap-24, 0x2.19e7d8p-4, -0x7.4c1bf8p-12, -0x4.af7ef8p-8 }, /* 11 */
> + { 0x1.4075ecp+5, 0x1.4077a8p+5, 0x1.407b96p+5, -0x4.b18c9p-28, -0x2.046174p-4, 0x6.705618p-12, 0x5.f2d28p-8 }, /* 12 */
> + { 0x1.59973p+5, 0x1.59992cp+5, 0x1.599b2ap+5, -0x1.8b8792p-24, 0x1.f13fbp-4, -0x5.c14938p-12, -0x5.73e0cp-8 }, /* 13 */
> + { 0x1.72b958p+5, 0x1.72bacp+5, 0x1.72bc5ap+5, 0x3.a26e0cp-24, -0x1.e018dap-4, 0x5.30e8dp-12, 0x2.81099p-8 }, /* 14 */
> + { 0x1.8bdb4ap+5, 0x1.8bdc62p+5, 0x1.8bde7ep+5, -0x2.18fabcp-24, 0x1.d09b22p-4, -0x4.b0b688p-12, -0x5.5fd308p-8 }, /* 15 */
> + { 0x1.a4fcecp+5, 0x1.a4fe0ep+5, 0x1.a50042p+5, 0x3.2370e8p-24, -0x1.c28614p-4, 0x4.4647e8p-12, 0x5.68a28p-8 }, /* 16 */
> + { 0x1.be1ebcp+5, 0x1.be1fc4p+5, 0x1.be21fp+5, -0x5.9eae3p-28, 0x1.b5a622p-4, -0x3.eb9054p-12, -0x5.12d8cp-8 }, /* 17 */
> + { 0x1.d7405p+5, 0x1.d7418p+5, 0x1.d74294p+5, 0x2.9fa1e8p-24, -0x1.a9d184p-4, 0x3.9d1e7p-12, 0x4.33d058p-8 }, /* 18 */
> + { 0x1.f0624p+5, 0x1.f06344p+5, 0x1.f0645ep+5, 0x9.9ac67p-28, 0x1.9ee5eep-4, -0x3.5816e8p-12, -0x2.6e5004p-8 }, /* 19 */
> + { 0x1.04c22ep+6, 0x1.04c286p+6, 0x1.04c316p+6, 0xd.6ab94p-28, -0x1.94c6f6p-4, 0x3.174efcp-12, 0x7.9a092p-8 }, /* 20 */
> + { 0x1.1153p+6, 0x1.11536cp+6, 0x1.11541p+6, -0x4.4cb2d8p-24, 0x1.8b5cccp-4, -0x2.e3c238p-12, -0x4.e5437p-8 }, /* 21 */
> + { 0x1.1de3d8p+6, 0x1.1de456p+6, 0x1.1de4dap+6, -0x4.4aa8c8p-24, -0x1.829356p-4, 0x2.b45124p-12, 0x5.baf638p-8 }, /* 22 */
> + { 0x1.2a74f8p+6, 0x1.2a754p+6, 0x1.2a75bp+6, 0x2.077c38p-24, 0x1.7a597ep-4, -0x2.8a0414p-12, -0x2.838d3p-8 }, /* 23 */
> + { 0x1.3705d4p+6, 0x1.37062cp+6, 0x1.3706b2p+6, -0x2.6a6cd8p-24, -0x1.72a09ap-4, 0x2.623a3cp-12, 0x5.5256a8p-8 }, /* 24 */
> + { 0x1.4396dp+6, 0x1.439718p+6, 0x1.43976ep+6, -0x5.08287p-24, 0x1.6b5c06p-4, -0x2.3da154p-12, -0x7.a2254p-8 }, /* 25 */
> + { 0x1.5027acp+6, 0x1.502808p+6, 0x1.50288cp+6, -0x3.4598dcp-24, -0x1.6480c4p-4, 0x2.1cb944p-12, 0x7.27c77p-8 }, /* 26 */
> + { 0x1.5cb89ap+6, 0x1.5cb8f8p+6, 0x1.5cb97ep+6, 0x5.4e74bp-24, 0x1.5e0544p-4, -0x2.00b158p-12, -0x5.9bc4a8p-8 }, /* 27 */
> + { 0x1.69498cp+6, 0x1.6949e8p+6, 0x1.694a42p+6, -0x2.05751cp-24, -0x1.57e12p-4, 0x1.e78edcp-12, 0x9.9667dp-8 }, /* 28 */
> + { 0x1.75da7ep+6, 0x1.75dadap+6, 0x1.75db3p+6, 0x4.c5e278p-24, 0x1.520ceep-4, -0x1.d0127cp-12, -0xd.62681p-8 }, /* 29 */
> + { 0x1.826b7ep+6, 0x1.826bccp+6, 0x1.826c2cp+6, -0x3.50e62cp-24, -0x1.4c822p-4, 0x1.ba5832p-12, -0x1.eb2ee2p-8 }, /* 30 */
> + { 0x1.8efc84p+6, 0x1.8efcbep+6, 0x1.8efd16p+6, -0x1.c39f38p-24, 0x1.473ae6p-4, -0x1.a616c8p-12, 0xf.f352ap-12 }, /* 31 */
> + { 0x1.9b8d84p+6, 0x1.9b8db2p+6, 0x1.9b8e7p+6, -0x1.9245b6p-28, -0x1.42320ap-4, 0x1.932a04p-12, 0x2.dc113cp-8 }, /* 32 */
> + { 0x1.a81e72p+6, 0x1.a81ea6p+6, 0x1.a81f04p+6, -0x1.0acf8p-24, 0x1.3d62e6p-4, -0x1.7c4b14p-12, -0x1.cfc5c2p-4 }, /* 33 */
> + { 0x1.b4af6ap+6, 0x1.b4af9ap+6, 0x1.b4afeep+6, 0x4.cd92d8p-24, -0x1.38c94ap-4, 0x1.643154p-12, 0x1.4c2a06p-4 }, /* 34 */
> + { 0x1.c1406p+6, 0x1.c1409p+6, 0x1.c140cp+6, -0x1.37bf8ap-24, 0x1.34617p-4, -0x1.5f504ap-12, -0x1.e2d324p-4 }, /* 35 */
> + { 0x1.cdd154p+6, 0x1.cdd186p+6, 0x1.cdd1eap+6, -0x1.8f62dep-28, -0x1.3027fp-4, 0x1.534a02p-12, 0x2.c7f144p-12 }, /* 36 */
> + { 0x1.da6248p+6, 0x1.da627cp+6, 0x1.da62e6p+6, -0x9.81e79p-28, 0x1.2c19b4p-4, -0x1.4b8288p-12, 0x7.2d8bap-8 }, /* 37 */
> + { 0x1.e6f33ep+6, 0x1.e6f372p+6, 0x1.e6f3a8p+6, 0x3.103b3p-24, -0x1.2833eep-4, 0x1.36f4d2p-12, 0x9.29f91p-8 }, /* 38 */
> + { 0x1.f38434p+6, 0x1.f3846ap+6, 0x1.f384d8p+6, 0x2.07b058p-24, 0x1.24740ap-4, -0x1.2ee58ap-12, 0xd.f1393p-12 }, /* 39 */
> + { 0x1.000a98p+7, 0x1.000abp+7, 0x1.000ac8p+7, 0x3.87576cp-24, -0x1.20d7b6p-4, 0x1.2083e2p-12, 0x3.9a7aap-8 }, /* 40 */
> + { 0x1.06531p+7, 0x1.06532cp+7, 0x1.065348p+7, -0x1.691ecp-24, 0x1.1d5ccap-4, -0x1.166726p-12, -0x1.e4af48p-8 }, /* 41 */
> + { 0x1.0c9b9ap+7, 0x1.0c9ba8p+7, 0x1.0c9bbep+7, 0x9.b406dp-28, -0x1.1a015p-4, 0x1.038f9cp-12, -0x4.021058p-4 }, /* 42 */
> + { 0x1.12e412p+7, 0x1.12e424p+7, 0x1.12e436p+7, -0xf.bfd8fp-28, 0x1.16c37ap-4, -0x1.039edep-12, 0x1.f0033p-4 }, /* 43 */
> + { 0x1.192c92p+7, 0x1.192cap+7, 0x1.192cb6p+7, 0x2.6d50c8p-24, -0x1.13a19ep-4, 0xf.9df8ap-16, 0x4.ecd978p-8 }, /* 44 */
> + { 0x1.1f7512p+7, 0x1.1f751cp+7, 0x1.1f753ap+7, -0x4.d475c8p-24, 0x1.109a32p-4, -0x1.04fb3ap-12, -0xd.c271p-12 }, /* 45 */
> + { 0x1.25bd8ep+7, 0x1.25bd98p+7, 0x1.25bdap+7, 0x8.1982p-24, -0x1.0dabc8p-4, 0xe.88eabp-16, -0x4.ed75dp-4 }, /* 46 */
> + { 0x1.2c060cp+7, 0x1.2c0616p+7, 0x1.2c0644p+7, 0x4.864518p-24, 0x1.0ad51p-4, -0xe.27196p-16, 0xb.97a3ep-8 }, /* 47 */
> + { 0x1.324e86p+7, 0x1.324e92p+7, 0x1.324e9ep+7, 0x6.8917a8p-28, -0x1.0814d4p-4, 0xd.4fe7ep-16, -0x6.8d8d6p-4 }, /* 48 */
> + { 0x1.389702p+7, 0x1.38970ep+7, 0x1.389728p+7, -0x5.fa18fp-24, 0x1.0569fp-4, -0xd.5b0d4p-16, 0x1.50353ap-4 }, /* 49 */
> + { 0x1.3edf84p+7, 0x1.3edf8cp+7, 0x1.3edfaap+7, -0x4.0e5c98p-24, -0x1.02d354p-4, 0xb.7b255p-16, 0x7.8a916p-4 }, /* 50 */
> + { 0x1.4527fp+7, 0x1.452808p+7, 0x1.452812p+7, -0x2.c3ddbp-24, 0x1.005004p-4, -0xd.7729cp-16, -0x3.bcc354p-8 }, /* 51 */
> + { 0x1.4b7076p+7, 0x1.4b7086p+7, 0x1.4b70a4p+7, -0x5.d052p-24, -0xf.ddf16p-8, 0xc.318c1p-16, 0x5.7947p-8 }, /* 52 */
> + { 0x1.51b8f4p+7, 0x1.51b902p+7, 0x1.51b90ep+7, -0x2.0b97dcp-24, 0xf.b7fafp-8, -0xc.1429dp-16, -0x3.43c36p-4 }, /* 53 */
> + { 0x1.580168p+7, 0x1.58018p+7, 0x1.580188p+7, -0x5.4aab5p-24, -0xf.930fep-8, 0xa.ecc24p-16, 0x9.c62cdp-12 }, /* 54 */
> + { 0x1.5e49eap+7, 0x1.5e49fcp+7, 0x1.5e4a12p+7, -0x3.6dadd8p-24, 0xf.6f245p-8, -0xb.6816cp-16, 0xa.d731ap-8 }, /* 55 */
> + { 0x1.649272p+7, 0x1.64927ap+7, 0x1.64929p+7, -0x2.d7e038p-24, -0xf.4c2cep-8, 0xb.118bep-16, 0xb.69a4ep-8 }, /* 56 */
> + { 0x1.6adae6p+7, 0x1.6adaf6p+7, 0x1.6adb04p+7, -0x6.977a1p-24, 0xf.2a1fp-8, -0xa.a8911p-16, -0x4.bf6d2p-8 }, /* 57 */
> + { 0x1.712366p+7, 0x1.712374p+7, 0x1.71238ep+7, 0x1.3cc95ep-24, -0xf.08f0ap-8, 0x9.f0858p-16, 0x1.77f7f4p-4 }, /* 58 */
> + { 0x1.776beap+7, 0x1.776bf2p+7, 0x1.776bfap+7, 0x3.a4921p-24, 0xe.e8986p-8, -0xa.39dfp-16, -0x6.7ba3dp-4 }, /* 59 */
> + { 0x1.7db464p+7, 0x1.7db46ep+7, 0x1.7db476p+7, 0x6.b45a7p-24, -0xe.c90d8p-8, 0xa.e586fp-16, -0x1.d66becp-4 }, /* 60 */
> + { 0x1.83fce2p+7, 0x1.83fcecp+7, 0x1.83fd0ep+7, -0x2.8f34a4p-24, 0xe.aa478p-8, -0x9.810bp-16, -0x3.a5f3fcp-8 }, /* 61 */
> + { 0x1.8a455cp+7, 0x1.8a456ap+7, 0x1.8a4588p+7, -0x1.325968p-24, -0xe.8c3eap-8, 0x9.0a765p-16, 0x1.29a54ap-4 }, /* 62 */
> + { 0x1.908dd8p+7, 0x1.908de8p+7, 0x1.908df4p+7, 0x4.96b808p-24, 0xe.6eeb5p-8, -0x9.0251bp-16, 0x1.41a488p-4 }, /* 63 */
> +};
Please wrap it to fit within the 78-column limit, something like:
--
static const float Pj[SMALL_SIZE][7] =
{
/* The following polynomial was optimized by hand with respect to the one
generated by Sollya, to ensure the maximal error is at most 9 ulps,
both if the polynomial is evaluated with fma or not. */
{ 0x1.31e5c4p+1, 0x1.33d152p+1, 0x1.3b58dep+1, 0xf.2623fp-28,
-0x8.4e6d7p-4, 0x1.ba2aaap-4, 0xe.4b9ap-8 }, /* 0 */
--
It adds a 1792-byte data table, but that should be fine since
it fixes a long-standing issue.
> +
> +/* Return h and update n such that:
> + Now x - pi/4 - alpha0 = h + n*pi/2 mod (2*pi). */
> +static double
> +reduce_aux (float x, int *n, double alpha0)
> +{
> + double h;
> + h = reduce_large (asuint (x), n);
> + /* Now |x| = h+n*pi/2 mod 2*pi. */
> + /* Recover sign. */
Double space after period. There are some other occurrences as well.
> + if (x < 0)
> + {
> + h = -h;
> + *n = -*n;
> + }
> + /* Subtract pi/4. */
> + double piover2 = 0xc.90fdaa22168cp-3;
> + if (h >= 0)
> + h -= piover2 / 2;
> + else
> + {
> + h += piover2 / 2;
> + (*n) --;
> + }
> + /* Subtract alpha0 and reduce if needed mod pi/2. */
> + h -= alpha0;
> + if (h > piover2)
> + {
> + h -= piover2;
> + (*n) ++;
> + }
> + else if (h < -piover2)
> + {
> + h += piover2;
> + (*n) --;
> + }
> + return h;
> +}
> +
> +/* Formula page 5 of https://www.cl.cam.ac.uk/~jrh13/papers/bessel.pdf:
> + j0f(x) ~ sqrt(2/(pi*x))*beta0(x)*cos(x-pi/4-alpha0(x))
> + where beta0(x) = 1 - 1/(16*x^2) + 53/(512*x^4)
> + and alpha0(x) = 1/(8*x) - 25/(384*x^3). */
> +static float
> +j0f_asympt (float x)
> +{
> + /* The following code fails to give an error <= 9 ulps in only two cases,
> + for which we tabulate the result. */
> + if (x == 0x1.4665d2p+24f)
> + return 0xa.50206p-52f;
> + if (x == 0x1.a9afdep+7f)
> + return 0xf.47039p-28f;
> + double y = 1.0 / (double) x;
> + double y2 = y * y;
> + double beta0 = 1.0f + y2 * (-0x1p-4f + 0x1.a8p-4 * y2);
> + double alpha0 = y * (0x2p-4 - 0x1.0aaaaap-4 * y2);
> + double h;
> + int n;
> + h = reduce_aux (x, &n, alpha0);
> + /* Now x - pi/4 - alpha0 = h + n*pi/2 mod (2*pi). */
> + float xr = (float) h;
> + n = n & 3;
> + float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */
> + float t = cst / sqrtf (x) * (float) beta0;
> + if (n == 0)
> + return t * cosf (xr);
> + else if (n == 2) /* cos(x+pi) = -cos(x) */
> + return -t * cosf (xr);
> + else if (n == 1) /* cos(x+pi/2) = -sin(x) */
> + return -t * sinf (xr);
> + else /* cos(x+3pi/2) = sin(x) */
> + return t * sinf (xr);
This will generate an unnecessary internal PLT call; use the double-underscore
internal symbols instead (__cosf, __sinf).
> +}
> +
> +/* Special code for x near a root of j0.
> + z is the value computed by the generic code.
> + For small x, we use a polynomial approximating j0 around its root.
> + For large x, we use an asymptotic formula (j0f_asympt). */
> +static float
> +j0f_near_root (float x, float z)
> +{
> + float index_f;
> + int index;
> +
> + index_f = roundf ((x - FIRST_ZERO_J0) / (float) M_PI);
> + /* j0f_asympt fails to give an error <= 9 ulps for x=0x1.324e92p+7 (index 48)
Line too long.
> + thus we can't reduce SMALL_SIZE below 49. */
> + if (index_f >= SMALL_SIZE)
> + return j0f_asympt (x);
> + index = (int) index_f;
> + const float *p = Pj[index];
> + float x0 = p[0];
> + float x1 = p[2];
> + /* If not in the interval [x0,x1] around xmid, we return the value z. */
> + if (! (x0 <= x && x <= x1))
> + return z;
> + float xmid = p[1];
> + float y = x - xmid;
> + return p[3] + y * (p[4] + y * (p[5] + y * p[6]));
> +}
> +
> float
> __ieee754_j0f(float x)
> {
> @@ -48,39 +234,34 @@ __ieee754_j0f(float x)
> if(ix>=0x7f800000) return one/(x*x);
> x = fabsf(x);
> if(ix >= 0x40000000) { /* |x| >= 2.0 */
> + SET_RESTORE_ROUNDF (FE_TONEAREST);
> __sincosf (x, &s, &c);
> ss = s-c;
> cc = s+c;
> - if(ix<0x7f000000) { /* make sure x+x not overflow */
> - z = -__cosf(x+x);
> - if ((s*c)<zero) cc = z/ss;
> - else ss = z/cc;
> - } else {
> - /* We subtract (exactly) a value x0 such that
> - cos(x0)+sin(x0) is very near to 0, and use the identity
> - sin(x-x0) = sin(x)*cos(x0)-cos(x)*sin(x0) to get
> - sin(x) + cos(x) with extra accuracy. */
> - float x0 = 0xe.d4108p+124f;
> - float y = x - x0; /* exact */
> - /* sin(y) = sin(x)*cos(x0)-cos(x)*sin(x0) */
> - z = __sinf (y);
> - float eps = 0x1.5f263ep-24f;
> - /* cos(x0) ~ -sin(x0) + eps */
> - z += eps * __cosf (x);
> - /* now z ~ (sin(x)-cos(x))*cos(x0) */
> - float cosx0 = -0xb.504f3p-4f;
> - cc = z / cosx0;
> - }
> + if (ix >= 0x7f000000) /* x >= 2^127: use asymptotic expansion. */
> + return j0f_asympt (x);
> + /* Now we are sure that x+x cannot overflow. */
> + z = -__cosf(x+x);
> + if ((s*c)<zero) cc = z/ss;
> + else ss = z/cc;
> /*
> * j0(x) = 1/sqrt(pi) * (P(0,x)*cc - Q(0,x)*ss) / sqrt(x)
> * y0(x) = 1/sqrt(pi) * (P(0,x)*ss + Q(0,x)*cc) / sqrt(x)
> */
> - if(ix>0x5c000000) z = (invsqrtpi*cc)/sqrtf(x);
> - else {
> - u = pzerof(x); v = qzerof(x);
> - z = invsqrtpi*(u*cc-v*ss)/sqrtf(x);
> - }
> - return z;
> + if (ix <= 0x5c000000)
> + {
> + u = pzerof(x); v = qzerof(x);
> + cc = u*cc-v*ss;
> + }
> + z = (invsqrtpi * cc) / sqrtf(x);
> + /* The following threshold is optimal: for x=0x1.3b58dep+1
> + and rounding upwards, |cc|=0x1.579b26p-4 and z is 10 ulps
> + far from the correctly rounded value. */
> + float threshold = 0x1.579b26p-4;
> + if (fabsf (cc) > threshold)
> + return z;
> + else
> + return j0f_near_root (x, z);
> }
> if(ix<0x39000000) { /* |x| < 2**-13 */
> math_force_eval(huge+x); /* raise inexact if x != 0 */
> @@ -112,6 +293,153 @@ v02 = 7.6006865129e-05, /* 0x389f65e0 */
> v03 = 2.5915085189e-07, /* 0x348b216c */
> v04 = 4.4111031494e-10; /* 0x2ff280c2 */
>
> +#define FIRST_ZERO_Y0 0.893576f
> +
> +#define SMALL_SIZE 64
You redefine it in the same file; I think it would be better to either use
only one definition or use different names for 'Pj' and 'Py'.
> +
> +/* The following table contains successive zeros of y0 and degree-3 polynomial
> + approximations of y0 around these zeros: Py[0] for the first zero (0.89358),
> + Py[1] for the second one (3.957678), and so on. Each line contains:
> + {x0, xmid, x1, p0, p1, p2, p3}
> + where [x0,x1] is the interval around the zero, xmid is the binary32 number
> + closest to the zero, and p0+p1*x+p2*x^2+p3*x^3 is the approximation
> + polynomial. Each polynomial was generated using Sollya on the interval
> + [x0,x1] around the corresponding zero where the error is larger than 9 ulps
> + for the alternate code. Degree 3 is enough, except for index 0 where we use
> + degree 5, and the coefficients of degree 4 and 5 are hard-coded in
> + y0f_near_root.
> +*/
> +static const float Py[SMALL_SIZE][7] = {
> + { 0x1.a681dap-1, 0x1.c982ecp-1, 0x1.ef6bcap-1, 0x3.274468p-28, 0xe.121b8p-4, -0x7.df8b3p-4, 0x3.877be4p-4/*, -0x3.a46c9p-4, 0x3.735478p-4*/ }, /* 0 */
> + { 0x1.f957c6p+1, 0x1.fa9534p+1, 0x1.fd11b2p+1, 0xa.f1f83p-28, -0x6.70d098p-4, 0xd.04d48p-8, 0xe.f61a9p-8 }, /* 1 */
> + { 0x1.c51832p+2, 0x1.c581dcp+2, 0x1.c65164p+2, -0x5.e2a51p-28, 0x4.cd3328p-4, -0x5.6bbe08p-8, -0xc.46d8p-8 }, /* 2 */
> + { 0x1.46fd84p+3, 0x1.471d74p+3, 0x1.475bfcp+3, -0x1.4b0aeep-24, -0x3.fec6b8p-4, 0x3.2068a4p-8, 0xa.76e2dp-8 }, /* 3 */
> + { 0x1.ab7afap+3, 0x1.ab8e1cp+3, 0x1.abb294p+3, -0x8.179d7p-28, 0x3.7e6544p-4, -0x2.1799fp-8, -0x9.0e1c4p-8 }, /* 4 */
> + { 0x1.07f9aap+4, 0x1.0803c8p+4, 0x1.08170cp+4, -0x2.5b8078p-24, -0x3.24b868p-4, 0x1.8631ecp-8, 0x8.3cb46p-8 }, /* 5 */
> + { 0x1.3a38eap+4, 0x1.3a42cep+4, 0x1.3a4d8ap+4, 0x1.cd304ap-28, 0x2.e189ecp-4, -0x1.2c6954p-8, -0x7.8178ep-8 }, /* 6 */
> + { 0x1.6c7d42p+4, 0x1.6c833p+4, 0x1.6c99fp+4, -0x6.c63f1p-28, -0x2.acc9a8p-4, 0xf.09e31p-12, 0x7.0b5ab8p-8 }, /* 7 */
> + { 0x1.9ebec4p+4, 0x1.9ec47p+4, 0x1.9ed016p+4, 0x1.e53838p-24, 0x2.81f2p-4, -0xc.5ff51p-12, -0x7.05ep-8 }, /* 8 */
> + { 0x1.d1008ep+4, 0x1.d10644p+4, 0x1.d11262p+4, 0x1.636feep-24, -0x2.5e40dcp-4, 0xa.6f81dp-12, 0x5.ff6p-8 }, /* 9 */
> + { 0x1.01a27cp+5, 0x1.01a442p+5, 0x1.01a924p+5, -0x4.04e1bp-28, 0x2.3febd8p-4, -0x8.f11e2p-12, -0x6.0111ap-8 }, /* 10 */
> + { 0x1.1ac3bcp+5, 0x1.1ac588p+5, 0x1.1ac912p+5, 0x3.6063d8p-24, -0x2.25baacp-4, 0x7.c93cdp-12, 0x4.e7577p-8 }, /* 11 */
> + { 0x1.33e508p+5, 0x1.33e6ecp+5, 0x1.33ea1ap+5, -0x3.f39ebcp-24, 0x2.0ed04cp-4, -0x6.d9434p-12, -0x4.fc0b7p-8 }, /* 12 */
> + { 0x1.4d0666p+5, 0x1.4d0868p+5, 0x1.4d0c14p+5, -0x4.ea23p-28, -0x1.fa8b4p-4, 0x6.1470e8p-12, 0x5.97f71p-8 }, /* 13 */
> + { 0x1.6628b8p+5, 0x1.6629f4p+5, 0x1.662e0ep+5, -0x3.5df0c8p-24, 0x1.e8727ep-4, -0x5.76a038p-12, -0x4.ee37c8p-8 }, /* 14 */
> + { 0x1.7f4a7cp+5, 0x1.7f4b9p+5, 0x1.7f4daap+5, 0x1.1ef09ep-24, -0x1.d8293ap-4, 0x4.ed8a28p-12, 0x4.d43708p-8 }, /* 15 */
> + { 0x1.986c5cp+5, 0x1.986d38p+5, 0x1.986f6p+5, 0x1.b70cecp-24, 0x1.c967p-4, -0x4.7a70b8p-12, -0x5.6840e8p-8 }, /* 16 */
> + { 0x1.b18dcap+5, 0x1.b18ee8p+5, 0x1.b19122p+5, 0x1.abaadcp-24, -0x1.bbf246p-4, 0x4.1a35bp-12, 0x3.c2d46p-8 }, /* 17 */
> + { 0x1.caaf86p+5, 0x1.cab0a2p+5, 0x1.cab2fep+5, 0x1.63989ep-24, 0x1.af9cb4p-4, -0x3.c2f2dcp-12, -0x4.cf8108p-8 }, /* 18 */
> + { 0x1.e3d146p+5, 0x1.e3d262p+5, 0x1.e3d492p+5, -0x1.68a8ecp-24, -0x1.a4407ep-4, 0x3.7733ecp-12, 0x5.97916p-8 }, /* 19 */
> + { 0x1.fcf316p+5, 0x1.fcf428p+5, 0x1.fcf59ap+5, 0x1.e1bb5p-24, 0x1.99be74p-4, -0x3.37210cp-12, -0x5.d19bf8p-8 }, /* 20 */
> + { 0x1.0b0a7cp+6, 0x1.0b0afap+6, 0x1.0b0b9cp+6, -0x5.5bbcfp-24, -0x1.8ffc9ap-4, 0x2.ffe638p-12, 0x2.ed28e8p-8 }, /* 21 */
> + { 0x1.179b66p+6, 0x1.179bep+6, 0x1.179d0ap+6, -0x4.9e34a8p-24, 0x1.86e51cp-4, -0x2.cc7a68p-12, -0x3.3642c4p-8 }, /* 22 */
> + { 0x1.242c5cp+6, 0x1.242ccap+6, 0x1.242d68p+6, 0x1.966706p-24, -0x1.7e657p-4, 0x2.9aed4cp-12, 0x7.b87a58p-8 }, /* 23 */
> + { 0x1.30bd62p+6, 0x1.30bdb6p+6, 0x1.30beb2p+6, 0x3.4b3b68p-24, 0x1.766dc2p-4, -0x2.72651cp-12, -0x3.e347f8p-8 }, /* 24 */
> + { 0x1.3d4e56p+6, 0x1.3d4ea2p+6, 0x1.3d4f2ep+6, 0x6.a99008p-28, -0x1.6ef07ep-4, 0x2.53aec4p-12, 0x2.9e3d88p-12 }, /* 25 */
> + { 0x1.49df38p+6, 0x1.49df9p+6, 0x1.49e042p+6, -0x7.a9fa6p-32, 0x1.67e1dap-4, -0x2.324d7p-12, -0xc.0e669p-12 }, /* 26 */
> + { 0x1.56702ep+6, 0x1.56708p+6, 0x1.567116p+6, -0x5.026808p-24, -0x1.613798p-4, 0x2.114594p-12, 0x1.a22402p-8 }, /* 27 */
> + { 0x1.630126p+6, 0x1.63017p+6, 0x1.630226p+6, 0x4.46aa2p-24, 0x1.5ae8c2p-4, -0x1.f4aaa4p-12, -0x3.58593cp-8 }, /* 28 */
> + { 0x1.6f9234p+6, 0x1.6f926p+6, 0x1.6f92b2p+6, 0x1.5cfccp-24, -0x1.54ed76p-4, 0x1.dd540ap-12, -0xb.e9429p-12 }, /* 29 */
> + { 0x1.7c22fep+6, 0x1.7c2352p+6, 0x1.7c23c2p+6, -0xb.4dc4cp-28, 0x1.4f3ebcp-4, -0x1.c463fp-12, -0x1.e94c54p-8 }, /* 30 */
> + { 0x1.88b412p+6, 0x1.88b444p+6, 0x1.88b50ap+6, 0x3.f5343p-24, -0x1.49d668p-4, 0x1.a53f24p-12, 0x5.128198p-8 }, /* 31 */
> + { 0x1.9544dcp+6, 0x1.954538p+6, 0x1.95459p+6, -0x6.e6f32p-28, 0x1.44aefap-4, -0x1.9a6ef8p-12, -0x6.c639cp-8 }, /* 32 */
> + { 0x1.a1d5fap+6, 0x1.a1d62cp+6, 0x1.a1d674p+6, 0x1.f359c2p-28, -0x1.3fc386p-4, 0x1.887ebep-12, 0x1.6c813cp-8 }, /* 33 */
> + { 0x1.ae66cp+6, 0x1.ae672p+6, 0x1.ae6788p+6, -0x2.9de748p-24, 0x1.3b0fa4p-4, -0x1.777f26p-12, 0x1.c128ccp-8 }, /* 34 */
> + { 0x1.baf7c2p+6, 0x1.baf816p+6, 0x1.baf86cp+6, -0x2.24ccc8p-24, -0x1.368f5cp-4, 0x1.62bd9ep-12, 0xa.df002p-8 }, /* 35 */
> + { 0x1.c788dap+6, 0x1.c7890cp+6, 0x1.c7896cp+6, 0x4.7dcea8p-24, 0x1.323f16p-4, -0x1.61abf4p-12, 0xa.ad73ep-8 }, /* 36 */
> + { 0x1.d419ccp+6, 0x1.d41a02p+6, 0x1.d41a68p+6, -0x4.b538p-24, -0x1.2e1b98p-4, 0x1.4a4d64p-12, 0x3.a47674p-8 }, /* 37 */
> + { 0x1.e0aacep+6, 0x1.e0aaf8p+6, 0x1.e0ab5ep+6, 0x3.09dc4cp-24, 0x1.2a21ecp-4, -0x1.3fa20cp-12, 0x2.216e8cp-8 }, /* 38 */
> + { 0x1.ed3bb8p+6, 0x1.ed3beep+6, 0x1.ed3c56p+6, 0x4.d5a58p-28, -0x1.264f66p-4, 0x1.32c4cep-12, 0x1.53cbb4p-8 }, /* 39 */
> + { 0x1.f9ccaep+6, 0x1.f9cce6p+6, 0x1.f9cd52p+6, 0x3.f4c44cp-24, 0x1.22a192p-4, -0x1.1f8514p-12, -0xc.0de32p-8 }, /* 40 */
> + { 0x1.032ed6p+7, 0x1.032eeep+7, 0x1.032f0cp+7, 0x2.4beae8p-24, -0x1.1f1634p-4, 0x1.171664p-12, 0x1.72a654p-4 }, /* 41 */
> + { 0x1.097756p+7, 0x1.09776ap+7, 0x1.09779cp+7, -0xd.a581ep-28, 0x1.1bab3cp-4, -0xf.9f507p-16, -0xc.ba2d4p-8 }, /* 42 */
> + { 0x1.0fbfdp+7, 0x1.0fbfe6p+7, 0x1.0fbff6p+7, 0xa.7c0bdp-28, -0x1.185eccp-4, 0x1.01d7dep-12, -0x1.a2186ep-4 }, /* 43 */
> + { 0x1.160856p+7, 0x1.160862p+7, 0x1.16087ap+7, -0x1.9452ecp-24, 0x1.152f26p-4, -0x1.07b4aap-12, 0x1.6bbd7ep-4 }, /* 44 */
> + { 0x1.1c50dp+7, 0x1.1c50dep+7, 0x1.1c5118p+7, 0x3.83b7b8p-24, -0x1.121ab2p-4, 0x1.0e938cp-12, -0x5.1a6dfp-8 }, /* 45 */
> + { 0x1.22995p+7, 0x1.22995ap+7, 0x1.229976p+7, -0x6.5ca3a8p-24, 0x1.0f1ff2p-4, -0xe.f198p-16, -0x3.8e98b8p-8 }, /* 46 */
> + { 0x1.28e1ccp+7, 0x1.28e1d8p+7, 0x1.28e1f4p+7, -0x6.bb61ap-24, -0x1.0c3d8ap-4, 0xf.a14b9p-16, 0x9.81e82p-4 }, /* 47 */
> + { 0x1.2f2a48p+7, 0x1.2f2a54p+7, 0x1.2f2a74p+7, 0x2.2438p-24, 0x1.097236p-4, -0xd.fed5ep-16, -0x3.19eb5cp-8 }, /* 48 */
> + { 0x1.3572b8p+7, 0x1.3572dp+7, 0x1.3572ecp+7, 0x3.1e0054p-24, -0x1.06bcc8p-4, 0xd.d2596p-16, -0x1.67e00ap-4 }, /* 49 */
> + { 0x1.3bbb3ep+7, 0x1.3bbb4ep+7, 0x1.3bbb6ap+7, 0x7.46c908p-24, 0x1.041c28p-4, -0xd.04045p-16, -0x8.fb297p-8 }, /* 50 */
> + { 0x1.4203aep+7, 0x1.4203cap+7, 0x1.4203e6p+7, -0xb.4f158p-28, -0x1.018f52p-4, 0xc.ccf6fp-16, 0x1.4d5dp-4 }, /* 51 */
> + { 0x1.484c38p+7, 0x1.484c46p+7, 0x1.484c56p+7, -0x6.5a89c8p-24, 0xf.f155p-8, -0xc.5d21dp-16, -0xd.aca34p-8 }, /* 52 */
> + { 0x1.4e94b8p+7, 0x1.4e94c4p+7, 0x1.4e94d4p+7, -0x1.ef16c8p-24, -0xf.cad3fp-8, 0xb.d75f8p-16, 0x1.f36732p-4 }, /* 53 */
> + { 0x1.54dd36p+7, 0x1.54dd4p+7, 0x1.54dd52p+7, -0x6.1e7b68p-24, 0xf.a564cp-8, -0xb.ec1cfp-16, 0xe.e7421p-8 }, /* 54 */
> + { 0x1.5b25aep+7, 0x1.5b25bep+7, 0x1.5b25d4p+7, -0xf.8c858p-28, -0xf.80faep-8, 0xb.8b6c5p-16, -0x5.835ed8p-8 }, /* 55 */
> + { 0x1.616e34p+7, 0x1.616e3cp+7, 0x1.616e4ep+7, 0x7.75d858p-24, 0xf.5d8abp-8, -0xb.b3779p-16, 0x2.40b948p-4 }, /* 56 */
> + { 0x1.67b6bp+7, 0x1.67b6b8p+7, 0x1.67b6dp+7, 0x1.d78632p-24, -0xf.3b096p-8, 0xa.daf89p-16, 0x1.aa8548p-8 }, /* 57 */
> + { 0x1.6dff28p+7, 0x1.6dff36p+7, 0x1.6dff54p+7, 0x3.b24794p-24, 0xf.196c7p-8, -0xb.1afe1p-16, -0x1.77538cp-8 }, /* 58 */
> + { 0x1.7447a2p+7, 0x1.7447b2p+7, 0x1.7447cap+7, 0x6.39cbc8p-24, -0xe.f8aa5p-8, 0xa.50daap-16, 0x1.9592c2p-8 }, /* 59 */
> + { 0x1.7a902p+7, 0x1.7a903p+7, 0x1.7a903ep+7, -0x1.820e3ap-24, 0xe.d8b9dp-8, -0xa.998cp-16, -0x2.c35d78p-4 }, /* 60 */
> + { 0x1.80d89ep+7, 0x1.80d8aep+7, 0x1.80d8bep+7, -0x2.c7e038p-24, -0xe.b9925p-8, 0x9.ce06p-16, -0x2.2b3054p-4 }, /* 61 */
> + { 0x1.87211cp+7, 0x1.87212cp+7, 0x1.872144p+7, 0x6.ab31c8p-24, 0xe.9b2bep-8, -0x9.4de7p-16, -0x1.32cb5ep-4 }, /* 62 */
> + { 0x1.8d699ap+7, 0x1.8d69a8p+7, 0x1.8d69bp+7, 0x4.4ef25p-24, -0xe.7d7ecp-8, 0x9.a0f1ep-16, 0x1.6ac076p-4 }, /* 63 */
> +};
Same as before about fitting on 78 columns.
> +
> +/* Formula page 5 of https://www.cl.cam.ac.uk/~jrh13/papers/bessel.pdf:
> + y0(x) ~ sqrt(2/(pi*x))*beta0(x)*sin(x-pi/4-alpha0(x))
> + where beta0(x) = 1 - 1/(16*x^2) + 53/(512*x^4)
> + and alpha0(x) = 1/(8*x) - 25/(384*x^3). */
> +static float
> +y0f_asympt (float x)
> +{
> + /* The following code fails to give an error <= 9 ulps in only two cases,
> + for which we tabulate the correctly-rounded result. */
> + if (x == 0x1.bfad96p+7f)
> + return -0x7.f32bdp-32f;
> + if (x == 0x1.2e2a42p+17f)
> + return 0x1.a48974p-40f;
> + double y = 1.0 / (double) x;
> + double y2 = y * y;
> + double beta0 = 1.0f + y2 * (-0x1p-4f + 0x1.a8p-4 * y2);
> + double alpha0 = y * (0x2p-4 - 0x1.0aaaaap-4 * y2);
> + double h;
> + int n;
> + h = reduce_aux (x, &n, alpha0);
> + /* Now x - pi/4 - alpha0 = h + n*pi/2 mod (2*pi). */
> + float xr = (float) h;
> + n = n & 3;
> + float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */
> + float t = cst / sqrtf (x) * (float) beta0;
> + if (n == 0)
> + return t * sinf (xr);
> + else if (n == 2) /* sin(x+pi) = -sin(x) */
> + return -t * sinf (xr);
> + else if (n == 1) /* sin(x+pi/2) = cos(x) */
> + return t * cosf (xr);
> + else /* sin(x+3pi/2) = -cos(x) */
> + return -t * cosf (xr);
> +}
Same as before regarding using internal symbols to avoid PLT calls.
> +
> +/* Special code for x near a root of y0.
> + z is the value computed by the generic code.
> + For small x, use a polynomial approximating y0 around its root.
> + For large x, use an asymptotic formula (y0f_asympt). */
> +static float
> +y0f_near_root (float x, float z)
> +{
> + float index_f;
> + int index;
> +
> + index_f = roundf ((x - FIRST_ZERO_Y0) / (float) M_PI);
> + if (index_f >= SMALL_SIZE)
> + return y0f_asympt (x);
> + index = (int) index_f;
> + const float *p = Py[index];
> + float x0 = p[0];
> + float x1 = p[2];
> + /* If not in the interval [x0,x1] around xmid, return the value z. */
> + if (! (x0 <= x && x <= x1))
> + return z;
> + float xmid = p[1];
> + float y = x - xmid;
> + /* For degree 0 use a degree-5 polynomial, where the coefficients of
> + degree 4 and 5 are hard-coded. */
> + float p6 = (index > 0) ? p[6] : p[6] + y * (-0x3.a46c9p-4 + y * 0x3.735478p-4);
> + float res = p[3] + y * (p[4] + y * (p[5] + y * p6));
> + return res;
> +}
> +
> float
> __ieee754_y0f(float x)
> {
> @@ -124,7 +452,8 @@ __ieee754_y0f(float x)
> if(ix>=0x7f800000) return one/(x+x*x);
> if(ix==0) return -1/zero; /* -inf and divide by zero exception. */
> if(hx<0) return zero/(zero*x);
> - if(ix >= 0x40000000) { /* |x| >= 2.0 */
> + if(ix >= 0x40000000 || (0x3f5340ed <= ix && ix <= 0x3f77b5e5)) {
> + /* |x| >= 2.0 or 0x1.a681dap-1 <= |x| <= 0x1.ef6bcap-1 (around 1st zero) */
> /* y0(x) = sqrt(2/(pi*x))*(p0(x)*sin(x0)+q0(x)*cos(x0))
> * where x0 = x-pi/4
> * Better formula:
> @@ -136,6 +465,7 @@ __ieee754_y0f(float x)
> * sin(x) +- cos(x) = -cos(2x)/(sin(x) -+ cos(x))
> * to compute the worse one.
> */
> + SET_RESTORE_ROUNDF (FE_TONEAREST);
> __sincosf (x, &s, &c);
> ss = s-c;
> cc = s+c;
> @@ -143,17 +473,25 @@ __ieee754_y0f(float x)
> * j0(x) = 1/sqrt(pi) * (P(0,x)*cc - Q(0,x)*ss) / sqrt(x)
> * y0(x) = 1/sqrt(pi) * (P(0,x)*ss + Q(0,x)*cc) / sqrt(x)
> */
> - if(ix<0x7f000000) { /* make sure x+x not overflow */
> - z = -__cosf(x+x);
> - if ((s*c)<zero) cc = z/ss;
> - else ss = z/cc;
> - }
> - if(ix>0x5c000000) z = (invsqrtpi*ss)/sqrtf(x);
> - else {
> - u = pzerof(x); v = qzerof(x);
> - z = invsqrtpi*(u*ss+v*cc)/sqrtf(x);
> - }
> - return z;
> + if (ix >= 0x7f000000) /* x >= 2^127: use asymptotic expansion. */
> + return y0f_asympt (x);
> + /* Now we are sure that x+x cannot overflow. */
> + z = -__cosf(x+x);
> + if ((s*c)<zero) cc = z/ss;
> + else ss = z/cc;
> + if (ix <= 0x5c000000)
> + {
> + u = pzerof(x); v = qzerof(x);
> + ss = u*ss+v*cc;
> + }
> + z = (invsqrtpi*ss)/sqrtf(x);
> + /* The following threshold is optimal (determined on
> + aarch64-linux-gnu). */
> + float threshold = 0x1.be585ap-4;
> + if (fabsf (ss) > threshold)
> + return z;
> + else
> + return y0f_near_root (x, z);
> }
> if(ix<=0x39800000) { /* x < 2**-13 */
> return(u00 + tpi*__ieee754_logf(x));
> @@ -165,7 +503,7 @@ __ieee754_y0f(float x)
> }
> libm_alias_finite (__ieee754_y0f, __y0f)
>
> -/* The asymptotic expansions of pzero is
> +/* The asymptotic expansion of pzero is
> * 1 - 9/128 s^2 + 11025/98304 s^4 - ..., where s = 1/x.
> * For x >= 2, We approximate pzero by
> * pzero(x) = 1 + (R/S)
> @@ -257,7 +595,7 @@ pzerof(float x)
> }
>
>
> -/* For x >= 8, the asymptotic expansions of qzero is
> +/* For x >= 8, the asymptotic expansion of qzero is
> * -1/8 s + 75/1024 s^3 - ..., where s = 1/x.
> * We approximate pzero by
> * qzero(x) = s*(-1.25 + (R/S))
>
thank you Adhemerval for your review. I have taken into account your comments
in my local copy, and will wait for your review of parts 2 and 3 to submit
a new version.
Paul
On 05/03/2021 11:51, Paul Zimmermann wrote:
> thank you Adhemerval for your review. I have taken into account your comments
> in my local copy, and will wait for your review of parts 2 and 3 to submit
> a new version.
>
> Paul
>
I will finish the review of 2/4 and 3/4 as well.
@@ -16,7 +16,9 @@
#include <math.h>
#include <math-barriers.h>
#include <math_private.h>
+#include <fenv_private.h>
#include <libm-alias-finite.h>
+#include <s_sincosf.h>
static float pzerof(float), qzerof(float);
@@ -37,6 +39,190 @@ S04 = 1.1661400734e-09; /* 0x30a045e8 */
static const float zero = 0.0;
+#define FIRST_ZERO_J0 2.404825f
+
+#define SMALL_SIZE 64
+
+/* The following table contains successive zeros of j0 and degree-3 polynomial
+ approximations of j0 around these zeros: Pj[0] for the first zero (2.40482),
+ Pj[1] for the second one (5.520078), and so on. Each line contains:
+ {x0, xmid, x1, p0, p1, p2, p3}
+ where [x0,x1] is the interval around the zero, xmid is the binary32 number
+ closest to the zero, and p0+p1*y+p2*y^2+p3*y^3 with y = x-xmid is the
+ approximation polynomial. Each polynomial was generated using Sollya on the
+ interval [x0,x1] around the corresponding zero, i.e., where the generic code
+ has an error larger than 9 ulps. Degree 3 is enough to get an error <= 9 ulps.
+*/
+static const float Pj[SMALL_SIZE][7] = {
+ /* The following polynomial was optimized by hand with respect to the one
+ generated by Sollya, to ensure the maximal error is at most 9 ulps,
+ both if the polynomial is evaluated with fma or not. */
+ { 0x1.31e5c4p+1, 0x1.33d152p+1, 0x1.3b58dep+1, 0xf.2623fp-28, -0x8.4e6d7p-4, 0x1.ba2aaap-4, 0xe.4b9ap-8 }, /* 0 */
+ { 0x1.60eafap+2, 0x1.6148f6p+2, 0x1.62955cp+2, 0x6.9205fp-28, 0x5.71b98p-4, -0x7.e3e798p-8, -0xd.87d1p-8 }, /* 1 */
+ { 0x1.14cde2p+3, 0x1.14eb56p+3, 0x1.1525c6p+3, 0x1.bcc1cap-24, -0x4.57de6p-4, 0x4.03e7cp-8, 0xb.39a37p-8 }, /* 2 */
+ { 0x1.7931d8p+3, 0x1.79544p+3, 0x1.7998d6p+3, -0xf.2976fp-32, 0x3.b827ccp-4, -0x2.8603ep-8, -0x9.bf49bp-8 }, /* 3 */
+ { 0x1.ddb6d4p+3, 0x1.ddca14p+3, 0x1.ddf0c8p+3, -0x1.bd67d8p-28, -0x3.4e03ap-4, 0x1.c562a2p-8, 0x8.90ec2p-8 }, /* 4 */
+ { 0x1.2118e4p+4, 0x1.212314p+4, 0x1.21375p+4, 0x1.62209cp-28, 0x3.00efecp-4, -0x1.5458dap-8, -0x8.10063p-8 }, /* 5 */
+ { 0x1.535d28p+4, 0x1.5362dep+4, 0x1.536e48p+4, -0x2.853f74p-24, -0x2.c5b274p-4, 0x1.0b9db4p-8, 0x7.8c3578p-8 }, /* 6 */
+ { 0x1.859ddp+4, 0x1.85a3bap+4, 0x1.85aff4p+4, 0x2.19ed1cp-24, 0x2.96545cp-4, -0xd.997e6p-12, -0x6.d9af28p-8 }, /* 7 */
+ { 0x1.b7decap+4, 0x1.b7e54ap+4, 0x1.b7f038p+4, 0xe.959aep-28, -0x2.6f5594p-4, 0xb.538dp-12, 0x7.003ea8p-8 }, /* 8 */
+ { 0x1.ea21c6p+4, 0x1.ea275ap+4, 0x1.ea337ap+4, 0x2.0c3964p-24, 0x2.4e80fcp-4, -0x9.a2216p-12, -0x6.61e0a8p-8 }, /* 9 */
+ { 0x1.0e3316p+5, 0x1.0e34e2p+5, 0x1.0e379ap+5, -0x3.642554p-24, -0x2.325e48p-4, 0x8.4f49cp-12, 0x7.d37c3p-8 }, /* 10 */
+ { 0x1.275456p+5, 0x1.275638p+5, 0x1.2759e2p+5, 0x1.6c015ap-24, 0x2.19e7d8p-4, -0x7.4c1bf8p-12, -0x4.af7ef8p-8 }, /* 11 */
+ { 0x1.4075ecp+5, 0x1.4077a8p+5, 0x1.407b96p+5, -0x4.b18c9p-28, -0x2.046174p-4, 0x6.705618p-12, 0x5.f2d28p-8 }, /* 12 */
+ { 0x1.59973p+5, 0x1.59992cp+5, 0x1.599b2ap+5, -0x1.8b8792p-24, 0x1.f13fbp-4, -0x5.c14938p-12, -0x5.73e0cp-8 }, /* 13 */
+ { 0x1.72b958p+5, 0x1.72bacp+5, 0x1.72bc5ap+5, 0x3.a26e0cp-24, -0x1.e018dap-4, 0x5.30e8dp-12, 0x2.81099p-8 }, /* 14 */
+ { 0x1.8bdb4ap+5, 0x1.8bdc62p+5, 0x1.8bde7ep+5, -0x2.18fabcp-24, 0x1.d09b22p-4, -0x4.b0b688p-12, -0x5.5fd308p-8 }, /* 15 */
+ { 0x1.a4fcecp+5, 0x1.a4fe0ep+5, 0x1.a50042p+5, 0x3.2370e8p-24, -0x1.c28614p-4, 0x4.4647e8p-12, 0x5.68a28p-8 }, /* 16 */
+ { 0x1.be1ebcp+5, 0x1.be1fc4p+5, 0x1.be21fp+5, -0x5.9eae3p-28, 0x1.b5a622p-4, -0x3.eb9054p-12, -0x5.12d8cp-8 }, /* 17 */
+ { 0x1.d7405p+5, 0x1.d7418p+5, 0x1.d74294p+5, 0x2.9fa1e8p-24, -0x1.a9d184p-4, 0x3.9d1e7p-12, 0x4.33d058p-8 }, /* 18 */
+ { 0x1.f0624p+5, 0x1.f06344p+5, 0x1.f0645ep+5, 0x9.9ac67p-28, 0x1.9ee5eep-4, -0x3.5816e8p-12, -0x2.6e5004p-8 }, /* 19 */
+ { 0x1.04c22ep+6, 0x1.04c286p+6, 0x1.04c316p+6, 0xd.6ab94p-28, -0x1.94c6f6p-4, 0x3.174efcp-12, 0x7.9a092p-8 }, /* 20 */
+ { 0x1.1153p+6, 0x1.11536cp+6, 0x1.11541p+6, -0x4.4cb2d8p-24, 0x1.8b5cccp-4, -0x2.e3c238p-12, -0x4.e5437p-8 }, /* 21 */
+ { 0x1.1de3d8p+6, 0x1.1de456p+6, 0x1.1de4dap+6, -0x4.4aa8c8p-24, -0x1.829356p-4, 0x2.b45124p-12, 0x5.baf638p-8 }, /* 22 */
+ { 0x1.2a74f8p+6, 0x1.2a754p+6, 0x1.2a75bp+6, 0x2.077c38p-24, 0x1.7a597ep-4, -0x2.8a0414p-12, -0x2.838d3p-8 }, /* 23 */
+ { 0x1.3705d4p+6, 0x1.37062cp+6, 0x1.3706b2p+6, -0x2.6a6cd8p-24, -0x1.72a09ap-4, 0x2.623a3cp-12, 0x5.5256a8p-8 }, /* 24 */
+ { 0x1.4396dp+6, 0x1.439718p+6, 0x1.43976ep+6, -0x5.08287p-24, 0x1.6b5c06p-4, -0x2.3da154p-12, -0x7.a2254p-8 }, /* 25 */
+ { 0x1.5027acp+6, 0x1.502808p+6, 0x1.50288cp+6, -0x3.4598dcp-24, -0x1.6480c4p-4, 0x2.1cb944p-12, 0x7.27c77p-8 }, /* 26 */
+ { 0x1.5cb89ap+6, 0x1.5cb8f8p+6, 0x1.5cb97ep+6, 0x5.4e74bp-24, 0x1.5e0544p-4, -0x2.00b158p-12, -0x5.9bc4a8p-8 }, /* 27 */
+ { 0x1.69498cp+6, 0x1.6949e8p+6, 0x1.694a42p+6, -0x2.05751cp-24, -0x1.57e12p-4, 0x1.e78edcp-12, 0x9.9667dp-8 }, /* 28 */
+ { 0x1.75da7ep+6, 0x1.75dadap+6, 0x1.75db3p+6, 0x4.c5e278p-24, 0x1.520ceep-4, -0x1.d0127cp-12, -0xd.62681p-8 }, /* 29 */
+ { 0x1.826b7ep+6, 0x1.826bccp+6, 0x1.826c2cp+6, -0x3.50e62cp-24, -0x1.4c822p-4, 0x1.ba5832p-12, -0x1.eb2ee2p-8 }, /* 30 */
+ { 0x1.8efc84p+6, 0x1.8efcbep+6, 0x1.8efd16p+6, -0x1.c39f38p-24, 0x1.473ae6p-4, -0x1.a616c8p-12, 0xf.f352ap-12 }, /* 31 */
+ { 0x1.9b8d84p+6, 0x1.9b8db2p+6, 0x1.9b8e7p+6, -0x1.9245b6p-28, -0x1.42320ap-4, 0x1.932a04p-12, 0x2.dc113cp-8 }, /* 32 */
+ { 0x1.a81e72p+6, 0x1.a81ea6p+6, 0x1.a81f04p+6, -0x1.0acf8p-24, 0x1.3d62e6p-4, -0x1.7c4b14p-12, -0x1.cfc5c2p-4 }, /* 33 */
+ { 0x1.b4af6ap+6, 0x1.b4af9ap+6, 0x1.b4afeep+6, 0x4.cd92d8p-24, -0x1.38c94ap-4, 0x1.643154p-12, 0x1.4c2a06p-4 }, /* 34 */
+ { 0x1.c1406p+6, 0x1.c1409p+6, 0x1.c140cp+6, -0x1.37bf8ap-24, 0x1.34617p-4, -0x1.5f504ap-12, -0x1.e2d324p-4 }, /* 35 */
+ { 0x1.cdd154p+6, 0x1.cdd186p+6, 0x1.cdd1eap+6, -0x1.8f62dep-28, -0x1.3027fp-4, 0x1.534a02p-12, 0x2.c7f144p-12 }, /* 36 */
+ { 0x1.da6248p+6, 0x1.da627cp+6, 0x1.da62e6p+6, -0x9.81e79p-28, 0x1.2c19b4p-4, -0x1.4b8288p-12, 0x7.2d8bap-8 }, /* 37 */
+ { 0x1.e6f33ep+6, 0x1.e6f372p+6, 0x1.e6f3a8p+6, 0x3.103b3p-24, -0x1.2833eep-4, 0x1.36f4d2p-12, 0x9.29f91p-8 }, /* 38 */
+ { 0x1.f38434p+6, 0x1.f3846ap+6, 0x1.f384d8p+6, 0x2.07b058p-24, 0x1.24740ap-4, -0x1.2ee58ap-12, 0xd.f1393p-12 }, /* 39 */
+ { 0x1.000a98p+7, 0x1.000abp+7, 0x1.000ac8p+7, 0x3.87576cp-24, -0x1.20d7b6p-4, 0x1.2083e2p-12, 0x3.9a7aap-8 }, /* 40 */
+ { 0x1.06531p+7, 0x1.06532cp+7, 0x1.065348p+7, -0x1.691ecp-24, 0x1.1d5ccap-4, -0x1.166726p-12, -0x1.e4af48p-8 }, /* 41 */
+ { 0x1.0c9b9ap+7, 0x1.0c9ba8p+7, 0x1.0c9bbep+7, 0x9.b406dp-28, -0x1.1a015p-4, 0x1.038f9cp-12, -0x4.021058p-4 }, /* 42 */
+ { 0x1.12e412p+7, 0x1.12e424p+7, 0x1.12e436p+7, -0xf.bfd8fp-28, 0x1.16c37ap-4, -0x1.039edep-12, 0x1.f0033p-4 }, /* 43 */
+ { 0x1.192c92p+7, 0x1.192cap+7, 0x1.192cb6p+7, 0x2.6d50c8p-24, -0x1.13a19ep-4, 0xf.9df8ap-16, 0x4.ecd978p-8 }, /* 44 */
+ { 0x1.1f7512p+7, 0x1.1f751cp+7, 0x1.1f753ap+7, -0x4.d475c8p-24, 0x1.109a32p-4, -0x1.04fb3ap-12, -0xd.c271p-12 }, /* 45 */
+ { 0x1.25bd8ep+7, 0x1.25bd98p+7, 0x1.25bdap+7, 0x8.1982p-24, -0x1.0dabc8p-4, 0xe.88eabp-16, -0x4.ed75dp-4 }, /* 46 */
+ { 0x1.2c060cp+7, 0x1.2c0616p+7, 0x1.2c0644p+7, 0x4.864518p-24, 0x1.0ad51p-4, -0xe.27196p-16, 0xb.97a3ep-8 }, /* 47 */
+ { 0x1.324e86p+7, 0x1.324e92p+7, 0x1.324e9ep+7, 0x6.8917a8p-28, -0x1.0814d4p-4, 0xd.4fe7ep-16, -0x6.8d8d6p-4 }, /* 48 */
+ { 0x1.389702p+7, 0x1.38970ep+7, 0x1.389728p+7, -0x5.fa18fp-24, 0x1.0569fp-4, -0xd.5b0d4p-16, 0x1.50353ap-4 }, /* 49 */
+ { 0x1.3edf84p+7, 0x1.3edf8cp+7, 0x1.3edfaap+7, -0x4.0e5c98p-24, -0x1.02d354p-4, 0xb.7b255p-16, 0x7.8a916p-4 }, /* 50 */
+ { 0x1.4527fp+7, 0x1.452808p+7, 0x1.452812p+7, -0x2.c3ddbp-24, 0x1.005004p-4, -0xd.7729cp-16, -0x3.bcc354p-8 }, /* 51 */
+ { 0x1.4b7076p+7, 0x1.4b7086p+7, 0x1.4b70a4p+7, -0x5.d052p-24, -0xf.ddf16p-8, 0xc.318c1p-16, 0x5.7947p-8 }, /* 52 */
+ { 0x1.51b8f4p+7, 0x1.51b902p+7, 0x1.51b90ep+7, -0x2.0b97dcp-24, 0xf.b7fafp-8, -0xc.1429dp-16, -0x3.43c36p-4 }, /* 53 */
+ { 0x1.580168p+7, 0x1.58018p+7, 0x1.580188p+7, -0x5.4aab5p-24, -0xf.930fep-8, 0xa.ecc24p-16, 0x9.c62cdp-12 }, /* 54 */
+ { 0x1.5e49eap+7, 0x1.5e49fcp+7, 0x1.5e4a12p+7, -0x3.6dadd8p-24, 0xf.6f245p-8, -0xb.6816cp-16, 0xa.d731ap-8 }, /* 55 */
+ { 0x1.649272p+7, 0x1.64927ap+7, 0x1.64929p+7, -0x2.d7e038p-24, -0xf.4c2cep-8, 0xb.118bep-16, 0xb.69a4ep-8 }, /* 56 */
+ { 0x1.6adae6p+7, 0x1.6adaf6p+7, 0x1.6adb04p+7, -0x6.977a1p-24, 0xf.2a1fp-8, -0xa.a8911p-16, -0x4.bf6d2p-8 }, /* 57 */
+ { 0x1.712366p+7, 0x1.712374p+7, 0x1.71238ep+7, 0x1.3cc95ep-24, -0xf.08f0ap-8, 0x9.f0858p-16, 0x1.77f7f4p-4 }, /* 58 */
+ { 0x1.776beap+7, 0x1.776bf2p+7, 0x1.776bfap+7, 0x3.a4921p-24, 0xe.e8986p-8, -0xa.39dfp-16, -0x6.7ba3dp-4 }, /* 59 */
+ { 0x1.7db464p+7, 0x1.7db46ep+7, 0x1.7db476p+7, 0x6.b45a7p-24, -0xe.c90d8p-8, 0xa.e586fp-16, -0x1.d66becp-4 }, /* 60 */
+ { 0x1.83fce2p+7, 0x1.83fcecp+7, 0x1.83fd0ep+7, -0x2.8f34a4p-24, 0xe.aa478p-8, -0x9.810bp-16, -0x3.a5f3fcp-8 }, /* 61 */
+ { 0x1.8a455cp+7, 0x1.8a456ap+7, 0x1.8a4588p+7, -0x1.325968p-24, -0xe.8c3eap-8, 0x9.0a765p-16, 0x1.29a54ap-4 }, /* 62 */
+ { 0x1.908dd8p+7, 0x1.908de8p+7, 0x1.908df4p+7, 0x4.96b808p-24, 0xe.6eeb5p-8, -0x9.0251bp-16, 0x1.41a488p-4 }, /* 63 */
+};
+
+/* Return h and update *n such that
+ x - pi/4 - alpha0 = h + (*n)*pi/2 mod (2*pi). */
+static double
+reduce_aux (float x, int *n, double alpha0)
+{
+ double h;
+ h = reduce_large (asuint (x), n);
+ /* Now |x| = h+n*pi/2 mod 2*pi. */
+ /* Recover sign: negate both h and *n when x is negative. */
+ if (x < 0)
+ {
+ h = -h;
+ *n = -*n;
+ }
+ /* Subtract pi/4. */
+ double piover2 = 0xc.90fdaa22168cp-3; /* pi/2 as a double */
+ if (h >= 0)
+ h -= piover2 / 2;
+ else
+ {
+ h += piover2 / 2; /* h - pi/4 = (h + pi/4) - pi/2, hence the decrement of *n below */
+ (*n) --;
+ }
+ /* Subtract alpha0 and reduce if needed mod pi/2. */
+ h -= alpha0;
+ if (h > piover2)
+ {
+ h -= piover2;
+ (*n) ++;
+ }
+ else if (h < -piover2)
+ {
+ h += piover2;
+ (*n) --;
+ }
+ return h;
+}
+
+/* Evaluate j0f(x) for large x with the formula page 5 of https://www.cl.cam.ac.uk/~jrh13/papers/bessel.pdf:
+ j0f(x) ~ sqrt(2/(pi*x))*beta0(x)*cos(x-pi/4-alpha0(x))
+ where beta0(x) = 1 - 1/(16*x^2) + 53/(512*x^4)
+ and alpha0(x) = 1/(8*x) - 25/(384*x^3). */
+static float
+j0f_asympt (float x)
+{
+ /* The following code fails to give an error <= 9 ulps for only two inputs,
+ for which we tabulate the result (returned directly below). */
+ if (x == 0x1.4665d2p+24f)
+ return 0xa.50206p-52f;
+ if (x == 0x1.a9afdep+7f)
+ return 0xf.47039p-28f;
+ double y = 1.0 / (double) x; /* y = 1/x */
+ double y2 = y * y;
+ double beta0 = 1.0f + y2 * (-0x1p-4f + 0x1.a8p-4 * y2); /* 1 - y^2/16 + (53/512)*y^4 */
+ double alpha0 = y * (0x2p-4 - 0x1.0aaaaap-4 * y2); /* y/8 - c*y^3 with c approximately 25/384 */
+ double h;
+ int n;
+ h = reduce_aux (x, &n, alpha0);
+ /* Now x - pi/4 - alpha0 = h + n*pi/2 mod (2*pi). */
+ float xr = (float) h;
+ n = n & 3; /* only n mod 4 matters since the identity above holds mod 2*pi */
+ float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */
+ float t = cst / sqrtf (x) * (float) beta0; /* sqrt(2/(pi*x))*beta0(x) */
+ if (n == 0) /* cos(x+0) = cos(x) */
+ return t * cosf (xr);
+ else if (n == 2) /* cos(x+pi) = -cos(x) */
+ return -t * cosf (xr);
+ else if (n == 1) /* cos(x+pi/2) = -sin(x) */
+ return -t * sinf (xr);
+ else /* n == 3: cos(x+3pi/2) = sin(x) */
+ return t * sinf (xr);
+}
+
+/* Special code for x near a root of j0.
+ z is the value computed by the generic code; it is returned unchanged
+ when x lies outside the tabulated interval around the root.
+ For small x, we use a polynomial approximating j0 around its root.
+ For large x, we use an asymptotic formula (j0f_asympt). */
+static float
+j0f_near_root (float x, float z)
+{
+ float index_f;
+ int index;
+
+ index_f = roundf ((x - FIRST_ZERO_J0) / (float) M_PI); /* estimated row of Pj for x; the tabulated zeros are about pi apart */
+ /* j0f_asympt fails to give an error <= 9 ulps for x=0x1.324e92p+7 (index 48)
+ thus we can't reduce SMALL_SIZE below 49. */
+ if (index_f >= SMALL_SIZE)
+ return j0f_asympt (x);
+ index = (int) index_f;
+ const float *p = Pj[index]; /* row layout: {x0, xmid, x1, p0, p1, p2, p3} */
+ float x0 = p[0];
+ float x1 = p[2];
+ /* If not in the interval [x0,x1] around xmid, we return the value z. */
+ if (! (x0 <= x && x <= x1))
+ return z;
+ float xmid = p[1];
+ float y = x - xmid;
+ return p[3] + y * (p[4] + y * (p[5] + y * p[6])); /* Horner form of p0+p1*y+p2*y^2+p3*y^3 */
+}
+
float
__ieee754_j0f(float x)
{
@@ -48,39 +234,34 @@ __ieee754_j0f(float x)
if(ix>=0x7f800000) return one/(x*x);
x = fabsf(x);
if(ix >= 0x40000000) { /* |x| >= 2.0 */
+ SET_RESTORE_ROUNDF (FE_TONEAREST);
__sincosf (x, &s, &c);
ss = s-c;
cc = s+c;
- if(ix<0x7f000000) { /* make sure x+x not overflow */
- z = -__cosf(x+x);
- if ((s*c)<zero) cc = z/ss;
- else ss = z/cc;
- } else {
- /* We subtract (exactly) a value x0 such that
- cos(x0)+sin(x0) is very near to 0, and use the identity
- sin(x-x0) = sin(x)*cos(x0)-cos(x)*sin(x0) to get
- sin(x) + cos(x) with extra accuracy. */
- float x0 = 0xe.d4108p+124f;
- float y = x - x0; /* exact */
- /* sin(y) = sin(x)*cos(x0)-cos(x)*sin(x0) */
- z = __sinf (y);
- float eps = 0x1.5f263ep-24f;
- /* cos(x0) ~ -sin(x0) + eps */
- z += eps * __cosf (x);
- /* now z ~ (sin(x)-cos(x))*cos(x0) */
- float cosx0 = -0xb.504f3p-4f;
- cc = z / cosx0;
- }
+ if (ix >= 0x7f000000) /* x >= 2^127: use asymptotic expansion. */
+ return j0f_asympt (x);
+ /* Now we are sure that x+x cannot overflow. */
+ z = -__cosf(x+x);
+ if ((s*c)<zero) cc = z/ss;
+ else ss = z/cc;
/*
* j0(x) = 1/sqrt(pi) * (P(0,x)*cc - Q(0,x)*ss) / sqrt(x)
* y0(x) = 1/sqrt(pi) * (P(0,x)*ss + Q(0,x)*cc) / sqrt(x)
*/
- if(ix>0x5c000000) z = (invsqrtpi*cc)/sqrtf(x);
- else {
- u = pzerof(x); v = qzerof(x);
- z = invsqrtpi*(u*cc-v*ss)/sqrtf(x);
- }
- return z;
+ if (ix <= 0x5c000000)
+ {
+ u = pzerof(x); v = qzerof(x);
+ cc = u*cc-v*ss;
+ }
+ z = (invsqrtpi * cc) / sqrtf(x);
+ /* The following threshold is optimal: for x=0x1.3b58dep+1
+ and rounding upwards, |cc|=0x1.579b26p-4 and z is 10 ulps
+ far from the correctly rounded value. */
+ float threshold = 0x1.579b26p-4;
+ if (fabsf (cc) > threshold)
+ return z;
+ else
+ return j0f_near_root (x, z);
}
if(ix<0x39000000) { /* |x| < 2**-13 */
math_force_eval(huge+x); /* raise inexact if x != 0 */
@@ -112,6 +293,153 @@ v02 = 7.6006865129e-05, /* 0x389f65e0 */
v03 = 2.5915085189e-07, /* 0x348b216c */
v04 = 4.4111031494e-10; /* 0x2ff280c2 */
+#define FIRST_ZERO_Y0 0.893576f
+
+#define SMALL_SIZE 64
+
+/* The following table contains successive zeros of y0 and degree-3 polynomial
+ approximations of y0 around these zeros: Py[0] for the first zero (0.89358),
+ Py[1] for the second one (3.957678), and so on. Each line contains:
+ {x0, xmid, x1, p0, p1, p2, p3}
+ where [x0,x1] is the interval around the zero, xmid is the binary32 number
+ closest to the zero, and p0+p1*y+p2*y^2+p3*y^3 with y = x-xmid is the
+ approximation polynomial. Each polynomial was generated using Sollya on the
+ interval [x0,x1] around the corresponding zero, i.e., where the generic code
+ has an error larger than 9 ulps. Degree 3 is enough, except for index 0 where
+ we use degree 5, and the coefficients of degree 4 and 5 are hard-coded in
+ y0f_near_root.
+*/
+static const float Py[SMALL_SIZE][7] = {
+ { 0x1.a681dap-1, 0x1.c982ecp-1, 0x1.ef6bcap-1, 0x3.274468p-28, 0xe.121b8p-4, -0x7.df8b3p-4, 0x3.877be4p-4/*, -0x3.a46c9p-4, 0x3.735478p-4*/ }, /* 0 */
+ { 0x1.f957c6p+1, 0x1.fa9534p+1, 0x1.fd11b2p+1, 0xa.f1f83p-28, -0x6.70d098p-4, 0xd.04d48p-8, 0xe.f61a9p-8 }, /* 1 */
+ { 0x1.c51832p+2, 0x1.c581dcp+2, 0x1.c65164p+2, -0x5.e2a51p-28, 0x4.cd3328p-4, -0x5.6bbe08p-8, -0xc.46d8p-8 }, /* 2 */
+ { 0x1.46fd84p+3, 0x1.471d74p+3, 0x1.475bfcp+3, -0x1.4b0aeep-24, -0x3.fec6b8p-4, 0x3.2068a4p-8, 0xa.76e2dp-8 }, /* 3 */
+ { 0x1.ab7afap+3, 0x1.ab8e1cp+3, 0x1.abb294p+3, -0x8.179d7p-28, 0x3.7e6544p-4, -0x2.1799fp-8, -0x9.0e1c4p-8 }, /* 4 */
+ { 0x1.07f9aap+4, 0x1.0803c8p+4, 0x1.08170cp+4, -0x2.5b8078p-24, -0x3.24b868p-4, 0x1.8631ecp-8, 0x8.3cb46p-8 }, /* 5 */
+ { 0x1.3a38eap+4, 0x1.3a42cep+4, 0x1.3a4d8ap+4, 0x1.cd304ap-28, 0x2.e189ecp-4, -0x1.2c6954p-8, -0x7.8178ep-8 }, /* 6 */
+ { 0x1.6c7d42p+4, 0x1.6c833p+4, 0x1.6c99fp+4, -0x6.c63f1p-28, -0x2.acc9a8p-4, 0xf.09e31p-12, 0x7.0b5ab8p-8 }, /* 7 */
+ { 0x1.9ebec4p+4, 0x1.9ec47p+4, 0x1.9ed016p+4, 0x1.e53838p-24, 0x2.81f2p-4, -0xc.5ff51p-12, -0x7.05ep-8 }, /* 8 */
+ { 0x1.d1008ep+4, 0x1.d10644p+4, 0x1.d11262p+4, 0x1.636feep-24, -0x2.5e40dcp-4, 0xa.6f81dp-12, 0x5.ff6p-8 }, /* 9 */
+ { 0x1.01a27cp+5, 0x1.01a442p+5, 0x1.01a924p+5, -0x4.04e1bp-28, 0x2.3febd8p-4, -0x8.f11e2p-12, -0x6.0111ap-8 }, /* 10 */
+ { 0x1.1ac3bcp+5, 0x1.1ac588p+5, 0x1.1ac912p+5, 0x3.6063d8p-24, -0x2.25baacp-4, 0x7.c93cdp-12, 0x4.e7577p-8 }, /* 11 */
+ { 0x1.33e508p+5, 0x1.33e6ecp+5, 0x1.33ea1ap+5, -0x3.f39ebcp-24, 0x2.0ed04cp-4, -0x6.d9434p-12, -0x4.fc0b7p-8 }, /* 12 */
+ { 0x1.4d0666p+5, 0x1.4d0868p+5, 0x1.4d0c14p+5, -0x4.ea23p-28, -0x1.fa8b4p-4, 0x6.1470e8p-12, 0x5.97f71p-8 }, /* 13 */
+ { 0x1.6628b8p+5, 0x1.6629f4p+5, 0x1.662e0ep+5, -0x3.5df0c8p-24, 0x1.e8727ep-4, -0x5.76a038p-12, -0x4.ee37c8p-8 }, /* 14 */
+ { 0x1.7f4a7cp+5, 0x1.7f4b9p+5, 0x1.7f4daap+5, 0x1.1ef09ep-24, -0x1.d8293ap-4, 0x4.ed8a28p-12, 0x4.d43708p-8 }, /* 15 */
+ { 0x1.986c5cp+5, 0x1.986d38p+5, 0x1.986f6p+5, 0x1.b70cecp-24, 0x1.c967p-4, -0x4.7a70b8p-12, -0x5.6840e8p-8 }, /* 16 */
+ { 0x1.b18dcap+5, 0x1.b18ee8p+5, 0x1.b19122p+5, 0x1.abaadcp-24, -0x1.bbf246p-4, 0x4.1a35bp-12, 0x3.c2d46p-8 }, /* 17 */
+ { 0x1.caaf86p+5, 0x1.cab0a2p+5, 0x1.cab2fep+5, 0x1.63989ep-24, 0x1.af9cb4p-4, -0x3.c2f2dcp-12, -0x4.cf8108p-8 }, /* 18 */
+ { 0x1.e3d146p+5, 0x1.e3d262p+5, 0x1.e3d492p+5, -0x1.68a8ecp-24, -0x1.a4407ep-4, 0x3.7733ecp-12, 0x5.97916p-8 }, /* 19 */
+ { 0x1.fcf316p+5, 0x1.fcf428p+5, 0x1.fcf59ap+5, 0x1.e1bb5p-24, 0x1.99be74p-4, -0x3.37210cp-12, -0x5.d19bf8p-8 }, /* 20 */
+ { 0x1.0b0a7cp+6, 0x1.0b0afap+6, 0x1.0b0b9cp+6, -0x5.5bbcfp-24, -0x1.8ffc9ap-4, 0x2.ffe638p-12, 0x2.ed28e8p-8 }, /* 21 */
+ { 0x1.179b66p+6, 0x1.179bep+6, 0x1.179d0ap+6, -0x4.9e34a8p-24, 0x1.86e51cp-4, -0x2.cc7a68p-12, -0x3.3642c4p-8 }, /* 22 */
+ { 0x1.242c5cp+6, 0x1.242ccap+6, 0x1.242d68p+6, 0x1.966706p-24, -0x1.7e657p-4, 0x2.9aed4cp-12, 0x7.b87a58p-8 }, /* 23 */
+ { 0x1.30bd62p+6, 0x1.30bdb6p+6, 0x1.30beb2p+6, 0x3.4b3b68p-24, 0x1.766dc2p-4, -0x2.72651cp-12, -0x3.e347f8p-8 }, /* 24 */
+ { 0x1.3d4e56p+6, 0x1.3d4ea2p+6, 0x1.3d4f2ep+6, 0x6.a99008p-28, -0x1.6ef07ep-4, 0x2.53aec4p-12, 0x2.9e3d88p-12 }, /* 25 */
+ { 0x1.49df38p+6, 0x1.49df9p+6, 0x1.49e042p+6, -0x7.a9fa6p-32, 0x1.67e1dap-4, -0x2.324d7p-12, -0xc.0e669p-12 }, /* 26 */
+ { 0x1.56702ep+6, 0x1.56708p+6, 0x1.567116p+6, -0x5.026808p-24, -0x1.613798p-4, 0x2.114594p-12, 0x1.a22402p-8 }, /* 27 */
+ { 0x1.630126p+6, 0x1.63017p+6, 0x1.630226p+6, 0x4.46aa2p-24, 0x1.5ae8c2p-4, -0x1.f4aaa4p-12, -0x3.58593cp-8 }, /* 28 */
+ { 0x1.6f9234p+6, 0x1.6f926p+6, 0x1.6f92b2p+6, 0x1.5cfccp-24, -0x1.54ed76p-4, 0x1.dd540ap-12, -0xb.e9429p-12 }, /* 29 */
+ { 0x1.7c22fep+6, 0x1.7c2352p+6, 0x1.7c23c2p+6, -0xb.4dc4cp-28, 0x1.4f3ebcp-4, -0x1.c463fp-12, -0x1.e94c54p-8 }, /* 30 */
+ { 0x1.88b412p+6, 0x1.88b444p+6, 0x1.88b50ap+6, 0x3.f5343p-24, -0x1.49d668p-4, 0x1.a53f24p-12, 0x5.128198p-8 }, /* 31 */
+ { 0x1.9544dcp+6, 0x1.954538p+6, 0x1.95459p+6, -0x6.e6f32p-28, 0x1.44aefap-4, -0x1.9a6ef8p-12, -0x6.c639cp-8 }, /* 32 */
+ { 0x1.a1d5fap+6, 0x1.a1d62cp+6, 0x1.a1d674p+6, 0x1.f359c2p-28, -0x1.3fc386p-4, 0x1.887ebep-12, 0x1.6c813cp-8 }, /* 33 */
+ { 0x1.ae66cp+6, 0x1.ae672p+6, 0x1.ae6788p+6, -0x2.9de748p-24, 0x1.3b0fa4p-4, -0x1.777f26p-12, 0x1.c128ccp-8 }, /* 34 */
+ { 0x1.baf7c2p+6, 0x1.baf816p+6, 0x1.baf86cp+6, -0x2.24ccc8p-24, -0x1.368f5cp-4, 0x1.62bd9ep-12, 0xa.df002p-8 }, /* 35 */
+ { 0x1.c788dap+6, 0x1.c7890cp+6, 0x1.c7896cp+6, 0x4.7dcea8p-24, 0x1.323f16p-4, -0x1.61abf4p-12, 0xa.ad73ep-8 }, /* 36 */
+ { 0x1.d419ccp+6, 0x1.d41a02p+6, 0x1.d41a68p+6, -0x4.b538p-24, -0x1.2e1b98p-4, 0x1.4a4d64p-12, 0x3.a47674p-8 }, /* 37 */
+ { 0x1.e0aacep+6, 0x1.e0aaf8p+6, 0x1.e0ab5ep+6, 0x3.09dc4cp-24, 0x1.2a21ecp-4, -0x1.3fa20cp-12, 0x2.216e8cp-8 }, /* 38 */
+ { 0x1.ed3bb8p+6, 0x1.ed3beep+6, 0x1.ed3c56p+6, 0x4.d5a58p-28, -0x1.264f66p-4, 0x1.32c4cep-12, 0x1.53cbb4p-8 }, /* 39 */
+ { 0x1.f9ccaep+6, 0x1.f9cce6p+6, 0x1.f9cd52p+6, 0x3.f4c44cp-24, 0x1.22a192p-4, -0x1.1f8514p-12, -0xc.0de32p-8 }, /* 40 */
+ { 0x1.032ed6p+7, 0x1.032eeep+7, 0x1.032f0cp+7, 0x2.4beae8p-24, -0x1.1f1634p-4, 0x1.171664p-12, 0x1.72a654p-4 }, /* 41 */
+ { 0x1.097756p+7, 0x1.09776ap+7, 0x1.09779cp+7, -0xd.a581ep-28, 0x1.1bab3cp-4, -0xf.9f507p-16, -0xc.ba2d4p-8 }, /* 42 */
+ { 0x1.0fbfdp+7, 0x1.0fbfe6p+7, 0x1.0fbff6p+7, 0xa.7c0bdp-28, -0x1.185eccp-4, 0x1.01d7dep-12, -0x1.a2186ep-4 }, /* 43 */
+ { 0x1.160856p+7, 0x1.160862p+7, 0x1.16087ap+7, -0x1.9452ecp-24, 0x1.152f26p-4, -0x1.07b4aap-12, 0x1.6bbd7ep-4 }, /* 44 */
+ { 0x1.1c50dp+7, 0x1.1c50dep+7, 0x1.1c5118p+7, 0x3.83b7b8p-24, -0x1.121ab2p-4, 0x1.0e938cp-12, -0x5.1a6dfp-8 }, /* 45 */
+ { 0x1.22995p+7, 0x1.22995ap+7, 0x1.229976p+7, -0x6.5ca3a8p-24, 0x1.0f1ff2p-4, -0xe.f198p-16, -0x3.8e98b8p-8 }, /* 46 */
+ { 0x1.28e1ccp+7, 0x1.28e1d8p+7, 0x1.28e1f4p+7, -0x6.bb61ap-24, -0x1.0c3d8ap-4, 0xf.a14b9p-16, 0x9.81e82p-4 }, /* 47 */
+ { 0x1.2f2a48p+7, 0x1.2f2a54p+7, 0x1.2f2a74p+7, 0x2.2438p-24, 0x1.097236p-4, -0xd.fed5ep-16, -0x3.19eb5cp-8 }, /* 48 */
+ { 0x1.3572b8p+7, 0x1.3572dp+7, 0x1.3572ecp+7, 0x3.1e0054p-24, -0x1.06bcc8p-4, 0xd.d2596p-16, -0x1.67e00ap-4 }, /* 49 */
+ { 0x1.3bbb3ep+7, 0x1.3bbb4ep+7, 0x1.3bbb6ap+7, 0x7.46c908p-24, 0x1.041c28p-4, -0xd.04045p-16, -0x8.fb297p-8 }, /* 50 */
+ { 0x1.4203aep+7, 0x1.4203cap+7, 0x1.4203e6p+7, -0xb.4f158p-28, -0x1.018f52p-4, 0xc.ccf6fp-16, 0x1.4d5dp-4 }, /* 51 */
+ { 0x1.484c38p+7, 0x1.484c46p+7, 0x1.484c56p+7, -0x6.5a89c8p-24, 0xf.f155p-8, -0xc.5d21dp-16, -0xd.aca34p-8 }, /* 52 */
+ { 0x1.4e94b8p+7, 0x1.4e94c4p+7, 0x1.4e94d4p+7, -0x1.ef16c8p-24, -0xf.cad3fp-8, 0xb.d75f8p-16, 0x1.f36732p-4 }, /* 53 */
+ { 0x1.54dd36p+7, 0x1.54dd4p+7, 0x1.54dd52p+7, -0x6.1e7b68p-24, 0xf.a564cp-8, -0xb.ec1cfp-16, 0xe.e7421p-8 }, /* 54 */
+ { 0x1.5b25aep+7, 0x1.5b25bep+7, 0x1.5b25d4p+7, -0xf.8c858p-28, -0xf.80faep-8, 0xb.8b6c5p-16, -0x5.835ed8p-8 }, /* 55 */
+ { 0x1.616e34p+7, 0x1.616e3cp+7, 0x1.616e4ep+7, 0x7.75d858p-24, 0xf.5d8abp-8, -0xb.b3779p-16, 0x2.40b948p-4 }, /* 56 */
+ { 0x1.67b6bp+7, 0x1.67b6b8p+7, 0x1.67b6dp+7, 0x1.d78632p-24, -0xf.3b096p-8, 0xa.daf89p-16, 0x1.aa8548p-8 }, /* 57 */
+ { 0x1.6dff28p+7, 0x1.6dff36p+7, 0x1.6dff54p+7, 0x3.b24794p-24, 0xf.196c7p-8, -0xb.1afe1p-16, -0x1.77538cp-8 }, /* 58 */
+ { 0x1.7447a2p+7, 0x1.7447b2p+7, 0x1.7447cap+7, 0x6.39cbc8p-24, -0xe.f8aa5p-8, 0xa.50daap-16, 0x1.9592c2p-8 }, /* 59 */
+ { 0x1.7a902p+7, 0x1.7a903p+7, 0x1.7a903ep+7, -0x1.820e3ap-24, 0xe.d8b9dp-8, -0xa.998cp-16, -0x2.c35d78p-4 }, /* 60 */
+ { 0x1.80d89ep+7, 0x1.80d8aep+7, 0x1.80d8bep+7, -0x2.c7e038p-24, -0xe.b9925p-8, 0x9.ce06p-16, -0x2.2b3054p-4 }, /* 61 */
+ { 0x1.87211cp+7, 0x1.87212cp+7, 0x1.872144p+7, 0x6.ab31c8p-24, 0xe.9b2bep-8, -0x9.4de7p-16, -0x1.32cb5ep-4 }, /* 62 */
+ { 0x1.8d699ap+7, 0x1.8d69a8p+7, 0x1.8d69bp+7, 0x4.4ef25p-24, -0xe.7d7ecp-8, 0x9.a0f1ep-16, 0x1.6ac076p-4 }, /* 63 */
+};
+
+/* Evaluate y0f(x) for large x with the formula page 5 of https://www.cl.cam.ac.uk/~jrh13/papers/bessel.pdf:
+ y0(x) ~ sqrt(2/(pi*x))*beta0(x)*sin(x-pi/4-alpha0(x))
+ where beta0(x) = 1 - 1/(16*x^2) + 53/(512*x^4)
+ and alpha0(x) = 1/(8*x) - 25/(384*x^3). */
+static float
+y0f_asympt (float x)
+{
+ /* The following code fails to give an error <= 9 ulps for only two inputs,
+ for which we tabulate the correctly-rounded result. */
+ if (x == 0x1.bfad96p+7f)
+ return -0x7.f32bdp-32f;
+ if (x == 0x1.2e2a42p+17f)
+ return 0x1.a48974p-40f;
+ double y = 1.0 / (double) x; /* y = 1/x */
+ double y2 = y * y;
+ double beta0 = 1.0f + y2 * (-0x1p-4f + 0x1.a8p-4 * y2); /* 1 - y^2/16 + (53/512)*y^4 */
+ double alpha0 = y * (0x2p-4 - 0x1.0aaaaap-4 * y2); /* y/8 - c*y^3 with c approximately 25/384 */
+ double h;
+ int n;
+ h = reduce_aux (x, &n, alpha0);
+ /* Now x - pi/4 - alpha0 = h + n*pi/2 mod (2*pi). */
+ float xr = (float) h;
+ n = n & 3; /* only n mod 4 matters since the identity above holds mod 2*pi */
+ float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */
+ float t = cst / sqrtf (x) * (float) beta0; /* sqrt(2/(pi*x))*beta0(x) */
+ if (n == 0) /* sin(x+0) = sin(x) */
+ return t * sinf (xr);
+ else if (n == 2) /* sin(x+pi) = -sin(x) */
+ return -t * sinf (xr);
+ else if (n == 1) /* sin(x+pi/2) = cos(x) */
+ return t * cosf (xr);
+ else /* n == 3: sin(x+3pi/2) = -cos(x) */
+ return -t * cosf (xr);
+}
+
+/* Special code for x near a root of y0.
+ z is the value computed by the generic code; it is returned unchanged
+ when x lies outside the tabulated interval around the root.
+ For small x, use a polynomial approximating y0 around its root.
+ For large x, use an asymptotic formula (y0f_asympt). */
+static float
+y0f_near_root (float x, float z)
+{
+ float index_f;
+ int index;
+
+ index_f = roundf ((x - FIRST_ZERO_Y0) / (float) M_PI); /* estimated row of Py for x; the tabulated zeros are about pi apart */
+ if (index_f >= SMALL_SIZE)
+ return y0f_asympt (x);
+ index = (int) index_f;
+ const float *p = Py[index]; /* row layout: {x0, xmid, x1, p0, p1, p2, p3} */
+ float x0 = p[0];
+ float x1 = p[2];
+ /* If not in the interval [x0,x1] around xmid, return the value z. */
+ if (! (x0 <= x && x <= x1))
+ return z;
+ float xmid = p[1];
+ float y = x - xmid;
+ /* For index 0 use a degree-5 polynomial, where the coefficients of
+ degree 4 and 5 are hard-coded (they are folded into p6 below). */
+ float p6 = (index > 0) ? p[6] : p[6] + y * (-0x3.a46c9p-4 + y * 0x3.735478p-4);
+ float res = p[3] + y * (p[4] + y * (p[5] + y * p6)); /* Horner evaluation in y = x - xmid */
+ return res;
+}
+
float
__ieee754_y0f(float x)
{
@@ -124,7 +452,8 @@ __ieee754_y0f(float x)
if(ix>=0x7f800000) return one/(x+x*x);
if(ix==0) return -1/zero; /* -inf and divide by zero exception. */
if(hx<0) return zero/(zero*x);
- if(ix >= 0x40000000) { /* |x| >= 2.0 */
+ if(ix >= 0x40000000 || (0x3f5340ed <= ix && ix <= 0x3f77b5e5)) {
+ /* |x| >= 2.0 or 0x1.a681dap-1 <= |x| <= 0x1.ef6bcap-1 (around 1st zero) */
/* y0(x) = sqrt(2/(pi*x))*(p0(x)*sin(x0)+q0(x)*cos(x0))
* where x0 = x-pi/4
* Better formula:
@@ -136,6 +465,7 @@ __ieee754_y0f(float x)
* sin(x) +- cos(x) = -cos(2x)/(sin(x) -+ cos(x))
* to compute the worse one.
*/
+ SET_RESTORE_ROUNDF (FE_TONEAREST);
__sincosf (x, &s, &c);
ss = s-c;
cc = s+c;
@@ -143,17 +473,25 @@ __ieee754_y0f(float x)
* j0(x) = 1/sqrt(pi) * (P(0,x)*cc - Q(0,x)*ss) / sqrt(x)
* y0(x) = 1/sqrt(pi) * (P(0,x)*ss + Q(0,x)*cc) / sqrt(x)
*/
- if(ix<0x7f000000) { /* make sure x+x not overflow */
- z = -__cosf(x+x);
- if ((s*c)<zero) cc = z/ss;
- else ss = z/cc;
- }
- if(ix>0x5c000000) z = (invsqrtpi*ss)/sqrtf(x);
- else {
- u = pzerof(x); v = qzerof(x);
- z = invsqrtpi*(u*ss+v*cc)/sqrtf(x);
- }
- return z;
+ if (ix >= 0x7f000000) /* x >= 2^127: use asymptotic expansion. */
+ return y0f_asympt (x);
+ /* Now we are sure that x+x cannot overflow. */
+ z = -__cosf(x+x);
+ if ((s*c)<zero) cc = z/ss;
+ else ss = z/cc;
+ if (ix <= 0x5c000000)
+ {
+ u = pzerof(x); v = qzerof(x);
+ ss = u*ss+v*cc;
+ }
+ z = (invsqrtpi*ss)/sqrtf(x);
+ /* The following threshold is optimal (determined on
+ aarch64-linux-gnu). */
+ float threshold = 0x1.be585ap-4;
+ if (fabsf (ss) > threshold)
+ return z;
+ else
+ return y0f_near_root (x, z);
}
if(ix<=0x39800000) { /* x < 2**-13 */
return(u00 + tpi*__ieee754_logf(x));
@@ -165,7 +503,7 @@ __ieee754_y0f(float x)
}
libm_alias_finite (__ieee754_y0f, __y0f)
-/* The asymptotic expansions of pzero is
+/* The asymptotic expansion of pzero is
* 1 - 9/128 s^2 + 11025/98304 s^4 - ..., where s = 1/x.
* For x >= 2, We approximate pzero by
* pzero(x) = 1 + (R/S)
@@ -257,7 +595,7 @@ pzerof(float x)
}
-/* For x >= 8, the asymptotic expansions of qzero is
+/* For x >= 8, the asymptotic expansion of qzero is
* -1/8 s + 75/1024 s^3 - ..., where s = 1/x.
* We approximate pzero by
* qzero(x) = s*(-1.25 + (R/S))