diff mbox series

Fix the inaccuracy of y0f (BZ 14471)

Message ID	mwim7u2phn.fsf@tomate.loria.fr
State	Superseded
Headers	DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org A9F93384C003 Date: Mon, 18 Jan 2021 17:12:36 +0100 Message-Id: <mwim7u2phn.fsf@tomate.loria.fr> From: Paul Zimmermann <Paul.Zimmermann@inria.fr> To: libc-alpha@sourceware.org Subject: Fix the inaccuracy of y0f (BZ 14471) Precedence: list Errors-To: libc-alpha-bounces@sourceware.org Sender: "Libc-alpha" <libc-alpha-bounces@sourceware.org>
Series	Fix the inaccuracy of y0f (BZ 14471) \| Fix the inaccuracy of y0f (BZ 14471)

Commit Message

Paul Zimmermann Jan. 18, 2021, 4:12 p.m. UTC

  The largest error for all binary32 inputs is now less than 9.5 ulps with
respect to the infinite precision result, i.e., less than 9 ulps after
rounding, which is the largest error allowed. (This is for rounding to
nearest, for other rounding modes, the new code should not behave worse than
the old one.)

The new code is enabled only when there is a cancellation at the very end of
the y0f computation, thus should not give any visible slowdown on average.
When there is a cancellation, two different algorithms are used:

* around the first 64 zeros of y0, approximation polynomials of degree 3 are
  used, which were computed using the Sollya tool (https://www.sollya.org/)

* for large inputs, an asymptotic formula is used from [1]

[1] Fast and Accurate Bessel Function Computation,
John Harrison, Proceedings of Arith 19, 2009.

The largest error is now obtained for the following input:

libm wrong by up to 9.49e+00 ulp(s) for x=0x1.3d4e56p+6
y0f     gives 0x1.b42876p-16
mpfr_y0 gives 0x1.b42864p-16

Note: that patch is independent of the one for j0f [2]. If both are accepted,
a merge will be needed since they modify the same file and some routines can
be shared, but I prefer to submit them in two patches so that they can be
reviewed independently.

[2] https://sourceware.org/pipermail/libc-alpha/2021-January/121694.html
---
 math/auto-libm-test-in            |   3 +-
 math/auto-libm-test-out-y0        |  25 ++
 sysdeps/ieee754/flt-32/e_j0f.c    | 364 ++++++++++++++++++++++++++++--
 sysdeps/x86_64/fpu/libm-test-ulps |  14 +-
 4 files changed, 384 insertions(+), 22 deletions(-)

diff mbox series

Patch

diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in
index 73840b8bef..78a8e5fd87 100644
--- a/math/auto-libm-test-in
+++ b/math/auto-libm-test-in
@@ -8225,8 +8225,9 @@  y0 0x1p-100
 y0 0x1p-110
 y0 0x1p-600
 y0 0x1p-10000
-# the next value generates larger error bounds on x86_64 (binary32)
+# the next values generate large error bounds on x86_64 (binary32)
 y0 0xd.3432bp-4
+y0 0x1.3d4e56p+6
 y0 min
 y0 min_subnorm
 
diff --git a/math/auto-libm-test-out-y0 b/math/auto-libm-test-out-y0
index 8ebb585170..ab92487da4 100644
--- a/math/auto-libm-test-out-y0
+++ b/math/auto-libm-test-out-y0
@@ -820,6 +820,31 @@  y0 0xd.3432bp-4
 = y0 tonearest ibm128 0xd.3432bp-4 : -0xf.fdd871793bc71f92d6b137b44p-8 : inexact-ok
 = y0 towardzero ibm128 0xd.3432bp-4 : -0xf.fdd871793bc71f92d6b137b44p-8 : inexact-ok
 = y0 upward ibm128 0xd.3432bp-4 : -0xf.fdd871793bc71f92d6b137b44p-8 : inexact-ok
+y0 0x1.3d4e56p+6
+= y0 downward binary32 0x4.f53958p+4 : 0x1.b42862p-16 : inexact-ok
+= y0 tonearest binary32 0x4.f53958p+4 : 0x1.b42864p-16 : inexact-ok
+= y0 towardzero binary32 0x4.f53958p+4 : 0x1.b42862p-16 : inexact-ok
+= y0 upward binary32 0x4.f53958p+4 : 0x1.b42864p-16 : inexact-ok
+= y0 downward binary64 0x4.f53958p+4 : 0x1.b428630651a17p-16 : inexact-ok
+= y0 tonearest binary64 0x4.f53958p+4 : 0x1.b428630651a17p-16 : inexact-ok
+= y0 towardzero binary64 0x4.f53958p+4 : 0x1.b428630651a17p-16 : inexact-ok
+= y0 upward binary64 0x4.f53958p+4 : 0x1.b428630651a18p-16 : inexact-ok
+= y0 downward intel96 0x4.f53958p+4 : 0x1.b428630651a175acp-16 : inexact-ok
+= y0 tonearest intel96 0x4.f53958p+4 : 0x1.b428630651a175acp-16 : inexact-ok
+= y0 towardzero intel96 0x4.f53958p+4 : 0x1.b428630651a175acp-16 : inexact-ok
+= y0 upward intel96 0x4.f53958p+4 : 0x1.b428630651a175aep-16 : inexact-ok
+= y0 downward m68k96 0x4.f53958p+4 : 0x1.b428630651a175acp-16 : inexact-ok
+= y0 tonearest m68k96 0x4.f53958p+4 : 0x1.b428630651a175acp-16 : inexact-ok
+= y0 towardzero m68k96 0x4.f53958p+4 : 0x1.b428630651a175acp-16 : inexact-ok
+= y0 upward m68k96 0x4.f53958p+4 : 0x1.b428630651a175aep-16 : inexact-ok
+= y0 downward binary128 0x4.f53958p+4 : 0x1.b428630651a175ac20abf8104eeap-16 : inexact-ok
+= y0 tonearest binary128 0x4.f53958p+4 : 0x1.b428630651a175ac20abf8104eebp-16 : inexact-ok
+= y0 towardzero binary128 0x4.f53958p+4 : 0x1.b428630651a175ac20abf8104eeap-16 : inexact-ok
+= y0 upward binary128 0x4.f53958p+4 : 0x1.b428630651a175ac20abf8104eebp-16 : inexact-ok
+= y0 downward ibm128 0x4.f53958p+4 : 0x1.b428630651a175ac20abf8104e8p-16 : inexact-ok
+= y0 tonearest ibm128 0x4.f53958p+4 : 0x1.b428630651a175ac20abf8104fp-16 : inexact-ok
+= y0 towardzero ibm128 0x4.f53958p+4 : 0x1.b428630651a175ac20abf8104e8p-16 : inexact-ok
+= y0 upward ibm128 0x4.f53958p+4 : 0x1.b428630651a175ac20abf8104fp-16 : inexact-ok
 y0 min
 = y0 downward binary32 0x4p-128 : -0x3.7ac89cp+4 : inexact-ok
 = y0 tonearest binary32 0x4p-128 : -0x3.7ac89cp+4 : inexact-ok
diff --git a/sysdeps/ieee754/flt-32/e_j0f.c b/sysdeps/ieee754/flt-32/e_j0f.c
index 5d29611eb7..42b87dcdba 100644
--- a/sysdeps/ieee754/flt-32/e_j0f.c
+++ b/sysdeps/ieee754/flt-32/e_j0f.c
@@ -112,6 +112,335 @@  v02  =  7.6006865129e-05, /* 0x389f65e0 */
 v03  =  2.5915085189e-07, /* 0x348b216c */
 v04  =  4.4111031494e-10; /* 0x2ff280c2 */
 
+#define FIRST_ZERO_Y0 0.893576f
+
+#define SMALL_SIZE 64
+
+/* The following table contains successive zeros of y0 and degree-3 polynomial
+   approximations of y0 around these zeros: Py[0] for the first zero (0.893576),
+   Py[1] for the second one (3.957678), and so one. Each line contains:
+              {x0, xmid, x1, p0, p1, p2, p3}
+   where [x0,x1] is the interval around the zero, xmid is the binary32 number closest
+   to the zero, and p0+p1*x+p2*x^2+p3*x^3 is the approximation polynomial.
+   Each polynomial was generated using Remez' algorithm on the interval [x0,x1]
+   around the corresponding zero where the error is larger than 9 ulps for the
+   default code below.
+   Degree 3 is enough to get an error less than 4 ulps.
+*/
+static float Py[SMALL_SIZE][7] = {
+  /* for the first root we use a degree-4 polynomial since degree 3 is not enough,
+     where the coefficient of degree 4 is hard-coded in y0f_near_root() */
+{0xd.bd613p-4,0xe.4c176p-4,0xe.e0897p-4,0x3.274468p-28,0xe.121b7p-4,-0x7.df8eap-4,0x3.88cc2p-4/*,-0x3.9ce95p-4*/},
+{0x3.f2af8cp+0,0x3.f52a68p+0,0x3.fa1fa4p+0,0xa.f1f83p-28,-0x6.70d098p-4,0xd.04dc4p-8,0xe.f2a5p-8},
+{0x7.1464ap+0,0x7.16077p+0,0x7.191dap+0,-0x5.e2a51p-28,0x4.cd3328p-4,-0x5.6bbb5p-8,-0xc.48cfbp-8},
+{0xa.37ec2p+0,0xa.38ebap+0,0xa.3ad94p+0,-0x1.4b0aeep-24,-0x3.fec6b8p-4,0x3.206ccp-8,0xa.72401p-8},
+{0xd.5bd7dp+0,0xd.5c70ep+0,0xd.5d94ap+0,-0x8.179d7p-28,0x3.7e6544p-4,-0x2.178554p-8,-0x9.35f5bp-8},
+{0x1.07f9aap+4,0x1.0803c8p+4,0x1.08170cp+4,-0x2.5b8078p-24,-0x3.24b868p-4,0x1.86265ep-8,0x8.51de2p-8},
+{0x1.3a3d44p+4,0x1.3a42cep+4,0x1.3a4d8ap+4,0x1.cd304ap-28,0x2.e189ecp-4,-0x1.2c673ap-8,-0x7.a4726p-8},
+{0x1.6c7d5ep+4,0x1.6c833p+4,0x1.6c99fp+4,-0x6.c63f1p-28,-0x2.acc9a8p-4,0xf.077a1p-12,0x7.1aba98p-8},
+{0x1.9ebec4p+4,0x1.9ec47p+4,0x1.9ed016p+4,0x1.e53838p-24,0x2.81f2p-4,-0xc.61ccp-12,-0x6.aaa99p-8},
+{0x1.d1008ep+4,0x1.d10644p+4,0x1.d10cf2p+4,0x1.636feep-24,-0x2.5e40dcp-4,0xa.6dfp-12,0x6.4cd5a8p-8},
+{0x2.0344f8p+4,0x2.034884p+4,0x2.034d4p+4,-0x4.04e1bp-28,0x2.3febd8p-4,-0x8.f0ff9p-12,-0x5.fcd088p-8},
+{0x2.358778p+4,0x2.358b1p+4,0x2.359224p+4,0x3.6063d8p-24,-0x2.25baacp-4,0x7.c6a088p-12,0x5.b78ff8p-8},
+{0x2.67ca1p+4,0x2.67cdd8p+4,0x2.67d434p+4,-0x3.f39ebcp-24,0x2.0ed04cp-4,-0x6.d7eaf8p-12,-0x5.7aeaa8p-8},
+{0x2.9a0d0cp+4,0x2.9a10dp+4,0x2.9a1828p+4,-0x4.ea23p-28,-0x1.fa8b4p-4,0x6.158438p-12,0x5.45324p-8},
+{0x2.cc51ecp+4,0x2.cc53e8p+4,0x2.cc580cp+4,-0x3.5df0c8p-24,0x1.e8727ep-4,-0x5.7460d8p-12,-0x5.1536p-8},
+{0x2.fe94f8p+4,0x2.fe972p+4,0x2.fe9b18p+4,0x1.1ef09ep-24,-0x1.d8293ap-4,0x4.ed6058p-12,0x4.e9fcc8p-8},
+{0x3.30d8b8p+4,0x3.30da7p+4,0x3.30debcp+4,0x1.b70cecp-24,0x1.c967p-4,-0x4.7ad838p-12,-0x4.c2c9d8p-8},
+{0x3.631b94p+4,0x3.631ddp+4,0x3.632244p+4,0x1.abaadcp-24,-0x1.bbf246p-4,0x4.187ba8p-12,0x4.9f09f8p-8},
+{0x3.955f7cp+4,0x3.956144p+4,0x3.9565fcp+4,0x1.63989ep-24,0x1.af9cb4p-4,-0x3.c397f8p-12,-0x4.7e4038p-8},
+{0x3.c7a2bp+4,0x3.c7a4c4p+4,0x3.c7a878p+4,-0x1.68a8ecp-24,-0x1.a4407ep-4,0x3.797fdcp-12,0x4.600d3p-8},
+{0x3.f9e62cp+4,0x3.f9e85p+4,0x3.f9ea7p+4,0x1.e1bb5p-24,0x1.99be74p-4,-0x3.38739cp-12,-0x4.441c5p-8},
+{0x4.2c2a1p+4,0x4.2c2be8p+4,0x4.2c2e7p+4,-0x5.5bbcfp-24,-0x1.8ffc9ap-4,0x2.ff0f5cp-12,0x4.2a266p-8},
+{0x4.5e6d98p+4,0x4.5e6f8p+4,0x4.5e71c8p+4,-0x4.9e34a8p-24,0x1.86e51cp-4,-0x2.cba284p-12,-0x4.11f568p-8},
+{0x4.90b17p+4,0x4.90b328p+4,0x4.90b5ap+4,0x1.966706p-24,-0x1.7e657p-4,0x2.9e0d44p-12,0x3.fb56a4p-8},
+{0x4.c2f59p+4,0x4.c2f6d8p+4,0x4.c2fac8p+4,0x3.4b3b68p-24,0x1.766dc2p-4,-0x2.752fbp-12,-0x3.e61fcp-8},
+{0x4.f53968p+4,0x4.f53a88p+4,0x4.f53cb8p+4,0x6.a99008p-28,-0x1.6ef07ep-4,0x2.501294p-12,0x3.d230ep-8},
+{0x5.277dp+4,0x5.277e4p+4,0x5.278108p+4,-0x7.a9fa6p-32,0x1.67e1dap-4,-0x2.2e9388p-12,-0x3.bf663cp-8},
+{0x5.59c0e8p+4,0x5.59c2p+4,0x5.59c398p+4,-0x5.026808p-24,-0x1.613798p-4,0x2.104558p-12,0x3.ada76cp-8},
+{0x5.8c0498p+4,0x5.8c05cp+4,0x5.8c0898p+4,0x4.46aa2p-24,0x1.5ae8c2p-4,-0x1.f474eep-12,-0x3.9cda1cp-8},
+{0x5.be48dp+4,0x5.be498p+4,0x5.be4aap+4,0x1.5cfccp-24,-0x1.54ed76p-4,0x1.dad812p-12,0x3.8cec8p-8},
+{0x5.f08c08p+4,0x5.f08d48p+4,0x5.f08ecp+4,-0xb.4dc4cp-28,0x1.4f3ebcp-4,-0x1.c38338p-12,-0x3.7dc9dp-8},
+{0x6.22d05p+4,0x6.22d11p+4,0x6.22d428p+4,0x3.f5343p-24,-0x1.49d668p-4,0x1.ade97cp-12,0x3.6f610cp-8},
+{0x6.55137p+4,0x6.5514ep+4,0x6.551638p+4,-0x6.e6f32p-28,0x1.44aefap-4,-0x1.9a2d3ep-12,-0x3.61a778p-8},
+{0x6.8757e8p+4,0x6.8758bp+4,0x6.8759c8p+4,0x1.f359c2p-28,-0x1.3fc386p-4,0x1.87d25cp-12,0x3.548be4p-8},
+{0x6.b99bp+4,0x6.b99c8p+4,0x6.b99e2p+4,-0x2.9de748p-24,0x1.3b0fa4p-4,-0x1.76b5aap-12,-0x3.48048p-8},
+{0x6.ebdfb8p+4,0x6.ebe058p+4,0x6.ebe1bp+4,-0x2.24ccc8p-24,-0x1.368f5cp-4,0x1.67061p-12,0x3.3c0608p-8},
+{0x7.1e2368p+4,0x7.1e243p+4,0x7.1e25bp+4,0x4.7dcea8p-24,0x1.323f16p-4,-0x1.5858c4p-12,-0x3.3087acp-8},
+{0x7.50673p+4,0x7.506808p+4,0x7.5069ap+4,-0x4.b538p-24,-0x1.2e1b98p-4,0x1.4a9624p-12,0x3.258078p-8},
+{0x7.82ab38p+4,0x7.82abep+4,0x7.82ad78p+4,0x3.09dc4cp-24,0x1.2a21ecp-4,-0x1.3da94p-12,-0x3.1ae88cp-8},
+{0x7.b4eeep+4,0x7.b4efb8p+4,0x7.b4f158p+4,0x4.d5a58p-28,-0x1.264f66p-4,0x1.317f8cp-12,0x3.10b8fcp-8},
+{0x7.e732cp+4,0x7.e73398p+4,0x7.e73548p+4,0x3.f4c44cp-24,0x1.22a192p-4,-0x1.265128p-12,-0x3.06eb08p-8},
+{0x8.1976bp+4,0x8.19777p+4,0x8.19783p+4,0x2.4beae8p-24,-0x1.1f1634p-4,0x1.1b7d2ap-12,0x2.fd7934p-8},
+{0x8.4bbbp+4,0x8.4bbb5p+4,0x8.4bbcep+4,-0xd.a581ep-28,0x1.1bab3cp-4,-0x1.1186d6p-12,-0x2.f45cep-8},
+{0x8.7dfe8p+4,0x8.7dff3p+4,0x8.7dffbp+4,0xa.7c0bdp-28,-0x1.185eccp-4,0x1.0819c2p-12,0x2.eb92ccp-8},
+{0x8.b042bp+4,0x8.b0431p+4,0x8.b043dp+4,-0x1.9452ecp-24,0x1.152f26p-4,-0xf.f2b59p-16,-0x2.e314bp-8},
+{0x8.e2868p+4,0x8.e286fp+4,0x8.e287cp+4,0x3.83b7b8p-24,-0x1.121ab2p-4,0xf.6b21p-16,0x2.dadf34p-8},
+{0x9.14ca8p+4,0x9.14cadp+4,0x9.14cbbp+4,-0x6.5ca3a8p-24,0x1.0f1ff2p-4,-0xe.ea544p-16,-0x2.d2ee28p-8},
+{0x9.470e6p+4,0x9.470ecp+4,0x9.470fp+4,-0x6.bb61ap-24,-0x1.0c3d8ap-4,0xe.7833fp-16,0x2.cb3e2cp-8},
+{0x9.79524p+4,0x9.7952ap+4,0x9.79539p+4,0x2.2438p-24,0x1.097236p-4,-0xe.03747p-16,-0x2.c3cb48p-8},
+{0x9.ab95cp+4,0x9.ab968p+4,0x9.ab971p+4,0x3.1e0054p-24,-0x1.06bcc8p-4,0xd.94272p-16,0x2.bc9328p-8},
+{0x9.ddd9fp+4,0x9.ddda7p+4,0x9.dddb5p+4,0x7.46c908p-24,0x1.041c28p-4,-0xd.320e5p-16,-0x2.b5920cp-8},
+{0xa.101d7p+4,0xa.101e5p+4,0xa.101f3p+4,-0xb.4f158p-28,-0x1.018f52p-4,0xc.cc7dfp-16,0x2.aec5f4p-8},
+{0xa.4261cp+4,0xa.42623p+4,0xa.4262bp+4,-0x6.5a89c8p-24,0xf.f155p-8,-0xc.6b5cbp-16,-0x2.a82be4p-8},
+{0xa.74a5cp+4,0xa.74a62p+4,0xa.74a6ap+4,-0x1.ef16c8p-24,-0xf.cad3fp-8,0xc.16478p-16,0x2.a1c1ap-8},
+{0xa.a6e9bp+4,0xa.a6eap+4,0xa.a6ea9p+4,-0x6.1e7b68p-24,0xf.a564cp-8,-0xb.bd1eap-16,-0x2.9b84f8p-8},
+{0xa.d92d7p+4,0xa.d92dfp+4,0xa.d92eap+4,-0xf.8c858p-28,-0xf.80faep-8,0xb.6f5dbp-16,0x2.9573dcp-8},
+{0xb.0b71ap+4,0xb.0b71ep+4,0xb.0b727p+4,0x7.75d858p-24,0xf.5d8abp-8,-0xb.24e88p-16,-0x2.8f8c4cp-8},
+{0xb.3db58p+4,0xb.3db5cp+4,0xb.3db68p+4,0x1.d78632p-24,-0xf.3b096p-8,0xa.d5ef1p-16,0x2.89cc8p-8},
+{0xb.6ff96p+4,0xb.6ff9bp+4,0xb.6ffaap+4,0x3.b24794p-24,0xf.196c7p-8,-0xa.918e2p-16,-0x2.8432cp-8},
+{0xb.a23d1p+4,0xb.a23d9p+4,0xb.a23e5p+4,0x6.39cbc8p-24,-0xe.f8aa5p-8,0xa.486fap-16,0x2.7ebd9p-8},
+{0xb.d481p+4,0xb.d4818p+4,0xb.d481cp+4,-0x1.820e3ap-24,0xe.d8b9dp-8,-0xa.0973fp-16,-0x2.796b4cp-8},
+{0xc.06c4fp+4,0xc.06c57p+4,0xc.06c5fp+4,-0x2.c7e038p-24,-0xe.b9925p-8,0x9.cce94p-16,0x2.743a5cp-8},
+{0xc.3908ep+4,0xc.39096p+4,0xc.390a2p+4,0x6.ab31c8p-24,0xe.9b2bep-8,-0x9.92ad2p-16,-0x2.6f2994p-8},
+{0xc.6b4cdp+4,0xc.6b4d4p+4,0xc.6b4d8p+4,0x4.4ef25p-24,-0xe.7d7ecp-8,0x9.5360fp-16,0x2.6a37dp-8},
+};
+
+/* argument reduction mod 2pi: T[i] ~ 2^i mod (2*pi) */
+static double T[128] = {
+0x1p+0,
+0x2p+0,
+-0x2.487ed5110b462p+0,
+0x1.b7812aeef4b9fp+0,
+-0x2.d97c7f3321d24p+0,
+0x9.585d6aac7a1a8p-4,
+0x1.2b0bad558f435p+0,
+0x2.56175aab1e86ap+0,
+-0x1.9c501fbace38dp+0,
+0x3.0fde959b6ed46p+0,
+-0x2.8c1a9da2d9d3cp-4,
+-0x5.18353b45b3a78p-4,
+-0xa.306a768b674fp-4,
+-0x1.460d4ed16ce9ep+0,
+-0x2.8c1a9da2d9d3cp+0,
+0x1.304999cb579e8p+0,
+0x2.60933396af3dp+0,
+-0x1.87586de3accc3p+0,
+-0x3.0eb0dbc759986p+0,
+0x2.b1d1d8258155p-4,
+0x5.63a3b04b02aap-4,
+0xa.c74760960554p-4,
+0x1.58e8ec12c0aa8p+0,
+0x2.b1d1d8258155p+0,
+-0xe.4db24c6089bf8p-4,
+-0x1.c9b6498c1137fp+0,
+0x2.b51241f8e8d64p+0,
+-0xd.e5a511f3999bp-4,
+-0x1.bcb4a23e73336p+0,
+0x2.cf15909424df4p+0,
+-0xa.a53b3e8c1877p-4,
+-0x1.54a767d1830eep+0,
+-0x2.a94ecfa3061dcp+0,
+0xf.5e135caff0a78p-4,
+0x1.ebc26b95fe14fp+0,
+-0x2.70f9fde50f1c4p+0,
+0x1.668ad946ed0dap+0,
+0x2.cd15b28dda1b4p+0,
+-0xa.e536ff5570fbp-4,
+-0x1.5ca6dfeaae1f6p+0,
+-0x2.b94dbfd55c3ecp+0,
+0xd.5e3556652c89p-4,
+0x1.abc6aacca5912p+0,
+-0x2.f0f17f77c023ep+0,
+0x6.69bd6218afe64p-4,
+0xc.d37ac4315fcc8p-4,
+0x1.9a6f58862bf99p+0,
+-0x3.13a02404b352ep+0,
+0x2.13e8d07a4a046p-4,
+0x4.27d1a0f49408cp-4,
+0x8.4fa341e928118p-4,
+0x1.09f4683d25023p+0,
+0x2.13e8d07a4a046p+0,
+-0x2.20ad341c773d4p+0,
+0x2.07246cd81ccb8p+0,
+-0x2.3a35fb60d1af2p+0,
+0x1.d412de4f67e7ep+0,
+-0x2.a05918723b764p+0,
+0x1.07cca42c94599p+0,
+0x2.0f99485928b32p+0,
+-0x2.294c445eb9dfep+0,
+0x1.f5e64c5397865p+0,
+-0x2.5cb23c69dc396p+0,
+0x1.8f1a5c3d52d34p+0,
+0x3.1e34b87aa5a68p+0,
+-0xc.15641bbff90bp-8,
+-0x1.82ac8377ff216p-4,
+-0x3.055906effe42cp-4,
+-0x6.0ab20ddffc858p-4,
+-0xc.15641bbff90bp-4,
+-0x1.82ac8377ff216p+0,
+-0x3.055906effe42cp+0,
+0x3.dccc7310ec09ap-4,
+0x7.b998e621d8134p-4,
+0xf.7331cc43b0268p-4,
+0x1.ee6639887604dp+0,
+-0x2.6bb262001f3c6p+0,
+0x1.711a1110cccd4p+0,
+0x2.e2342221999a8p+0,
+-0x8.41690cdd81128p-4,
+-0x1.082d219bb0225p+0,
+-0x2.105a43376044ap+0,
+0x2.27ca4ea24abcep+0,
+-0x1.f8ea37cc75cc6p+0,
+0x2.56aa65781fad6p+0,
+-0x1.9b2a0a20cbeb6p+0,
+0x3.122ac0cf736f6p+0,
+-0x2.4295372246764p-4,
+-0x4.852a6e448cec8p-4,
+-0x9.0a54dc8919d9p-4,
+-0x1.214a9b91233b2p+0,
+-0x2.4295372246764p+0,
+0x1.c35466cc7e59ap+0,
+-0x2.c1d607780e92ep+0,
+0xc.4d2c620ee206p-4,
+0x1.89a58c41dc40cp+0,
+0x3.134b1883b8818p+0,
+-0x2.1e8a4099a4324p-4,
+-0x4.3d14813348648p-4,
+-0x8.7a29026690c9p-4,
+-0x1.0f45204cd2192p+0,
+-0x2.1e8a4099a4324p+0,
+0x2.0b6a53ddc2e18p+0,
+-0x2.31aa2d558583p+0,
+0x1.e52a7a6600402p+0,
+-0x2.7e29e0450ac5cp+0,
+0x1.4c2b1486f5ba8p+0,
+0x2.9856290deb75p+0,
+-0x1.17d282f5345bfp+0,
+-0x2.2fa505ea68b7ep+0,
+0x1.e934c93c39d64p+0,
+-0x2.761542989799ap+0,
+0x1.5c544fdfdc12fp+0,
+0x2.b8a89fbfb825ep+0,
+-0xd.72d95919afa58p-4,
+-0x1.ae5b2b2335f4bp+0,
+0x2.ebc87eca9f5cap+0,
+-0x7.0edd77bcc8ccp-4,
+-0xe.1dbaef799198p-4,
+-0x1.c3b75def3233p+0,
+0x2.c1101932a6e02p+0,
+-0xc.65ea2abbd85fp-4,
+-0x1.8cbd45577b0bep+0,
+-0x3.197a8aaef617cp+0,
+0x1.589bfb31f1687p-4,
+0x2.b137f663e2d0ep-4,
+0x5.626fecc7c5a1cp-4,
+0xa.c4dfd98f8b438p-4,
+};
+
+/* return h such that x - pi/4 mod (2*pi) ~ h */
+static double
+reduce_mod_twopi (double x)
+{
+  double sign = 1, h;
+  if (x < 0)
+    {
+      x = -x;
+      sign = -1;
+    }
+  h = 0; /* invariant is x+h mod (2*pi) */
+  while (x >= 4)
+    {
+      int i = ilogb (x);
+      x -= ldexp (1.0f, i);
+      h += T[i];
+    }
+  /* add the remainder x */
+  h += x;
+  /* subtract pi/4 */
+  double piover4 = 0xc.90fdaa22168cp-4;
+  h -= piover4;
+  /* reduce mod 2*pi */
+  double twopi = 0x6.487ed5110b46p+0;
+  while (fabs (h) > twopi * 0.5)
+    {
+      if (h > 0)
+        h -= twopi;
+      else
+        h += twopi;
+    }
+  return sign * h;
+}
+
+/* formula page 5 of https://www.cl.cam.ac.uk/~jrh13/papers/bessel.pdf:
+   y0(x) ~ sqrt(2/(pi*x))*beta0(x)*sin(x-pi/4-alpha0(x))
+   where beta0(x) = 1 - 16/x^2 + 53/(512*x^4)
+   and alpha0(x) = 1/(8*x) - 25/(384*x^3) */
+static float
+y0f_asympt (float x)
+{
+  /* the following code fails to give an error <= 9 ulps in only two cases,
+     for which we tabulate the correctly-rounded result */
+  if (x == 0x1.bfad96p+7f)
+    return -0x7.f32bdp-32f;
+  if (x == 0x1.2e2a42p+17f)
+    return 0x1.a48974p-40f;
+  double y = 1.0 / (double) x;
+  double y2 = y * y;
+  double beta0 = 1.0f + y2 * (-0x1p-4f + 0x1.a8p-4 * y2);
+  double alpha0 = y * (0x2p-4 - 0x1.0aaaaap-4 * y2);
+  double h;
+  h = reduce_mod_twopi ((double) x);
+  /* subtract alpha0 */
+  h -= alpha0;
+  /* now reduce mod pi/2 */
+  double piover2 = 0x1.921fb54442d18p+0;
+  int n = 0;
+  while (fabs (h) > piover2 / 2)
+    {
+      if (h > 0)
+        {
+          h -= piover2;
+          n ++;
+        }
+      else
+        {
+          h += piover2;
+          n --;
+        }
+    }
+  /* now x - pi/4 - alpha0 = h + n*pi/2 mod (2*pi) */
+  float xr = (float) h;
+  n = n & 3;
+  float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */
+  float t = cst / sqrtf (x) * (float) beta0;
+  if (n == 0)
+    return t * sinf (xr);
+  else if (n == 2) /* sin(x+pi) = -sin(x) */
+    return -t * sinf (xr);
+  else if (n == 1) /* sin(x+pi/2) = cos(x) */
+    return t * cosf (xr);
+  else             /* sin(x+3pi/2) = -cos(x) */
+    return -t * cosf (xr);
+}
+
+/* Special code for x near a root of y0.
+   z is the value computed by the generic code.
+   For small x, we use a polynomial approximating y0 around its root.
+   For large x, we use an asymptotic formula (y0f_asympt). */
+static float
+y0f_near_root (float x, float z)
+{
+  float index_f;
+  int index;
+  index_f = roundf ((x - FIRST_ZERO_Y0) / (float) M_PI);
+  if (index_f >= SMALL_SIZE)
+    return y0f_asympt (x);
+  index = (int) index_f;
+  float *p = Py[index];
+  float x0 = p[0];
+  float x1 = p[2];
+  /* if we are not in the interval [x0,x1] around xmid, we return the value z */
+  if (! (x0 <= x && x <= x1))
+    return z;
+  float xmid = p[1];
+  float y = x - xmid;
+  /* for degree 0 we use a degree-4 polynomial, where the coefficient of degree 4
+     is hard-coded here */
+  float p6 = (index > 0) ? p[6] : p[6] + y * -0x3.9ce95p-4;
+  return p[3] + y * (p[4] + y * (p[5] + y * p6));
+}
+
 float
 __ieee754_y0f(float x)
 {
@@ -124,7 +453,8 @@  __ieee754_y0f(float x)
 	if(ix>=0x7f800000) return  one/(x+x*x);
 	if(ix==0) return -1/zero; /* -inf and divide by zero exception.  */
 	if(hx<0) return zero/(zero*x);
-	if(ix >= 0x40000000) {  /* |x| >= 2.0 */
+	if(ix >= 0x40000000 || (0x3f5bd613 <= ix && ix <= 0x3f6e0897)) {
+          /* |x| >= 2.0 or 0x1.b7ac26p-1 <= |x| <= 0x1.dc112ep-1 (around 1st zero) */
 	/* y0(x) = sqrt(2/(pi*x))*(p0(x)*sin(x0)+q0(x)*cos(x0))
 	 * where x0 = x-pi/4
 	 *      Better formula:
@@ -143,17 +473,23 @@  __ieee754_y0f(float x)
 	 * j0(x) = 1/sqrt(pi) * (P(0,x)*cc - Q(0,x)*ss) / sqrt(x)
 	 * y0(x) = 1/sqrt(pi) * (P(0,x)*ss + Q(0,x)*cc) / sqrt(x)
 	 */
-		if(ix<0x7f000000) {  /* make sure x+x not overflow */
-		    z = -__cosf(x+x);
-		    if ((s*c)<zero) cc = z/ss;
-		    else            ss = z/cc;
-		}
-		if(ix>0x5c000000) z = (invsqrtpi*ss)/sqrtf(x);
-		else {
-		    u = pzerof(x); v = qzerof(x);
-		    z = invsqrtpi*(u*ss+v*cc)/sqrtf(x);
-		}
-		return z;
+                if (x >= 0x7f000000) /* x >= 2^127: use asymptotic expansion */
+                  return y0f_asympt (x);
+                /* now we are sure that x+x cannot overflow */
+                z = -__cosf(x+x);
+                if ((s*c)<zero) cc = z/ss;
+                else            ss = z/cc;
+                if (ix <= 0x5c000000)
+                  {
+                    u = pzerof(x); v = qzerof(x);
+                    ss = u*ss+v*cc;
+                  }
+                z = (invsqrtpi*ss)/sqrtf(x);
+                float magic = 0x1.8p+20; /* 2^20 + 2^19 */
+                if (magic + ss != magic) /* most likely */
+                  return z;
+                else /* |cc| <= 2^-4 */
+                  return y0f_near_root (x, z);
 	}
 	if(ix<=0x39800000) {	/* x < 2**-13 */
 	    return(u00 + tpi*__ieee754_logf(x));
@@ -165,7 +501,7 @@  __ieee754_y0f(float x)
 }
 libm_alias_finite (__ieee754_y0f, __y0f)
 
-/* The asymptotic expansions of pzero is
+/* The asymptotic expansion of pzero is
  *	1 - 9/128 s^2 + 11025/98304 s^4 - ...,	where s = 1/x.
  * For x >= 2, We approximate pzero by
  *	pzero(x) = 1 + (R/S)
@@ -257,7 +593,7 @@  pzerof(float x)
 }
 
 
-/* For x >= 8, the asymptotic expansions of qzero is
+/* For x >= 8, the asymptotic expansion of qzero is
  *	-1/8 s + 75/1024 s^3 - ..., where s = 1/x.
  * We approximate pzero by
  *	qzero(x) = s*(-1.25 + (R/S))
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index 633d2ab8e4..a6b749e9b2 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -1748,10 +1748,10 @@  float128: 4
 ldouble: 5
 
 Function: "y0":
-double: 3
-float: 8
-float128: 3
-ldouble: 1
+double: 8
+float: 9
+float128: 4
+ldouble: 2
 
 Function: "y0_downward":
 double: 3
@@ -1760,15 +1760,15 @@  float128: 4
 ldouble: 5
 
 Function: "y0_towardzero":
-double: 3
+double: 5
 float: 3
 float128: 3
-ldouble: 6
+ldouble: 7
 
 Function: "y0_upward":
 double: 3
 float: 6
-float128: 3
+float128: 7
 ldouble: 5
 
 Function: "y1":