Patchwork Improves __ieee754_exp() performance by greater than 5x on sparc/x86.

login
register
mail settings
Submitter Patrick McGehearty
Date Oct. 16, 2017, 4:56 p.m.
Message ID <1508172962-97543-1-git-send-email-patrick.mcgehearty@oracle.com>
Download mbox | patch
Permalink /patch/23617/
State New
Headers show

Comments

Patrick McGehearty - Oct. 16, 2017, 4:56 p.m.
modified file:   sysdeps/ieee754/dbl-64/e_exp.c

These changes will be active for all platforms that don't provide
their own exp() routines. They will also be active for the ieee754
versions of ccos, ccosh, cosh, csin, csinh, sinh, exp10, gamma, and
erf which call __ieee754_exp() directly or indirectly.

Typical performance gains as measured on Sparc s7 testing common
values between exp(1) and exp(40) is typically around 5x.

Using the glibc perf tests on sparc and x86_64,
      sparc (nsec)    x86 (nsec)
      old     new     old     new
max   17629   400    5173     802
min     399    64      15      15
mean   5317   211    1349      29

The extreme max times for the old (ieee754) exp are due to the
multiprecision computation in the old algorithm when the true value is
very near 0.5 ulp away from an value representable in double
precision. The new algorithm does not take special measures for those
cases. The current glibc exp perf tests overrepresent those values.
Informal testing suggests approximately one in 200 cases might
invoke the high cost computation. The performance advantage of the new
algorithm for other values is still large but not as large as indicated
by the chart above.

Glibc correctness tests for exp() and expf() were run on sparc and x86_64.
The results match on both platforms. Within the test suite,
3 input values were found to cause 1 bit differences (ulp)
when "FE_TONEAREST" rounding mode is set. No differences were
seen for the tested values for the other rounding modes.
Typical example:
exp(-0x1.760cd2p+0)  (-1.46113312244415283203125)
 new code:    2.31973271630014299393707e-01   0x1.db14cd799387ap-3
 old code:    2.31973271630014271638132e-01   0x1.db14cd7993879p-3
    exp    =  2.31973271630014285508337 (high precision)
Old delta: off by 0.49 ulp
New delta: off by 0.51 ulp

In addition, because ieee754_exp() is used by other routines, cexp()
showed test results with very small imaginary input values where the
imaginary portion of the result was off by 3 ulp when in upward
rounding mode, but not in the other rounding modes.
---
 sysdeps/ieee754/dbl-64/e_exp.c |  618 +++++++++++++++++++++++++++-------------
 1 files changed, 416 insertions(+), 202 deletions(-)
Joseph Myers - Oct. 18, 2017, 5:22 p.m.
On Mon, 16 Oct 2017, Patrick McGehearty wrote:

> diff --git a/sysdeps/ieee754/dbl-64/e_exp.c b/sysdeps/ieee754/dbl-64/e_exp.c
> index 6757a14..555a47f 100644
> --- a/sysdeps/ieee754/dbl-64/e_exp.c
> +++ b/sysdeps/ieee754/dbl-64/e_exp.c
> @@ -1,238 +1,452 @@
> +/* EXP function - Compute double precision exponential
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
>  /*
> - * IBM Accurate Mathematical Library
> - * written by International Business Machines Corp.
> - * Copyright (C) 2001-2017 Free Software Foundation, Inc.

The file still contains __exp1, used by pow.  Thus, I do not think 
removing the existing copyright dates is appropriate unless as a 
preliminary patch you move __exp1 to another file (e_pow.c being the 
obvious one; watch out for the sysdeps/x86_64/fpu/multiarch/e_exp-*.c with 
their own macro definitions of __exp1 that would no longer be appropriate 
after such a move).

Does this patch remove all calls to __slowexp?  If so, I'd expect 
slowexp.c to be removed as part of the patch (including 
architecture-specific / multiarch variants, and Makefile references to 
special options etc. for building slowexp.c).


> +extern double __ieee754_exp (double);
> +
> +
> +static const double TBL[] = {
> +  1.00000000000000000000e+00, 0.00000000000000000000e+00,
> +  1.02189714865411662714e+00, 5.10922502897344389359e-17,
> +  1.04427378242741375480e+00, 8.55188970553796365958e-17,

As per previous comments, this sort of table of precomputed values should 
use hex float constants.  You can use before-and-after comparison of 
object files to make sure the hex floats do represent the same values as 
the decimal constants.

> +static const double
> +  half		=0.5,
> +/* Following three values used to scale x to primary range */
> +  invln2_32	=4.61662413084468283841e+01,	/* 0x40471547, 0x652b82fe */
> +  ln2_32hi	=2.16608493865351192653e-02,	/* 0x3f962e42, 0xfee00000 */
> +  ln2_32lo	=5.96317165397058656257e-12,	/* 0x3d9a39ef, 0x35793c76 */
> +/* t2-t5 terms used for polynomial computation */
> +  t2		=1.6666666666526086527e-1,	/* 3fc5555555548f7c */
> +  t3		=4.1666666666226079285e-2,	/* 3fa5555555545d4e */
> +  t4		=8.3333679843421958056e-3,	/* 3f811115b7aa905e */
> +  t5		=1.3888949086377719040e-3,	/* 3f56c1728d739765 */
> +  one		=1.0,
> +/* maximum value for x to not overflow */
> +  threshold1	=7.09782712893383973096e+02,	/* 0x40862E42, 0xFEFA39EF */
> +/* maximum value for -x to not underflow */
> +  threshold2	=7.45133219101941108420e+02,	/* 0x40874910, 0xD52D3051 */
> +/* scaling factor used when result near zero*/
> +  twom54	=5.55111512312578270212e-17,	/* 0x3c900000, 0x00000000 */
> +/* value used to force inexact condition */
> +  small		=1.0e-100;

Likewise hex floats for these values (other than half / one / small).  
Also, the formatting is far from GNU standard; I'd expect e.g.:

/* Scaling factor used when result near zero.  */
static const double twom54 = 0x1p-54;

(single space either side of =, comment before each variable definition 
starting with a capital letter and ending with ".  ").

> +	  /* 
> +	     Use FE_TONEAREST rounding mode for computing yy.y 
> +	     Avoid set/reset of rounding mode if already in FE_TONEAREST mode
> +	  */
> +	  fe_val = __fegetround();
> +	  if (fe_val == FE_TONEAREST) {
> +	    t = xx.x * xx.x;
> +	    yy.y = xx.x + (t * (half + xx.x * t2) +
> +			   (t * t) * (t3 + xx.x * t4 + t * t5));
> +	    retval = one + yy.y;
> +	  } else {
> +	    __fesetround(FE_TONEAREST);
> +	    t = xx.x * xx.x;
> +	    yy.y = xx.x + (t * (half + xx.x * t2) +
> +			   (t * t) * (t3 + xx.x * t4 + t * t5));
> +	    retval = one + yy.y;
> +	    __fesetround(fe_val);

The formatting here is off, missing space before '(' in function calls.  
And you should be using the optimized SET_RESTORE_ROUND (FE_TONEAREST), 
which in addition to avoiding setting the rounding mode to a value it 
already has, also e.g. arranges on x86_64 to set only the SSE rounding 
mode and not the x87 mode because only the SSE mode needs setting for 
types other than long double.

(I'm not clear why you actually need to set the rounding mode here.  Does 
excessive inaccuracy result in this particular case if you don't set it?)

> +      fe_val = __fegetround();

Likewise again, and subsequently.

> +      if (fe_val == FE_TONEAREST) {
> +	z = xx.x - TBL2[j];
> +	t = z * z;
> +	/* the "small" term below guarantees inexact will be raised */

Guaranteeing "inexact" is not part of the goals for most libm functions, 
so I expect you can remove that term.

> +      if (-xx.x > threshold2)
> +	{			/* set underflow error condition */
> +	  double force_underflow = tiny * tiny;
> +	  math_force_eval (force_underflow);
> +	  retval = zero;
> +	  return retval;

I'd expect the force_underflow value to be returned in this case (so the 
return value is appropriate in round-upward mode).
Joseph Myers - Oct. 18, 2017, 11:22 p.m.
On Wed, 18 Oct 2017, Joseph Myers wrote:

> Does this patch remove all calls to __slowexp?  If so, I'd expect 
> slowexp.c to be removed as part of the patch (including 
> architecture-specific / multiarch variants, and Makefile references to 
> special options etc. for building slowexp.c).

Just to add: one of the less obvious places needing updating on removing 
slowexp.c is the manual/probes.texi documentation of the probe points 
therein.
Patrick McGehearty - Oct. 19, 2017, 10:31 p.m.
On 10/18/2017 12:22 PM, Joseph Myers wrote:

Thank you Joseph for your detailed review.
Comments below.
> On Mon, 16 Oct 2017, Patrick McGehearty wrote:
>
>> diff --git a/sysdeps/ieee754/dbl-64/e_exp.c b/sysdeps/ieee754/dbl-64/e_exp.c
>> index 6757a14..555a47f 100644
>> --- a/sysdeps/ieee754/dbl-64/e_exp.c
>> +++ b/sysdeps/ieee754/dbl-64/e_exp.c
>> @@ -1,238 +1,452 @@
>> +/* EXP function - Compute double precision exponential
>> +   Copyright (C) 2017 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>>   /*
>> - * IBM Accurate Mathematical Library
>> - * written by International Business Machines Corp.
>> - * Copyright (C) 2001-2017 Free Software Foundation, Inc.
> The file still contains __exp1, used by pow.  Thus, I do not think
> removing the existing copyright dates is appropriate unless as a
> preliminary patch you move __exp1 to another file (e_pow.c being the
> obvious one; watch out for the sysdeps/x86_64/fpu/multiarch/e_exp-*.c with
> their own macro definitions of __exp1 that would no longer be appropriate
> after such a move).
For the copyright issue, would it be appropriate to move the
previous copyright notice to right before __exp1?
I'm reluctant to move __exp1 as that might also require changes
to Makefiles which are not currently required.


>
> Does this patch remove all calls to __slowexp?  If so, I'd expect
> slowexp.c to be removed as part of the patch (including
> architecture-specific / multiarch variants, and Makefile references to
> special options etc. for building slowexp.c).
For slowexp, I have some reservations that removing slowexp.o
might cause older object modules to break due to the missing
reference. I know they should not be directly referencing
an internal function, but still...
Anyway, I can't find any other direct usage of __slowexp.
I will remove all references to __slowexp and see
if anything breaks to show I missed a reference.

I find the following files have references, including some more
files to be removed.

sysdeps/generic/math_private.h

manual/probes.texi - simply remove references to slowexp_p6 and slowexp_p32
math/Makefile - remove slowexp references
sysdeps/generic/math_private.h
sysdeps/ieee754/dbl-64/e_exp.c   (removed in the new code)
sysdeps/ieee754/dbl-64/slowexp.c (file to be removed)
sysdeps/ieee754/dbl-64/e_pow.c:  (comment only)

The following include ieee754/e_exp.c; can remove references to slowexp
sysdeps/x86_64/fpu/multiarch/e_exp-avx.c
sysdeps/x86_64/fpu/multiarch/e_exp-fma.c
sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c

The following include ieee754/dbl-64/slowexp.c; no longer needed
sysdeps/x86_64/fpu/multiarch/slowexp-avx.c
sysdeps/x86_64/fpu/multiarch/slowexp-fma.c
sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c

sysdeps/x86_64/fpu/multiarch/Makefile - remove slowexp references


benchtests/strcol1-inputs/filelist#C and filelist#en_US.UTF-8
have references to slowexp*.c
I'm supposing those should also be removed.

>
>
>> +extern double __ieee754_exp (double);
>> +
>> +
>> +static const double TBL[] = {
>> +  1.00000000000000000000e+00, 0.00000000000000000000e+00,
>> +  1.02189714865411662714e+00, 5.10922502897344389359e-17,
>> +  1.04427378242741375480e+00, 8.55188970553796365958e-17,
> As per previous comments, this sort of table of precomputed values should
> use hex float constants.  You can use before-and-after comparison of
> object files to make sure the hex floats do represent the same values as
> the decimal constants.
>
>> +static const double
>> +  half		=0.5,
>> +/* Following three values used to scale x to primary range */
>> +  invln2_32	=4.61662413084468283841e+01,	/* 0x40471547, 0x652b82fe */
>> +  ln2_32hi	=2.16608493865351192653e-02,	/* 0x3f962e42, 0xfee00000 */
>> +  ln2_32lo	=5.96317165397058656257e-12,	/* 0x3d9a39ef, 0x35793c76 */
>> +/* t2-t5 terms used for polynomial computation */
>> +  t2		=1.6666666666526086527e-1,	/* 3fc5555555548f7c */
>> +  t3		=4.1666666666226079285e-2,	/* 3fa5555555545d4e */
>> +  t4		=8.3333679843421958056e-3,	/* 3f811115b7aa905e */
>> +  t5		=1.3888949086377719040e-3,	/* 3f56c1728d739765 */
>> +  one		=1.0,
>> +/* maximum value for x to not overflow */
>> +  threshold1	=7.09782712893383973096e+02,	/* 0x40862E42, 0xFEFA39EF */
>> +/* maximum value for -x to not underflow */
>> +  threshold2	=7.45133219101941108420e+02,	/* 0x40874910, 0xD52D3051 */
>> +/* scaling factor used when result near zero*/
>> +  twom54	=5.55111512312578270212e-17,	/* 0x3c900000, 0x00000000 */
>> +/* value used to force inexact condition */
>> +  small		=1.0e-100;
> Likewise hex floats for these values (other than half / one / small).
> Also, the formatting is far from GNU standard; I'd expect e.g.:
>
> /* Scaling factor used when result near zero.  */
> static const double twom54 = 0x1p-54;
>
> (single space either side of =, comment before each variable definition
> starting with a capital letter and ending with ".  ").

Table of hex float constants. I can readily adjust the formating. What
you see is the formating used in the original source.
I've been uncomfortable with hex floats approach
as it only works for ieee754 representations
that use base 2. I admit that is most current machines.
And the prior ieee754 exp table uses hex format.
My second reason for resisting the change is my philosophy
when porting code is that every change without a good reason
is an opportunity to introduce errors without corresponding benefit.

If you feel strongly about suing hex constants, I will make an effort
to convert these values to hex format.  This conversion seems likely
to require the most effort on my part.


>
>> +	  /*
>> +	     Use FE_TONEAREST rounding mode for computing yy.y
>> +	     Avoid set/reset of rounding mode if already in FE_TONEAREST mode
>> +	  */
>> +	  fe_val = __fegetround();
>> +	  if (fe_val == FE_TONEAREST) {
>> +	    t = xx.x * xx.x;
>> +	    yy.y = xx.x + (t * (half + xx.x * t2) +
>> +			   (t * t) * (t3 + xx.x * t4 + t * t5));
>> +	    retval = one + yy.y;
>> +	  } else {
>> +	    __fesetround(FE_TONEAREST);
>> +	    t = xx.x * xx.x;
>> +	    yy.y = xx.x + (t * (half + xx.x * t2) +
>> +			   (t * t) * (t3 + xx.x * t4 + t * t5));
>> +	    retval = one + yy.y;
>> +	    __fesetround(fe_val);
> The formatting here is off, missing space before '(' in function calls.
> And you should be using the optimized SET_RESTORE_ROUND (FE_TONEAREST),
> which in addition to avoiding setting the rounding mode to a value it
> already has, also e.g. arranges on x86_64 to set only the SSE rounding
> mode and not the x87 mode because only the SSE mode needs setting for
> types other than long double.
>
> (I'm not clear why you actually need to set the rounding mode here.  Does
> excessive inaccuracy result in this particular case if you don't set it?)
>
>> +      fe_val = __fegetround();
Failing to use __fegetround(),__fesetround() causes over 40 math test
accuracy failures for other rounding modes in exp, cexp, cpow, cosh,
ccos, ccosh, csin, and csinh.  If the glibc/Linux community were to
declare that the only rounding mode that was fully supported was
FE_TONEAREST, we could simplify/speed up a lot of code. :-)
If I were wearing a benchmarker's hat, I could argue for that
approach. As a math librarian developer, I felt compelled to add the
rounding mode tests to maintain accuracy for the other rounding modes
in spite of the performance cost.

While SET_RESTORE_ROUND is reasonably optimized, it is not ideal,
especially on Sparc. Empirically, using SET_RESTORE_ROUND adds more
than twice the overhead compared to using __fegetround() on Sparc.
Interestingly, for x86, the two approaches have similar performance
perhaps due to more attention paid to x86 optimization with gcc over
the years.

Underneath the definitions of SET_RESTORE_ROUND, it ultimately relies
on __fegetround() and __fesetround(). The extra cost for
SET_RESTORE_ROUND is that it requires a flag to always be set (mode
changed or did not change). A short time the flag must tested to
determine if the rounding mode needs to be restored. If the compiler
puts that flag on the stack or in memory, the reading of a value that
was just written to cache triggers a "Read after Write" HW hazard,
causing a typical delay of 30-40 cycles. Avoiding the test also
avoids a possible mis-predicted branch.  For M7 and earlier, Sparc
branch prediction is not ideal and mis-prediction is expensive.  The
code I provided avoids the need to set the flag or test it by
duplicating small segments of code for each case.


> Likewise again, and subsequently.
>
>> +      if (fe_val == FE_TONEAREST) {
>> +	z = xx.x - TBL2[j];
>> +	t = z * z;
>> +	/* the "small" term below guarantees inexact will be raised */
> Guaranteeing "inexact" is not part of the goals for most libm functions,
> so I expect you can remove that term.
The "inexact" test was required to pass the (make check) math lib tests.

>
>> +      if (-xx.x > threshold2)
>> +	{			/* set underflow error condition */
>> +	  double force_underflow = tiny * tiny;
>> +	  math_force_eval (force_underflow);
>> +	  retval = zero;
>> +	  return retval;
> I'd expect the force_underflow value to be returned in this case (so the
> return value is appropriate in round-upward mode).
When -xx.x > threshold2, e**x = 0.  I'll investigate when/whether 
round-upward
expects the ulp to be 1 instead of zero. My first impression is that you
are right, but I want to think about it a bit more.
Joseph Myers - Oct. 19, 2017, 10:48 p.m.
On Thu, 19 Oct 2017, Patrick McGehearty wrote:

> For the copyright issue, would it be appropriate to move the
> previous copyright notice to right before __exp1?
> I'm reluctant to move __exp1 as that might also require changes
> to Makefiles which are not currently required.

There should be exactly one copyright notice with a given copyright holder 
in a given source file, so only one set of dates (2001-2017) given.  The 
"IBM Accurate Mathematical Library" comment might be updated to describe 
what parts of the file come from where.

> For slowexp, I have some reservations that removing slowexp.o
> might cause older object modules to break due to the missing
> reference. I know they should not be directly referencing
> an internal function, but still...
> Anyway, I can't find any other direct usage of __slowexp.
> I will remove all references to __slowexp and see
> if anything breaks to show I missed a reference.
> 
> I find the following files have references, including some more
> files to be removed.

This list seems to be missing (at least) 
sysdeps/powerpc/power4/fpu/Makefile (CPPFLAGS-slowexp.c setting).

> benchtests/strcol1-inputs/filelist#C and filelist#en_US.UTF-8
> have references to slowexp*.c
> I'm supposing those should also be removed.

I know some people do edit this benchmark input when removing files, but I 
think that's inappropriate; it's just a test input that happens to be 
based on a list of files in some version of glibc, there is no need for it 
to correspond to current glibc, and in the interests of comparability of 
benchmark results it would be best for it never to change.

> I've been uncomfortable with hex floats approach
> as it only works for ieee754 representations
> that use base 2. I admit that is most current machines.

This file is in a directory for such a binary format, and it's tuned for 
that particular format (regarding polynomial size chosen, etc.).  Any code 
written for decimal formats would naturally use decimal constants, but in 
code written for binary formats, hex constants are appropriate for such 
precomputed constants to make clear exactly what value / representation is 
intended.  (And in the 0x1p-54 case, it makes the code a lot more readable 
to put that in hex.)

> Underneath the definitions of SET_RESTORE_ROUND, it ultimately relies
> on __fegetround() and __fesetround(). The extra cost for
> SET_RESTORE_ROUND is that it requires a flag to always be set (mode
> changed or did not change). A short time the flag must tested to
> determine if the rounding mode needs to be restored. If the compiler
> puts that flag on the stack or in memory, the reading of a value that
> was just written to cache triggers a "Read after Write" HW hazard,
> causing a typical delay of 30-40 cycles. Avoiding the test also
> avoids a possible mis-predicted branch.  For M7 and earlier, Sparc
> branch prediction is not ideal and mis-prediction is expensive.  The
> code I provided avoids the need to set the flag or test it by
> duplicating small segments of code for each case.

Well, there are lots of lower-level interfaces such as libc_fesetround 
that could be used if appropriate.  Maybe additional such interfaces need 
to be added to support such cases of duplicating code.  It would seem 
natural to start with the existing interfaces (SET_RESTORE_ROUND), with a 
view to a followup possibly then adding further optimizations, just as the 
expf changes started by adding something wrapped by the existing wrapper, 
then arranged to avoid a wrapper in subsequent patches in the series.

> > Guaranteeing "inexact" is not part of the goals for most libm functions,
> > so I expect you can remove that term.
> The "inexact" test was required to pass the (make check) math lib tests.

You'll need to explain more.  For functions which are not fully defined by 
a binding to IEEE operations, both spurious and missing "inexact" should 
be allowed by the testsuite.
Szabolcs Nagy - Oct. 20, 2017, 11:41 a.m.
On 19/10/17 23:31, Patrick McGehearty wrote:
> Table of hex float constants. I can readily adjust the formating. What
> you see is the formating used in the original source.
> I've been uncomfortable with hex floats approach
> as it only works for ieee754 representations
> that use base 2. I admit that is most current machines.

your entire algorithm depends on ieee754 binary representation,
that's not a good reason for avoiding hexfloats.

decimal floats are not even required to be correctly rounded by
the compiler in iso c, they are only faithfully rounded, so
this is a portability bug in the original source too, you can
silently get completely wrong code generation because of it.

> And the prior ieee754 exp table uses hex format.
> My second reason for resisting the change is my philosophy
> when porting code is that every change without a good reason
> is an opportunity to introduce errors without corresponding benefit.
> 
> If you feel strongly about suing hex constants, I will make an effort
> to convert these values to hex format.  This conversion seems likely
> to require the most effort on my part.
>
Patrick McGehearty - Oct. 20, 2017, 2:56 p.m.
On 10/20/2017 6:41 AM, Szabolcs Nagy wrote:
> On 19/10/17 23:31, Patrick McGehearty wrote:
>> Table of hex float constants. I can readily adjust the formating. What
>> you see is the formating used in the original source.
>> I've been uncomfortable with hex floats approach
>> as it only works for ieee754 representations
>> that use base 2. I admit that is most current machines.
> your entire algorithm depends on ieee754 binary representation,
> that's not a good reason for avoiding hexfloats.
>
> decimal floats are not even required to be correctly rounded by
> the compiler in iso c, they are only faithfully rounded, so
> this is a portability bug in the original source too, you can
> silently get completely wrong code generation because of it.
>
>> And the prior ieee754 exp table uses hex format.
>> My second reason for resisting the change is my philosophy
>> when porting code is that every change without a good reason
>> is an opportunity to introduce errors without corresponding benefit.
>>
>> If you feel strongly about suing hex constants, I will make an effort
>> to convert these values to hex format.  This conversion seems likely
>> to require the most effort on my part.
>>
You make good points. In the prior context of the code being built
and used for Sparc/Solaris, the Studio compiler was known to
correctly round floating point constants since the early 90's.
In the gnu context, we have to be prepared for any C standard
compliant compiler, which is a greater challenge.
I'll make the conversion to using hex constants. I'm wondering
if it would be beneficial to retain the decimal constants in comment form?
Maybe not for the big table, but for the constants used for
specific thresholds, etc.  I know when I first looked at
some of this code, I wrote a little program to display the
decimal equivalents of some of the hex constants to better understand
what the control flow was doing.

- patrick
Patrick McGehearty - Oct. 20, 2017, 3:04 p.m.
On 10/19/2017 5:48 PM, Joseph Myers wrote:
> On Thu, 19 Oct 2017, Patrick McGehearty wrote:
>
>> For the copyright issue, would it be appropriate to move the
>> previous copyright notice to right before __exp1?
>> I'm reluctant to move __exp1 as that might also require changes
>> to Makefiles which are not currently required.
> There should be exactly one copyright notice with a given copyright holder
> in a given source file, so only one set of dates (2001-2017) given.  The
> "IBM Accurate Mathematical Library" comment might be updated to describe
> what parts of the file come from where.
I'll see what I can do.
>> For slowexp, I have some reservations that removing slowexp.o
>> might cause older object modules to break due to the missing
>> reference. I know they should not be directly referencing
>> an internal function, but still...
>> Anyway, I can't find any other direct usage of __slowexp.
>> I will remove all references to __slowexp and see
>> if anything breaks to show I missed a reference.
>>
>> I find the following files have references, including some more
>> files to be removed.
> This list seems to be missing (at least)
> sysdeps/powerpc/power4/fpu/Makefile (CPPFLAGS-slowexp.c setting).
Thanks. I missed that in my grep review.
>> benchtests/strcol1-inputs/filelist#C and filelist#en_US.UTF-8
>> have references to slowexp*.c
>> I'm supposing those should also be removed.
> I know some people do edit this benchmark input when removing files, but I
> think that's inappropriate; it's just a test input that happens to be
> based on a list of files in some version of glibc, there is no need for it
> to correspond to current glibc, and in the interests of comparability of
> benchmark results it would be best for it never to change.
Ok, I'll leave those alone.
>> I've been uncomfortable with hex floats approach
>> as it only works for ieee754 representations
>> that use base 2. I admit that is most current machines.
> This file is in a directory for such a binary format, and it's tuned for
> that particular format (regarding polynomial size chosen, etc.).  Any code
> written for decimal formats would naturally use decimal constants, but in
> code written for binary formats, hex constants are appropriate for such
> precomputed constants to make clear exactly what value / representation is
> intended.  (And in the 0x1p-54 case, it makes the code a lot more readable
> to put that in hex.)
As noted in other email, I'll do the hex conversion.
>> Underneath the definitions of SET_RESTORE_ROUND, it ultimately relies
>> on __fegetround() and __fesetround(). The extra cost for
>> SET_RESTORE_ROUND is that it requires a flag to always be set (mode
>> changed or did not change). A short time the flag must tested to
>> determine if the rounding mode needs to be restored. If the compiler
>> puts that flag on the stack or in memory, the reading of a value that
>> was just written to cache triggers a "Read after Write" HW hazard,
>> causing a typical delay of 30-40 cycles. Avoiding the test also
>> avoids a possible mis-predicted branch.  For M7 and earlier, Sparc
>> branch prediction is not ideal and mis-prediction is expensive.  The
>> code I provided avoids the need to set the flag or test it by
>> duplicating small segments of code for each case.
> Well, there are lots of lower-level interfaces such as libc_fesetround
> that could be used if appropriate.  Maybe additional such interfaces need
> to be added to support such cases of duplicating code.  It would seem
> natural to start with the existing interfaces (SET_RESTORE_ROUND), with a
> view to a followup possibly then adding further optimizations, just as the
> expf changes started by adding something wrapped by the existing wrapper,
> then arranged to avoid a wrapper in subsequent patches in the series.
I'll test Wilco's suggestion of using get_rounding_mode and libc_fesetround.
>>> Guaranteeing "inexact" is not part of the goals for most libm functions,
>>> so I expect you can remove that term.
>> The "inexact" test was required to pass the (make check) math lib tests.
> You'll need to explain more.  For functions which are not fully defined by
> a binding to IEEE operations, both spurious and missing "inexact" should
> be allowed by the testsuite.
>
It was months ago that I made the inexact modifications. It was part
of dealing with failure to report underflow/overflow/NAN conditions.
I've forgotten the exact details.
I'll remove the inexact code and report what test failures are seen if any.

- patrick
Joseph Myers - Oct. 20, 2017, 4:10 p.m.
On Fri, 20 Oct 2017, Szabolcs Nagy wrote:

> decimal floats are not even required to be correctly rounded by
> the compiler in iso c, they are only faithfully rounded, so
> this is a portability bug in the original source too, you can
> silently get completely wrong code generation because of it.

GCC 4.9 (the minimum version for building glibc) and later correctly round 
decimal floating-point constants for binary formats (see GCC bug 21718).  
Before that we did at least once have an issue with a decimal constant 
where someone had computed a value exactly half way between two 
representable values and different compilers rounded it differently 
(glibc bug 14803).
Patrick McGehearty - Oct. 21, 2017, 5:23 a.m.
On 10/19/2017 5:48 PM, Joseph Myers wrote:
> On Thu, 19 Oct 2017, Patrick McGehearty wrote:
>
>
>>> Guaranteeing "inexact" is not part of the goals for most libm functions,
>>> so I expect you can remove that term.
>> The "inexact" test was required to pass the (make check) math lib tests.
> You'll need to explain more.  For functions which are not fully defined by
> a binding to IEEE operations, both spurious and missing "inexact" should
> be allowed by the testsuite.
>
When the following lines are commented out:
      double force_underflow = tiny * tiny;
      math_force_eval (force_underflow);

The following failures appear (plus repeated for other rounding modes)
Failure: exp (-0x2.c4edp+12): Exception "Underflow" not set
Failure: exp (-0x2.c5b2319c4843ap+12): Exception "Underflow" not set
Failure: exp (-0x2.c5b2319c4843cp+12): Exception "Underflow" not set
Failure: exp (-0x2.c5b234p+12): Exception "Underflow" not set
Failure: exp (-0x2.c5b23p+12): Exception "Underflow" not set
Failure: exp (-0x2.c5bd48bdc7c0cp+12): Exception "Underflow" not set
Failure: exp (-0x2.c5bd48bdc7c0ep+12): Exception "Underflow" not set
Failure: exp (-0x2.c5bd48p+12): Exception "Underflow" not set
Failure: exp (-0x2.c5bd4cp+12): Exception "Underflow" not set
Failure: exp (-0x2.ebe224p+8): Exception "Underflow" not set
Failure: exp (-0x2.ebe227861639p+8): Exception "Underflow" not set
Failure: exp (-0x2.ebe228p+8): Exception "Underflow" not set
Failure: exp (-0x4.d2p+8): Exception "Underflow" not set
Failure: exp (-0xf.ffffffffffff8p+1020): Exception "Underflow" not set
Failure: exp (-0xf.fffffp+124): Exception "Underflow" not set

The same pair of instructions are used in the old e_exp.c
to set the underflow exception, which means the new code
will match the behavior of the old code for underflows.

- patrick
Siddhesh Poyarekar - Oct. 23, 2017, 12:25 p.m.
On Monday 16 October 2017 10:26 PM, Patrick McGehearty wrote:
> The extreme max times for the old (ieee754) exp are due to the
> multiprecision computation in the old algorithm when the true value is
> very near 0.5 ulp away from an value representable in double
> precision. The new algorithm does not take special measures for those
> cases. The current glibc exp perf tests overrepresent those values.
> Informal testing suggests approximately one in 200 cases might
> invoke the high cost computation. The performance advantage of the new
> algorithm for other values is still large but not as large as indicated
> by the chart above.

The inputs were curated such that the multiple precision ones were
weeded out into separate tests to avoid seeing such deviations.  This
was based on the premise that the result precision for these inputs
would be consistent (not necessarily the same) across platforms but that
doesn't seem to be the case and some inputs seem to have sneaked in.

On a related note, if we are comfortable dropping exp slow path, we
should probably take a serious look at the log slow path too since IIRC
I wasn't even able to trigger it after days of running the test and it's
quite possible that nobody cares.  We could drop it and see if anybody
notices.

Siddhesh
Joseph Myers - Oct. 23, 2017, 12:47 p.m.
On Sat, 21 Oct 2017, Patrick McGehearty wrote:

> On 10/19/2017 5:48 PM, Joseph Myers wrote:
> > On Thu, 19 Oct 2017, Patrick McGehearty wrote:
> > 
> > 
> > > > Guaranteeing "inexact" is not part of the goals for most libm functions,
> > > > so I expect you can remove that term.
> > > The "inexact" test was required to pass the (make check) math lib tests.
> > You'll need to explain more.  For functions which are not fully defined by
> > a binding to IEEE operations, both spurious and missing "inexact" should
> > be allowed by the testsuite.
> > 
> When the following lines are commented out:
>      double force_underflow = tiny * tiny;
>      math_force_eval (force_underflow);

That's not what I was referring to.  I was referring to the comment I 
quoted, '/* the "small" term below guarantees inexact will be raised */', 
and the associated uses of the constant "small".  Inexact, not underflow.
Joseph Myers - Oct. 23, 2017, 3:58 p.m.
On Mon, 23 Oct 2017, Siddhesh Poyarekar wrote:

> On a related note, if we are comfortable dropping exp slow path, we
> should probably take a serious look at the log slow path too since IIRC
> I wasn't even able to trigger it after days of running the test and it's
> quite possible that nobody cares.  We could drop it and see if anybody
> notices.

The basis for removing a slow path in an existing implementation should be 
an error analysis of the remaining code that justifies that the slow path 
is never needed to avoid large errors.  That might be an error analysis 
that assumes the correctness of the existing code and deduces an error 
bound on the basis that larger errors would make a check for being 
correctly rounded (probably the one that gates entering the slow path in 
question) incorrect.
Patrick McGehearty - Oct. 23, 2017, 7:58 p.m.
On 10/23/2017 7:47 AM, Joseph Myers wrote:
> On Sat, 21 Oct 2017, Patrick McGehearty wrote:
>
>> On 10/19/2017 5:48 PM, Joseph Myers wrote:
>>> On Thu, 19 Oct 2017, Patrick McGehearty wrote:
>>>
>>>
>>>>> Guaranteeing "inexact" is not part of the goals for most libm functions,
>>>>> so I expect you can remove that term.
>>>> The "inexact" test was required to pass the (make check) math lib tests.
>>> You'll need to explain more.  For functions which are not fully defined by
>>> a binding to IEEE operations, both spurious and missing "inexact" should
>>> be allowed by the testsuite.
>>>
>> When the following lines are commented out:
>>       double force_underflow = tiny * tiny;
>>       math_force_eval (force_underflow);
> That's not what I was referring to.  I was referring to the comment I
> quoted, '/* the "small" term below guarantees inexact will be raised */',
> and the associated uses of the constant "small".  Inexact, not underflow.
>
Now I understand your point and researched the source of the comment and
reason for the use of "small". By my reading of the ieee754 definition 
of "inexact",
exp(x) for any x except 0.0 should set the inexact bit. The Studio version
of exp() included "small" for that reason.
The current Linux ieee754 e_exp() makes no attempt to insure inexact is set.
I can easily match that behavior without changing accuracy by removing all
uses of "small" (and associated comments).

I will do that for my next patch submission.

- patrick
Joseph Myers - Oct. 23, 2017, 9:31 p.m.
On Mon, 23 Oct 2017, Patrick McGehearty wrote:

> Now I understand your point and researched the source of the comment and 
> reason for the use of "small". By my reading of the ieee754 definition 
> of "inexact", exp(x) for any x except 0.0 should set the inexact bit.

However, while that would apply for TS 18661-4 crexp (corresponding 
directly to the IEEE 754 exp operation), glibc's accuracy goals for 
functions not bound to IEEE operations are as documented in math.texi, and 
those do not include correctness in whether "inexact" is raised.  (There 
is some existing code in glibc to set "inexact" in cases where it's not 
necessary to do so.  Removing such code would be reasonable cleanups, but 
is independent of this exp patch.)

Patch

diff --git a/sysdeps/ieee754/dbl-64/e_exp.c b/sysdeps/ieee754/dbl-64/e_exp.c
index 6757a14..555a47f 100644
--- a/sysdeps/ieee754/dbl-64/e_exp.c
+++ b/sysdeps/ieee754/dbl-64/e_exp.c
@@ -1,238 +1,452 @@ 
+/* EXP function - Compute double precision exponential
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
 /*
- * IBM Accurate Mathematical Library
- * written by International Business Machines Corp.
- * Copyright (C) 2001-2017 Free Software Foundation, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
+   exp(x)
+   Hybrid algorithm of Peter Tang's Table driven method (for large
+   arguments) and an accurate table (for small arguments).
+   Written by K.C. Ng, November 1988.
+   Method (large arguments):
+	1. Argument Reduction: given the input x, find r and integer k
+	   and j such that
+	             x = (k+j/32)*(ln2) + r,  |r| <= (1/64)*ln2
+
+	2. exp(x) = 2^k * (2^(j/32) + 2^(j/32)*expm1(r))
+	   a. expm1(r) is approximated by a polynomial:
+	      expm1(r) ~ r + t1*r^2 + t2*r^3 + ... + t5*r^6
+	      Here t1 = 1/2 exactly.
+	   b. 2^(j/32) is represented to twice double precision
+	      as TBL[2j]+TBL[2j+1].
+
+   Note: If divide were fast enough, we could use another approximation
+	 in 2.a:
+	      expm1(r) ~ (2r)/(2-R), R = r - r^2*(t1 + t2*r^2)
+	      (for the same t1 and t2 as above)
+
+   Special cases:
+	exp(INF) is INF, exp(NaN) is NaN;
+	exp(-INF)=  0;
+	for finite argument, only exp(0)=1 is exact.
+
+   Accuracy:
+	According to an error analysis, the error is always less than
+	an ulp (unit in the last place).  The largest errors observed
+	are less than 0.55 ulp for normal results and less than 0.75 ulp
+	for subnormal results.
+
+   Misc. info.
+	For IEEE double
+		if x >  7.09782712893383973096e+02 then exp(x) overflow
+		if x < -7.45133219101941108420e+02 then exp(x) underflow
  */
-/***************************************************************************/
-/*  MODULE_NAME:uexp.c                                                     */
-/*                                                                         */
-/*  FUNCTION:uexp                                                          */
-/*           exp1                                                          */
-/*                                                                         */
-/* FILES NEEDED:dla.h endian.h mpa.h mydefs.h uexp.h                       */
-/*              mpa.c mpexp.x slowexp.c                                    */
-/*                                                                         */
-/* An ultimate exp routine. Given an IEEE double machine number x          */
-/* it computes the correctly rounded (to nearest) value of e^x             */
-/* Assumption: Machine arithmetic operations are performed in              */
-/* round to nearest mode of IEEE 754 standard.                             */
-/*                                                                         */
-/***************************************************************************/
 
 #include <math.h>
+#include <math-svid-compat.h>
+#include <math_private.h>
+#include <errno.h>
 #include "endian.h"
 #include "uexp.h"
+#include "uexp.tbl"
 #include "mydefs.h"
 #include "MathLib.h"
-#include "uexp.tbl"
-#include <math_private.h>
 #include <fenv.h>
 #include <float.h>
 
-#ifndef SECTION
-# define SECTION
-#endif
+extern double __ieee754_exp (double);
+
+
+static const double TBL[] = {
+  1.00000000000000000000e+00, 0.00000000000000000000e+00,
+  1.02189714865411662714e+00, 5.10922502897344389359e-17,
+  1.04427378242741375480e+00, 8.55188970553796365958e-17,
+  1.06714040067682369717e+00, -7.89985396684158212226e-17,
+  1.09050773266525768967e+00, -3.04678207981247114697e-17,
+  1.11438674259589243221e+00, 1.04102784568455709549e-16,
+  1.13878863475669156458e+00, 8.91281267602540777782e-17,
+  1.16372485877757747552e+00, 3.82920483692409349872e-17,
+  1.18920711500272102690e+00, 3.98201523146564611098e-17,
+  1.21524735998046895524e+00, -7.71263069268148813091e-17,
+  1.24185781207348400201e+00, 4.65802759183693679123e-17,
+  1.26905095719173321989e+00, 2.66793213134218609523e-18,
+  1.29683955465100964055e+00, 2.53825027948883149593e-17,
+  1.32523664315974132322e+00, -2.85873121003886075697e-17,
+  1.35425554693689265129e+00, 7.70094837980298946162e-17,
+  1.38390988196383202258e+00, -6.77051165879478628716e-17,
+  1.41421356237309514547e+00, -9.66729331345291345105e-17,
+  1.44518080697704665027e+00, -3.02375813499398731940e-17,
+  1.47682614593949934623e+00, -3.48399455689279579579e-17,
+  1.50916442759342284141e+00, -1.01645532775429503911e-16,
+  1.54221082540794074411e+00, 7.94983480969762085616e-17,
+  1.57598084510788649659e+00, -1.01369164712783039808e-17,
+  1.61049033194925428347e+00, 2.47071925697978878522e-17,
+  1.64575547815396494578e+00, -1.01256799136747726038e-16,
+  1.68179283050742900407e+00, 8.19901002058149652013e-17,
+  1.71861929812247793414e+00, -1.85138041826311098821e-17,
+  1.75625216037329945351e+00, 2.96014069544887330703e-17,
+  1.79470907500310716820e+00, 1.82274584279120867698e-17,
+  1.83400808640934243066e+00, 3.28310722424562658722e-17,
+  1.87416763411029996256e+00, -6.12276341300414256164e-17,
+  1.91520656139714740007e+00, -1.06199460561959626376e-16,
+  1.95714412417540017941e+00, 8.96076779103666776760e-17,
+};
+
+/*
+   For i = 0, ..., 66,
+     TBL2[2*i] is a double precision number near (i+1)*2^-6, and
+     TBL2[2*i+1] = exp(TBL2[2*i]) to within a relative error less
+     than 2^-60.
+
+   For i = 67, ..., 133,
+     TBL2[2*i] is a double precision number near -(i+1)*2^-6, and
+     TBL2[2*i+1] = exp(TBL2[2*i]) to within a relative error less
+     than 2^-60.
+*/
+static const double TBL2[] = {
+  1.56249999999984491572e-02, 1.01574770858668417262e+00,
+  3.12499999999998716305e-02, 1.03174340749910253834e+00,
+  4.68750000000011102230e-02, 1.04799100201663386578e+00,
+  6.24999999999990632493e-02, 1.06449445891785843266e+00,
+  7.81249999999999444888e-02, 1.08125780744903954300e+00,
+  9.37500000000013322676e-02, 1.09828514030782731226e+00,
+  1.09375000000001346145e-01, 1.11558061464248226002e+00,
+  1.24999999999999417133e-01, 1.13314845306682565607e+00,
+  1.40624999999995337063e-01, 1.15099294469117108264e+00,
+  1.56249999999996141975e-01, 1.16911844616949989195e+00,
+  1.71874999999992894573e-01, 1.18752938276309216725e+00,
+  1.87500000000000888178e-01, 1.20623024942098178158e+00,
+  2.03124999999361649516e-01, 1.22522561187652545556e+00,
+  2.18750000000000416334e-01, 1.24452010776609567344e+00,
+  2.34375000000003524958e-01, 1.26411844775347081971e+00,
+  2.50000000000006328271e-01, 1.28402541668774961003e+00,
+  2.65624999999982791543e-01, 1.30424587476761533189e+00,
+  2.81249999999993727240e-01, 1.32478475872885725906e+00,
+  2.96875000000003275158e-01, 1.34564708304941493822e+00,
+  3.12500000000002886580e-01, 1.36683794117380030819e+00,
+  3.28124999999993394173e-01, 1.38836250675661765364e+00,
+  3.43749999999998612221e-01, 1.41022603492570874906e+00,
+  3.59374999999992450483e-01, 1.43243386356506730017e+00,
+  3.74999999999991395772e-01, 1.45499141461818881638e+00,
+  3.90624999999997613020e-01, 1.47790419541173490003e+00,
+  4.06249999999991895372e-01, 1.50117780000011058483e+00,
+  4.21874999999996613820e-01, 1.52481791053132154090e+00,
+  4.37500000000004607426e-01, 1.54883029863414023453e+00,
+  4.53125000000004274359e-01, 1.57322082682725961078e+00,
+  4.68750000000008326673e-01, 1.59799544995064657371e+00,
+  4.84374999999985456078e-01, 1.62316021661928200359e+00,
+  4.99999999999997335465e-01, 1.64872127070012375327e+00,
+  5.15625000000000222045e-01, 1.67468485281178436352e+00,
+  5.31250000000003441691e-01, 1.70105730184840653330e+00,
+  5.46874999999999111822e-01, 1.72784505652716169344e+00,
+  5.62499999999999333866e-01, 1.75505465696029738787e+00,
+  5.78124999999993338662e-01, 1.78269274625180318417e+00,
+  5.93749999999999666933e-01, 1.81076607211938656050e+00,
+  6.09375000000003441691e-01, 1.83928148854178719063e+00,
+  6.24999999999995559108e-01, 1.86824595743221411048e+00,
+  6.40625000000009103829e-01, 1.89766655033813602671e+00,
+  6.56249999999993782751e-01, 1.92755045016753268072e+00,
+  6.71875000000002109424e-01, 1.95790495294292221651e+00,
+  6.87499999999992450483e-01, 1.98873746958227681780e+00,
+  7.03125000000004996004e-01, 2.02005552770870666635e+00,
+  7.18750000000007105427e-01, 2.05186677348799140219e+00,
+  7.34375000000008770762e-01, 2.08417897349558689513e+00,
+  7.49999999999983901766e-01, 2.11700001661264058939e+00,
+  7.65624999999997002398e-01, 2.15033791595229351046e+00,
+  7.81250000000005884182e-01, 2.18420081081563077774e+00,
+  7.96874999999991451283e-01, 2.21859696867912603579e+00,
+  8.12500000000000000000e-01, 2.25353478721320854561e+00,
+  8.28125000000008215650e-01, 2.28902279633221983346e+00,
+  8.43749999999997890576e-01, 2.32506966027711614586e+00,
+  8.59374999999999444888e-01, 2.36168417973090827289e+00,
+  8.75000000000003219647e-01, 2.39887529396710563745e+00,
+  8.90625000000013433699e-01, 2.43665208303232461162e+00,
+  9.06249999999980571097e-01, 2.47502376996297712708e+00,
+  9.21874999999984456878e-01, 2.51399972303748420188e+00,
+  9.37500000000001887379e-01, 2.55358945806293169412e+00,
+  9.53125000000003330669e-01, 2.59380264069854327147e+00,
+  9.68749999999989119814e-01, 2.63464908881560244680e+00,
+  9.84374999999997890576e-01, 2.67613877489447116176e+00,
+  1.00000000000001154632e+00, 2.71828182845907662113e+00,
+  1.01562499999999333866e+00, 2.76108853855008318234e+00,
+  1.03124999999995980993e+00, 2.80456935623711389738e+00,
+  1.04687499999999933387e+00, 2.84873489717039740654e+00,
+  -1.56249999999999514277e-02, 9.84496437005408453480e-01,
+  -3.12499999999955972718e-02, 9.69233234476348348707e-01,
+  -4.68749999999993824384e-02, 9.54206665969188905230e-01,
+  -6.24999999999976130205e-02, 9.39413062813478028090e-01,
+  -7.81249999999989314103e-02, 9.24848813216205822840e-01,
+  -9.37499999999995975442e-02, 9.10510361380034494161e-01,
+  -1.09374999999998584466e-01, 8.96394206635151680196e-01,
+  -1.24999999999998556710e-01, 8.82496902584596676355e-01,
+  -1.40624999999999361622e-01, 8.68815056262843721235e-01,
+  -1.56249999999999111822e-01, 8.55345327307423297647e-01,
+  -1.71874999999924144012e-01, 8.42084427143446223596e-01,
+  -1.87499999999996752598e-01, 8.29029118180403035154e-01,
+  -2.03124999999988037347e-01, 8.16176213022349550386e-01,
+  -2.18749999999995947686e-01, 8.03522573689063990265e-01,
+  -2.34374999999996419531e-01, 7.91065110850298847112e-01,
+  -2.49999999999996280753e-01, 7.78800783071407765057e-01,
+  -2.65624999999999888978e-01, 7.66726596070820165529e-01,
+  -2.81249999999989397370e-01, 7.54839601989015340777e-01,
+  -2.96874999999996114219e-01, 7.43136898668761203268e-01,
+  -3.12499999999999555911e-01, 7.31615628946642115871e-01,
+  -3.28124999999993782751e-01, 7.20272979955444259126e-01,
+  -3.43749999999997946087e-01, 7.09106182437399867879e-01,
+  -3.59374999999994337863e-01, 6.98112510068129799023e-01,
+  -3.74999999999994615418e-01, 6.87289278790975899369e-01,
+  -3.90624999999999000799e-01, 6.76633846161729612945e-01,
+  -4.06249999999947264406e-01, 6.66143610703522903727e-01,
+  -4.21874999999988453681e-01, 6.55816011271509125002e-01,
+  -4.37499999999999111822e-01, 6.45648526427892610613e-01,
+  -4.53124999999999278355e-01, 6.35638673826052436056e-01,
+  -4.68749999999999278355e-01, 6.25784009604591573428e-01,
+  -4.84374999999992894573e-01, 6.16082127790682609891e-01,
+  -4.99999999999998168132e-01, 6.06530659712634534486e-01,
+  -5.15625000000000000000e-01, 5.97127273421627413619e-01,
+  -5.31249999999989785948e-01, 5.87869673122352498496e-01,
+  -5.46874999999972688514e-01, 5.78755598612500032907e-01,
+  -5.62500000000000000000e-01, 5.69782824730923009859e-01,
+  -5.78124999999992339461e-01, 5.60949160814475100700e-01,
+  -5.93749999999948707696e-01, 5.52252450163048691500e-01,
+  -6.09374999999552580121e-01, 5.43690569513243682209e-01,
+  -6.24999999999984789945e-01, 5.35261428518998383375e-01,
+  -6.40624999999983457677e-01, 5.26962969243379708573e-01,
+  -6.56249999999998334665e-01, 5.18793165653890220312e-01,
+  -6.71874999999943378626e-01, 5.10750023129039609771e-01,
+  -6.87499999999997002398e-01, 5.02831577970942467104e-01,
+  -7.03124999999991118216e-01, 4.95035896926202978463e-01,
+  -7.18749999999991340260e-01, 4.87361076713623331269e-01,
+  -7.34374999999985678123e-01, 4.79805243559684402310e-01,
+  -7.49999999999997335465e-01, 4.72366552741015965911e-01,
+  -7.65624999999993782751e-01, 4.65043188134059204408e-01,
+  -7.81249999999863220523e-01, 4.57833361771676883301e-01,
+  -7.96874999999998112621e-01, 4.50735313406363247157e-01,
+  -8.12499999999990119015e-01, 4.43747310081084256339e-01,
+  -8.28124999999996003197e-01, 4.36867645705559026759e-01,
+  -8.43749999999988120614e-01, 4.30094640640067360504e-01,
+  -8.59374999999994115818e-01, 4.23426641285265303871e-01,
+  -8.74999999999977129406e-01, 4.16862019678517936594e-01,
+  -8.90624999999983346655e-01, 4.10399173096376801428e-01,
+  -9.06249999999991784350e-01, 4.04036523663345414903e-01,
+  -9.21874999999994004796e-01, 3.97772517966614058693e-01,
+  -9.37499999999994337863e-01, 3.91605626676801210628e-01,
+  -9.53124999999999444888e-01, 3.85534344174578935682e-01,
+  -9.68749999999986677324e-01, 3.79557188183094640355e-01,
+  -9.84374999999992339461e-01, 3.73672699406045860648e-01,
+  -9.99999999999995892175e-01, 3.67879441171443832825e-01,
+  -1.01562499999994315658e+00, 3.62175999080846300338e-01,
+  -1.03124999999991096011e+00, 3.56560980663978732697e-01,
+  -1.04687499999999067413e+00, 3.51033015038813400732e-01,
+};
+
+static const double
+  half		=0.5,
+/* Following three values used to scale x to primary range */
+  invln2_32	=4.61662413084468283841e+01,	/* 0x40471547, 0x652b82fe */
+  ln2_32hi	=2.16608493865351192653e-02,	/* 0x3f962e42, 0xfee00000 */
+  ln2_32lo	=5.96317165397058656257e-12,	/* 0x3d9a39ef, 0x35793c76 */
+/* t2-t5 terms used for polynomial computation */
+  t2		=1.6666666666526086527e-1,	/* 3fc5555555548f7c */
+  t3		=4.1666666666226079285e-2,	/* 3fa5555555545d4e */
+  t4		=8.3333679843421958056e-3,	/* 3f811115b7aa905e */
+  t5		=1.3888949086377719040e-3,	/* 3f56c1728d739765 */
+  one		=1.0,
+/* maximum value for x to not overflow */
+  threshold1	=7.09782712893383973096e+02,	/* 0x40862E42, 0xFEFA39EF */
+/* maximum value for -x to not underflow */
+  threshold2	=7.45133219101941108420e+02,	/* 0x40874910, 0xD52D3051 */
+/* scaling factor used when result near zero*/
+  twom54	=5.55111512312578270212e-17,	/* 0x3c900000, 0x00000000 */
+/* value used to force inexact condition */
+  small		=1.0e-100;
 
-double __slowexp (double);
 
-/* An ultimate exp routine. Given an IEEE double machine number x it computes
-   the correctly rounded (to nearest) value of e^x.  */
 double
-SECTION
-__ieee754_exp (double x)
+__ieee754_exp (double x_arg)
 {
-  double bexp, t, eps, del, base, y, al, bet, res, rem, cor;
-  mynumber junk1, junk2, binexp = {{0, 0}};
-  int4 i, j, m, n, ex;
+  double z, t;
   double retval;
-
+  int hx, ix, k, j, m;
+  int fe_val;
+  union
   {
-    SET_RESTORE_ROUND (FE_TONEAREST);
-
-    junk1.x = x;
-    m = junk1.i[HIGH_HALF];
-    n = m & hugeint;
-
-    if (n > smallint && n < bigint)
-      {
-	y = x * log2e.x + three51.x;
-	bexp = y - three51.x;	/*  multiply the result by 2**bexp        */
-
-	junk1.x = y;
-
-	eps = bexp * ln_two2.x;	/* x = bexp*ln(2) + t - eps               */
-	t = x - bexp * ln_two1.x;
-
-	y = t + three33.x;
-	base = y - three33.x;	/* t rounded to a multiple of 2**-18      */
-	junk2.x = y;
-	del = (t - base) - eps;	/*  x = bexp*ln(2) + base + del           */
-	eps = del + del * del * (p3.x * del + p2.x);
-
-	binexp.i[HIGH_HALF] = (junk1.i[LOW_HALF] + 1023) << 20;
-
-	i = ((junk2.i[LOW_HALF] >> 8) & 0xfffffffe) + 356;
-	j = (junk2.i[LOW_HALF] & 511) << 1;
-
-	al = coar.x[i] * fine.x[j];
-	bet = ((coar.x[i] * fine.x[j + 1] + coar.x[i + 1] * fine.x[j])
-	       + coar.x[i + 1] * fine.x[j + 1]);
-
-	rem = (bet + bet * eps) + al * eps;
-	res = al + rem;
-	cor = (al - res) + rem;
-	if (res == (res + cor * err_0))
-	  {
-	    retval = res * binexp.x;
-	    goto ret;
+    int i_part[2];
+    double x;
+  } xx;
+  union
+  {
+    int y_part[2];
+    double y;
+  } yy;
+  xx.x = x_arg;
+
+  ix = xx.i_part[HIGH_HALF];
+  hx = ix & ~0x80000000;
+
+  if (hx < 0x3ff0a2b2)
+    {				/* |x| < 3/2 ln 2 */
+      if (hx < 0x3f862e42)
+	{			/* |x| < 1/64 ln 2 */
+	  if (hx < 0x3ed00000)
+	    {			/* |x| < 2^-18 */
+	      /* raise inexact if x != 0 */
+	      if (hx < 0x3e300000)
+		{
+		  retval = one + xx.x;
+		  return (retval);
+		}
+	      retval = one + xx.x * (one + half * xx.x);
+	      return (retval);
+	    }
+	  /* 
+	     Use FE_TONEAREST rounding mode for computing yy.y 
+	     Avoid set/reset of rounding mode if already in FE_TONEAREST mode
+	  */
+	  fe_val = __fegetround();
+	  if (fe_val == FE_TONEAREST) {
+	    t = xx.x * xx.x;
+	    yy.y = xx.x + (t * (half + xx.x * t2) +
+			   (t * t) * (t3 + xx.x * t4 + t * t5));
+	    retval = one + yy.y;
+	  } else {
+	    __fesetround(FE_TONEAREST);
+	    t = xx.x * xx.x;
+	    yy.y = xx.x + (t * (half + xx.x * t2) +
+			   (t * t) * (t3 + xx.x * t4 + t * t5));
+	    retval = one + yy.y;
+	    __fesetround(fe_val);
 	  }
-	else
-	  {
-	    retval = __slowexp (x);
-	    goto ret;
-	  }			/*if error is over bound */
-      }
+	  return (retval);
+	}
 
-    if (n <= smallint)
-      {
-	retval = 1.0;
-	goto ret;
+      /* find the multiple of 2^-6 nearest x */
+      k = hx >> 20;
+      j = (0x00100000 | (hx & 0x000fffff)) >> (0x40c - k);
+      j = (j - 1) & ~1;
+      if (ix < 0)
+	j += 134;
+      /* 
+	 Use FE_TONEAREST rounding mode for computing yy.y 
+	 Avoid set/reset of rounding mode if already in FE_TONEAREST mode
+      */
+      fe_val = __fegetround();
+      if (fe_val == FE_TONEAREST) {
+	z = xx.x - TBL2[j];
+	t = z * z;
+	/* the "small" term below guarantees inexact will be raised */
+	yy.y = z + (t * (half + (z * t2 + small)) +
+		    (t * t) * (t3 + z * t4 + t * t5));
+	retval = TBL2[j + 1] + TBL2[j + 1] * yy.y;
+      } else {
+	__fesetround(FE_TONEAREST);
+	z = xx.x - TBL2[j];
+	t = z * z;
+	/* the "small" term below guarantees inexact will be raised */
+	yy.y = z + (t * (half + (z * t2 + small)) +
+		    (t * t) * (t3 + z * t4 + t * t5));
+	retval = TBL2[j + 1] + TBL2[j + 1] * yy.y;
+	__fesetround(fe_val);
       }
+      return (retval);
+    }
 
-    if (n >= badint)
-      {
-	if (n > infint)
-	  {
-	    retval = x + x;
-	    goto ret;
-	  }			/* x is NaN */
-	if (n < infint)
-	  {
-	    if (x > 0)
-	      goto ret_huge;
-	    else
-	      goto ret_tiny;
-	  }
-	/* x is finite,  cause either overflow or underflow  */
-	if (junk1.i[LOW_HALF] != 0)
-	  {
-	    retval = x + x;
-	    goto ret;
-	  }			/*  x is NaN  */
-	retval = (x > 0) ? inf.x : zero;	/* |x| = inf;  return either inf or 0 */
-	goto ret;
-      }
+  if (hx >= 0x40862e42)
+    {				/* x is large, infinite, or nan */
+      if (hx >= 0x7ff00000)
+	{
+	  if (ix == 0xfff00000 && xx.i_part[LOW_HALF] == 0)
+	    return (zero);	/* exp(-inf) = 0 */
+	  return (xx.x * xx.x);	/* exp(nan/inf) is nan or inf */
+	}
+      if (xx.x > threshold1)
+	{			/* set overflow error condition */
+	  retval = hhuge * hhuge;
+	  return retval;
+	} 
+      if (-xx.x > threshold2)
+	{			/* set underflow error condition */
+	  double force_underflow = tiny * tiny;
+	  math_force_eval (force_underflow);
+	  retval = zero;
+	  return retval;
+	}
+    }
 
-    y = x * log2e.x + three51.x;
-    bexp = y - three51.x;
-    junk1.x = y;
-    eps = bexp * ln_two2.x;
-    t = x - bexp * ln_two1.x;
-    y = t + three33.x;
-    base = y - three33.x;
-    junk2.x = y;
-    del = (t - base) - eps;
-    eps = del + del * del * (p3.x * del + p2.x);
-    i = ((junk2.i[LOW_HALF] >> 8) & 0xfffffffe) + 356;
-    j = (junk2.i[LOW_HALF] & 511) << 1;
-    al = coar.x[i] * fine.x[j];
-    bet = ((coar.x[i] * fine.x[j + 1] + coar.x[i + 1] * fine.x[j])
-	   + coar.x[i + 1] * fine.x[j + 1]);
-    rem = (bet + bet * eps) + al * eps;
-    res = al + rem;
-    cor = (al - res) + rem;
-    if (m >> 31)
-      {
-	ex = junk1.i[LOW_HALF];
-	if (res < 1.0)
-	  {
-	    res += res;
-	    cor += cor;
-	    ex -= 1;
-	  }
-	if (ex >= -1022)
-	  {
-	    binexp.i[HIGH_HALF] = (1023 + ex) << 20;
-	    if (res == (res + cor * err_0))
-	      {
-		retval = res * binexp.x;
-		goto ret;
-	      }
-	    else
-	      {
-		retval = __slowexp (x);
-		goto check_uflow_ret;
-	      }			/*if error is over bound */
-	  }
-	ex = -(1022 + ex);
-	binexp.i[HIGH_HALF] = (1023 - ex) << 20;
-	res *= binexp.x;
-	cor *= binexp.x;
-	eps = 1.0000000001 + err_0 * binexp.x;
-	t = 1.0 + res;
-	y = ((1.0 - t) + res) + cor;
-	res = t + y;
-	cor = (t - res) + y;
-	if (res == (res + eps * cor))
-	  {
-	    binexp.i[HIGH_HALF] = 0x00100000;
-	    retval = (res - 1.0) * binexp.x;
-	    goto check_uflow_ret;
-	  }
-	else
-	  {
-	    retval = __slowexp (x);
-	    goto check_uflow_ret;
-	  }			/*   if error is over bound    */
-      check_uflow_ret:
-	if (retval < DBL_MIN)
-	  {
-	    double force_underflow = tiny * tiny;
-	    math_force_eval (force_underflow);
-	  }
-	if (retval == 0)
-	  goto ret_tiny;
-	goto ret;
-      }
+  /* 
+     Use FE_TONEAREST rounding mode for computing yy.y 
+     Avoid set/reset of rounding mode if already in FE_TONEAREST mode
+  */
+  fe_val = __fegetround();
+  if (fe_val == FE_TONEAREST) {
+    t = invln2_32 * xx.x;
+    if (ix < 0)
+      t -= half;
     else
-      {
-	binexp.i[HIGH_HALF] = (junk1.i[LOW_HALF] + 767) << 20;
-	if (res == (res + cor * err_0))
-	  retval = res * binexp.x * t256.x;
-	else
-	  retval = __slowexp (x);
-	if (isinf (retval))
-	  goto ret_huge;
-	else
-	  goto ret;
-      }
+      t += half;
+    k = (int) t;
+    j = (k & 0x1f) << 1;
+    m = k >> 5;
+    z = (xx.x - k * ln2_32hi) - k * ln2_32lo;
+
+    /* z is now in primary range */
+    t = z * z;
+    yy.y = z + (t * (half + z * t2) + (t * t) * (t3 + z * t4 + t * t5));
+    yy.y = TBL[j] + (TBL[j + 1] + TBL[j] * yy.y);
+  } else {
+    __fesetround(FE_TONEAREST);
+    t = invln2_32 * xx.x;
+    if (ix < 0)
+      t -= half;
+    else
+      t += half;
+    k = (int) t;
+    j = (k & 0x1f) << 1;
+    m = k >> 5;
+    z = (xx.x - k * ln2_32hi) - k * ln2_32lo;
+
+    /* z is now in primary range */
+    t = z * z;
+    yy.y = z + (t * (half + z * t2) + (t * t) * (t3 + z * t4 + t * t5));
+    yy.y = TBL[j] + (TBL[j + 1] + TBL[j] * yy.y);
+    __fesetround(fe_val);
   }
-ret:
-  return retval;
 
- ret_huge:
-  return hhuge * hhuge;
-
- ret_tiny:
-  return tiny * tiny;
+  if (m < -1021)
+    {
+      yy.y_part[HIGH_HALF] += (m + 54) << 20;
+      retval = twom54 * yy.y;
+      if (retval < DBL_MIN)
+	{
+	  double force_underflow = tiny * tiny;
+	  math_force_eval (force_underflow);
+	}
+      return retval;
+    }
+  yy.y_part[HIGH_HALF] += m << 20;
+  return (yy.y);
 }
 #ifndef __ieee754_exp
 strong_alias (__ieee754_exp, __exp_finite)
 #endif
 
+#ifndef SECTION
+# define SECTION
+#endif
+
 /* Compute e^(x+xx).  The routine also receives bound of error of previous
    calculation.  If after computing exp the error exceeds the allowed bounds,
    the routine returns a non-positive number.  Otherwise it returns the