[3/4] Use libc_fe* macros in ldbl-128/e_expl.c.

Message ID 20200325100628.883397-3-stli@linux.ibm.com
State Changes Requested, archived
Headers
Series [1/4] Use libc_fe* macros in ldbl-128/s_nearbyintl.c. |

Commit Message

Stefan Liebler March 25, 2020, 10:06 a.m. UTC
  The calls to feholdexcept, fesetround and fesetenv are replaced
by the libc_fe* macros.
---
 sysdeps/ieee754/ldbl-128/e_expl.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)
  

Comments

Stefan Liebler March 25, 2020, 10:13 a.m. UTC | #1
Unfortunately, this patch is responsible for testfails on x86_64:

math/test-float128-exp.out:
Failure: exp (-0x1p-10000): Exception "Underflow" set
Failure: exp (-0x2p-16384): Exception "Underflow" set
...

math/test-float128-cexp.out:
Failure: Real part of: cexp (0x2p-16384 - 0x4p-1076 i): Exception 
"Underflow" set
Failure: Real part of: cexp (0x2p-16384 - 0x8p-152 i): Exception 
"Underflow" set
...

I've stepped through "expf128 (0x1p-10000)" in 
sysdeps/ieee754/float128/../ldbl-128/e_expl.c:
151: libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);    // before 
this patch: feholdexcept (&oldenv); fesetround (FE_TONEAREST);
199: x22 = x + x*x*(P1+x*(P2+x*(P3+x*(P4+x*(P5+x*P6)))));
203: libc_fesetenvl (&oldenv);    // before this patch: fesetenv (&oldenv);

During the evaluation of x22 the underflow exception occures while:
<__ieee754_expf128+920>  callq  0x7ffff7f2a7c0 <__multf3>
which calls __sfp_handle_exceptions():
ae8be:       40 f6 c7 10             test   $0x10,%dil
ae8c2:       74 0f                   je     ae8d3 
<__sfp_handle_exceptions+0x73>
ae8c4:       d9 74 24 d8             fnstenv -0x28(%rsp)
ae8c8:       66 83 4c 24 dc 10       orw    $0x10,-0x24(%rsp)
ae8ce:       d9 64 24 d8             fldenv -0x28(%rsp)
ae8d2:       9b                      fwait

According to sysdeps/x86/fpu/fenv_private.h:
#ifdef __x86_64__
# define libc_feholdexcept_setroundf128 libc_feholdexcept_setround_sse
# define libc_fesetenvf128	libc_fesetenv_sse
#else
# define libc_feholdexcept_setroundf128	default_libc_feholdexcept_setround
# define libc_fesetenvf128	default_libc_fesetenv
#endif

// On my machine:
# define STMXCSR "stmxcsr"
# define LDMXCSR "ldmxcsr"

static __always_inline void
libc_feholdexcept_setround_sse (fenv_t *e, int r)
{
   unsigned int mxcsr;
   asm (STMXCSR " %0" : "=m" (*&mxcsr));
   e->__mxcsr = mxcsr;
   mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3);
   asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr));
}

Whereas the feholdexcept() function is using in
./sysdeps/x86_64/fpu/feholdexcpt.c:
int
__feholdexcept (fenv_t *envp)
{
   unsigned int mxcsr;

   /* Store the environment.  Recall that fnstenv has a side effect of
      masking all exceptions.  Then clear all exceptions.  */
   __asm__ ("fnstenv %0\n\t"
	   "stmxcsr %1\n\t"
	   "fnclex"
	   : "=m" (*envp), "=m" (envp->__mxcsr));

   /* Set the SSE MXCSR register.  */
   mxcsr = (envp->__mxcsr | 0x1f80) & ~0x3f;
   __asm__ ("ldmxcsr %0" : : "m" (*&mxcsr));

   return 0;
}

I assume that the underflow exception keeps active as the pair of 
fnstenv / fldenv is missing if libc_feholdexcept_setroundf128 / 
libc_fesetenvf128 is used instead of feholdexcept, fesetround and fesetenv.
As I'm not familiar with float128 on x86_64, can anybody please help?

Bye,
Stefan

On 3/25/20 11:06 AM, Stefan Liebler wrote:
> The calls to feholdexcept, fesetround and fesetenv are replaced
> by the libc_fe* macros.
> ---
>   sysdeps/ieee754/ldbl-128/e_expl.c | 8 +++++---
>   1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
> index 37c1538c08..104ace1690 100644
> --- a/sysdeps/ieee754/ldbl-128/e_expl.c
> +++ b/sysdeps/ieee754/ldbl-128/e_expl.c
> @@ -66,6 +66,7 @@
>   #include <inttypes.h>
>   #include <math-barriers.h>
>   #include <math_private.h>
> +#include <fenv_private.h>
>   #include <math-underflow.h>
>   #include <stdlib.h>
>   #include "t_expl.h"
> @@ -146,9 +147,10 @@ __ieee754_expl (_Float128 x)
>         union ieee854_long_double ex2_u, scale_u;
>         fenv_t oldenv;
>   
> -      feholdexcept (&oldenv);
>   #ifdef FE_TONEAREST
> -      fesetround (FE_TONEAREST);
> +      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);
> +#else
> +      libc_feholdexceptl (&oldenv);
>   #endif
>   
>         /* Calculate n.  */
> @@ -198,7 +200,7 @@ __ieee754_expl (_Float128 x)
>         math_force_eval (x22);
>   
>         /* Return result.  */
> -      fesetenv (&oldenv);
> +      libc_fesetenvl (&oldenv);
>   
>         result = x22 * ex2_u.d + ex2_u.d;
>   
>
  
Adhemerval Zanella March 25, 2020, 3 p.m. UTC | #2
On 25/03/2020 07:13, Stefan Liebler via Libc-alpha wrote:
> Unfortunately, this patch is responsible for testfails on x86_64:
> 
> math/test-float128-exp.out:
> Failure: exp (-0x1p-10000): Exception "Underflow" set
> Failure: exp (-0x2p-16384): Exception "Underflow" set
> ...
> 
> math/test-float128-cexp.out:
> Failure: Real part of: cexp (0x2p-16384 - 0x4p-1076 i): Exception "Underflow" set
> Failure: Real part of: cexp (0x2p-16384 - 0x8p-152 i): Exception "Underflow" set

The sysdeps/x86/fpu/fenv_private.h states:

296 #ifdef __x86_64__
297 /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
298    x86_64, so that must be set for float128 computations.  */
299 # define SET_RESTORE_ROUNDF128(RM) \
300   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_sse, libc_feresetround_sse)

So 

>> diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
>> index 37c1538c08..104ace1690 100644
>> --- a/sysdeps/ieee754/ldbl-128/e_expl.c
>> +++ b/sysdeps/ieee754/ldbl-128/e_expl.c
>> @@ -66,6 +66,7 @@
>>   #include <inttypes.h>
>>   #include <math-barriers.h>
>>   #include <math_private.h>
>> +#include <fenv_private.h>
>>   #include <math-underflow.h>
>>   #include <stdlib.h>
>>   #include "t_expl.h"
>> @@ -146,9 +147,10 @@ __ieee754_expl (_Float128 x)
>>         union ieee854_long_double ex2_u, scale_u;
>>         fenv_t oldenv;
>>   -      feholdexcept (&oldenv);
>>   #ifdef FE_TONEAREST
>> -      fesetround (FE_TONEAREST);
>> +      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);

Should be libc_feholdexcept_setroundf128.

>> +#else
>> +      libc_feholdexceptl (&oldenv);

And here libc_fesetenvf128.

>>   #endif
>>           /* Calculate n.  */
>> @@ -198,7 +200,7 @@ __ieee754_expl (_Float128 x)
>>         math_force_eval (x22);
>>           /* Return result.  */
>> -      fesetenv (&oldenv);
>> +      libc_fesetenvl (&oldenv);
>>           result = x22 * ex2_u.d + ex2_u.d;

It might require extend the libc_*f128 macros to other architectures
(not sure).
  
Adhemerval Zanella March 25, 2020, 3:07 p.m. UTC | #3
On 25/03/2020 12:00, Adhemerval Zanella wrote:
> 
> 
> On 25/03/2020 07:13, Stefan Liebler via Libc-alpha wrote:
>> Unfortunately, this patch is responsible for testfails on x86_64:
>>
>> math/test-float128-exp.out:
>> Failure: exp (-0x1p-10000): Exception "Underflow" set
>> Failure: exp (-0x2p-16384): Exception "Underflow" set
>> ...
>>
>> math/test-float128-cexp.out:
>> Failure: Real part of: cexp (0x2p-16384 - 0x4p-1076 i): Exception "Underflow" set
>> Failure: Real part of: cexp (0x2p-16384 - 0x8p-152 i): Exception "Underflow" set
> 
> The sysdeps/x86/fpu/fenv_private.h states:
> 
> 296 #ifdef __x86_64__
> 297 /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
> 298    x86_64, so that must be set for float128 computations.  */
> 299 # define SET_RESTORE_ROUNDF128(RM) \
> 300   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_sse, libc_feresetround_sse)
> 
> So 
> 
>>> diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
>>> index 37c1538c08..104ace1690 100644
>>> --- a/sysdeps/ieee754/ldbl-128/e_expl.c
>>> +++ b/sysdeps/ieee754/ldbl-128/e_expl.c
>>> @@ -66,6 +66,7 @@
>>>   #include <inttypes.h>
>>>   #include <math-barriers.h>
>>>   #include <math_private.h>
>>> +#include <fenv_private.h>
>>>   #include <math-underflow.h>
>>>   #include <stdlib.h>
>>>   #include "t_expl.h"
>>> @@ -146,9 +147,10 @@ __ieee754_expl (_Float128 x)
>>>         union ieee854_long_double ex2_u, scale_u;
>>>         fenv_t oldenv;
>>>   -      feholdexcept (&oldenv);
>>>   #ifdef FE_TONEAREST
>>> -      fesetround (FE_TONEAREST);
>>> +      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);
> 
> Should be libc_feholdexcept_setroundf128.

But it does not see to help here, so I don't know what is failing as well.

> 
>>> +#else
>>> +      libc_feholdexceptl (&oldenv);
> 
> And here libc_fesetenvf128.
> 
>>>   #endif
>>>           /* Calculate n.  */
>>> @@ -198,7 +200,7 @@ __ieee754_expl (_Float128 x)
>>>         math_force_eval (x22);
>>>           /* Return result.  */
>>> -      fesetenv (&oldenv);
>>> +      libc_fesetenvl (&oldenv);
>>>           result = x22 * ex2_u.d + ex2_u.d;
> 
> It might require extend the libc_*f128 macros to other architectures
> (not sure).
>
  
Adhemerval Zanella March 25, 2020, 3:42 p.m. UTC | #4
On 25/03/2020 12:07, Adhemerval Zanella wrote:
> 
> 
> On 25/03/2020 12:00, Adhemerval Zanella wrote:
>>
>>
>> On 25/03/2020 07:13, Stefan Liebler via Libc-alpha wrote:
>>> Unfortunately, this patch is responsible for testfails on x86_64:
>>>
>>> math/test-float128-exp.out:
>>> Failure: exp (-0x1p-10000): Exception "Underflow" set
>>> Failure: exp (-0x2p-16384): Exception "Underflow" set
>>> ...
>>>
>>> math/test-float128-cexp.out:
>>> Failure: Real part of: cexp (0x2p-16384 - 0x4p-1076 i): Exception "Underflow" set
>>> Failure: Real part of: cexp (0x2p-16384 - 0x8p-152 i): Exception "Underflow" set
>>
>> The sysdeps/x86/fpu/fenv_private.h states:
>>
>> 296 #ifdef __x86_64__
>> 297 /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>> 298    x86_64, so that must be set for float128 computations.  */
>> 299 # define SET_RESTORE_ROUNDF128(RM) \
>> 300   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_sse, libc_feresetround_sse)
>>
>> So 
>>
>>>> diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>> index 37c1538c08..104ace1690 100644
>>>> --- a/sysdeps/ieee754/ldbl-128/e_expl.c
>>>> +++ b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>> @@ -66,6 +66,7 @@
>>>>   #include <inttypes.h>
>>>>   #include <math-barriers.h>
>>>>   #include <math_private.h>
>>>> +#include <fenv_private.h>
>>>>   #include <math-underflow.h>
>>>>   #include <stdlib.h>
>>>>   #include "t_expl.h"
>>>> @@ -146,9 +147,10 @@ __ieee754_expl (_Float128 x)
>>>>         union ieee854_long_double ex2_u, scale_u;
>>>>         fenv_t oldenv;
>>>>   -      feholdexcept (&oldenv);
>>>>   #ifdef FE_TONEAREST
>>>> -      fesetround (FE_TONEAREST);
>>>> +      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);
>>
>> Should be libc_feholdexcept_setroundf128.
> 
> But it does not see to help here, so I don't know what is failing as well.

Ok, so what is happening __sfp_handle_exceptions always use 387 exception
mode for FP_EX_OVERFLOW and FP_EX_UNDERFLOW:

config/i386/sfp-exceptions.c

 79   if (_fex & FP_EX_OVERFLOW)
 80     {
 81       struct fenv temp;
 82       asm volatile ("fnstenv\t%0" : "=m" (temp));
 83       temp.__status_word |= FP_EX_OVERFLOW;
 84       asm volatile ("fldenv\t%0" : : "m" (temp));
 85       asm volatile ("fwait");
 86     }
 87   if (_fex & FP_EX_UNDERFLOW)
 88     {
 89       struct fenv temp;
 90       asm volatile ("fnstenv\t%0" : "=m" (temp));
 91       temp.__status_word |= FP_EX_UNDERFLOW;
 92       asm volatile ("fldenv\t%0" : : "m" (temp));
 93       asm volatile ("fwait");
 94     }

Different that FP_EX_INEXACT, for instance, where __SSE_MATH__ sets
whether SSE is used or not.

So I think it is not safe to use the SSE variants for libc_*_testf128,
as for i387 we should use the default_* instead.
  
Stefan Liebler March 26, 2020, 9:08 a.m. UTC | #5
On 3/25/20 4:07 PM, Adhemerval Zanella via Libc-alpha wrote:
> 
> 
> On 25/03/2020 12:00, Adhemerval Zanella wrote:
>>
>>
>> On 25/03/2020 07:13, Stefan Liebler via Libc-alpha wrote:
>>> Unfortunately, this patch is responsible for testfails on x86_64:
>>>
>>> math/test-float128-exp.out:
>>> Failure: exp (-0x1p-10000): Exception "Underflow" set
>>> Failure: exp (-0x2p-16384): Exception "Underflow" set
>>> ...
>>>
>>> math/test-float128-cexp.out:
>>> Failure: Real part of: cexp (0x2p-16384 - 0x4p-1076 i): Exception "Underflow" set
>>> Failure: Real part of: cexp (0x2p-16384 - 0x8p-152 i): Exception "Underflow" set
>>
>> The sysdeps/x86/fpu/fenv_private.h states:
>>
>> 296 #ifdef __x86_64__
>> 297 /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>> 298    x86_64, so that must be set for float128 computations.  */
>> 299 # define SET_RESTORE_ROUNDF128(RM) \
>> 300   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_sse, libc_feresetround_sse)
>>
>> So
>>
>>>> diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>> index 37c1538c08..104ace1690 100644
>>>> --- a/sysdeps/ieee754/ldbl-128/e_expl.c
>>>> +++ b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>> @@ -66,6 +66,7 @@
>>>>    #include <inttypes.h>
>>>>    #include <math-barriers.h>
>>>>    #include <math_private.h>
>>>> +#include <fenv_private.h>
>>>>    #include <math-underflow.h>
>>>>    #include <stdlib.h>
>>>>    #include "t_expl.h"
>>>> @@ -146,9 +147,10 @@ __ieee754_expl (_Float128 x)
>>>>          union ieee854_long_double ex2_u, scale_u;
>>>>          fenv_t oldenv;
>>>>    -      feholdexcept (&oldenv);
>>>>    #ifdef FE_TONEAREST
>>>> -      fesetround (FE_TONEAREST);
>>>> +      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);
>>
>> Should be libc_feholdexcept_setroundf128.
> 
> But it does not see to help here, so I don't know what is failing as well.
> 
It does not help as this is already the case. The float128 exp is build 
with sysdeps/ieee754/float128/e_expf128.c which includes 
sysdeps/ieee754/float128/float128_private.h before including 
sysdeps/ieee754/ldbl-128/e_expl.c.

float128_private.h contains things like that:
...
#include <fenv_private.h>
...
#ifdef libc_feholdexcept_setroundf128
# undef libc_feholdexcept_setroundl
# define libc_feholdexcept_setroundl(ENV, RM)	\
   libc_feholdexcept_setroundf128 (ENV, RM)
#endif
...
#ifdef libc_fesetenvf128
# undef libc_fesetenvl
# define libc_fesetenvl(ENV) libc_fesetenvf128 (ENV)
#endif

>>
>>>> +#else
>>>> +      libc_feholdexceptl (&oldenv);
>>
>> And here libc_fesetenvf128.
>>
>>>>    #endif
>>>>            /* Calculate n.  */
>>>> @@ -198,7 +200,7 @@ __ieee754_expl (_Float128 x)
>>>>          math_force_eval (x22);
>>>>            /* Return result.  */
>>>> -      fesetenv (&oldenv);
>>>> +      libc_fesetenvl (&oldenv);
>>>>            result = x22 * ex2_u.d + ex2_u.d;
>>
>> It might require extend the libc_*f128 macros to other architectures
>> (not sure).
>>
  
Stefan Liebler March 26, 2020, 9:08 a.m. UTC | #6
On 3/25/20 4:42 PM, Adhemerval Zanella via Libc-alpha wrote:
> 
> 
> On 25/03/2020 12:07, Adhemerval Zanella wrote:
>>
>>
>> On 25/03/2020 12:00, Adhemerval Zanella wrote:
>>>
>>>
>>> On 25/03/2020 07:13, Stefan Liebler via Libc-alpha wrote:
>>>> Unfortunately, this patch is responsible for testfails on x86_64:
>>>>
>>>> math/test-float128-exp.out:
>>>> Failure: exp (-0x1p-10000): Exception "Underflow" set
>>>> Failure: exp (-0x2p-16384): Exception "Underflow" set
>>>> ...
>>>>
>>>> math/test-float128-cexp.out:
>>>> Failure: Real part of: cexp (0x2p-16384 - 0x4p-1076 i): Exception "Underflow" set
>>>> Failure: Real part of: cexp (0x2p-16384 - 0x8p-152 i): Exception "Underflow" set
>>>
>>> The sysdeps/x86/fpu/fenv_private.h states:
>>>
>>> 296 #ifdef __x86_64__
>>> 297 /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>>> 298    x86_64, so that must be set for float128 computations.  */
>>> 299 # define SET_RESTORE_ROUNDF128(RM) \
>>> 300   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_sse, libc_feresetround_sse)
>>>
>>> So
>>>
>>>>> diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>> index 37c1538c08..104ace1690 100644
>>>>> --- a/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>> +++ b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>> @@ -66,6 +66,7 @@
>>>>>    #include <inttypes.h>
>>>>>    #include <math-barriers.h>
>>>>>    #include <math_private.h>
>>>>> +#include <fenv_private.h>
>>>>>    #include <math-underflow.h>
>>>>>    #include <stdlib.h>
>>>>>    #include "t_expl.h"
>>>>> @@ -146,9 +147,10 @@ __ieee754_expl (_Float128 x)
>>>>>          union ieee854_long_double ex2_u, scale_u;
>>>>>          fenv_t oldenv;
>>>>>    -      feholdexcept (&oldenv);
>>>>>    #ifdef FE_TONEAREST
>>>>> -      fesetround (FE_TONEAREST);
>>>>> +      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);
>>>
>>> Should be libc_feholdexcept_setroundf128.
>>
>> But it does not see to help here, so I don't know what is failing as well.
> 
> Ok, so what is happening __sfp_handle_exceptions always use 387 exception
> mode for FP_EX_OVERFLOW and FP_EX_UNDERFLOW:
> 
> config/i386/sfp-exceptions.c
> 
>   79   if (_fex & FP_EX_OVERFLOW)
>   80     {
>   81       struct fenv temp;
>   82       asm volatile ("fnstenv\t%0" : "=m" (temp));
>   83       temp.__status_word |= FP_EX_OVERFLOW;
>   84       asm volatile ("fldenv\t%0" : : "m" (temp));
>   85       asm volatile ("fwait");
>   86     }
>   87   if (_fex & FP_EX_UNDERFLOW)
>   88     {
>   89       struct fenv temp;
>   90       asm volatile ("fnstenv\t%0" : "=m" (temp));
>   91       temp.__status_word |= FP_EX_UNDERFLOW;
>   92       asm volatile ("fldenv\t%0" : : "m" (temp));
>   93       asm volatile ("fwait");
>   94     }
> Yes this looks like the mentioned disassembly.
> Different that FP_EX_INEXACT, for instance, where __SSE_MATH__ sets
> whether SSE is used or not.
> 
> So I think it is not safe to use the SSE variants for libc_*_testf128,
> as for i387 we should use the default_* instead.
> 
I've just switched to default_* in sysdeps/x86/fpu/fenv_private.h:
-#ifdef __x86_64__
+#if 0
  /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
     x86_64, so that must be set for float128 computations.  */
  # define SET_RESTORE_ROUNDF128(RM) \

But now there are 7 testfails. For some of them, the max.ulp goes up 
(over 10), but there are even worse cases. Here are shortened excerpts 
of the out files:
- math/test-float128-clog.out:
Failure: Test: Real part of: clog_towardzero 
(0x2.82b795e420b281a934c6dd315cb2p-4 + 
0xf.cd42a15bf9a361243a89663e81e8p-4 i)
  ulp       :  162259276829213363391578010288127.0000
  max.ulp   :  3.0000
Failure: Test: Real part of: clog_upward 
(0x2.82b795e420b281a934c6dd315cb2p-4 + 
0xf.cd42a15bf9a361243a89663e81e8p-4 i)
  ulp       :  162259276829213363391578010288128.0000
  max.ulp   :  4.0000

- math/test-float128-clog10.out:
Failure: Test: Real part of: clog10_downward (0x3.bea2bd62e35p-4 + 
0xf.8e3d619a8d11bfd30b038eep-4 i)
  ulp       :  4.0000
  max.ulp   :  3.0000
Failure: Test: Real part of: clog10_towardzero 
(0x2.82b795e420b281a934c6dd315cb2p-4 + 
0xf.cd42a15bf9a361243a89663e81e8p-4 i)
  ulp       :  140936617129079063283494433422698.0000
  max.ulp   :  4.0000
Failure: Test: Real part of: clog10_upward 
(0x2.82b795e420b281a934c6dd315cb2p-4 + 
0xf.cd42a15bf9a361243a89663e81e8p-4 i)
  ulp       :  140936617129079063283494433422698.0000
  max.ulp   :  4.0000

- math/test-float128-jn.out
- math/test-float128-lgamma.out
- math/test-float128-tgamma.out:
something like:
  ulp       :  12.0000
  max.ulp   :  4.0000

Failure: tgamma_upward (-0x6.ec00000000000008p+8): errno set to 0, 
expected 34 (ERANGE)

- math/test-float128-y1.out:
Failure: Test: y1_downward (0x2p+0)
  ulp       :  13.0000
  max.ulp   :  4.0000
Failure: Test: y1_towardzero (0x2p+0)
  ulp       :  6.0000
  max.ulp   :  2.0000
Failure: Test: y1_upward (0x2p+0)
  ulp       :  10.0000
  max.ulp   :  5.0000

- math/test-float128-yn.out
  
Adhemerval Zanella March 26, 2020, 2:53 p.m. UTC | #7
On 26/03/2020 06:08, Stefan Liebler via Libc-alpha wrote:
> On 3/25/20 4:42 PM, Adhemerval Zanella via Libc-alpha wrote:
>>
>>
>> On 25/03/2020 12:07, Adhemerval Zanella wrote:
>>>
>>>
>>> On 25/03/2020 12:00, Adhemerval Zanella wrote:
>>>>
>>>>
>>>> On 25/03/2020 07:13, Stefan Liebler via Libc-alpha wrote:
>>>>> Unfortunately, this patch is responsible for testfails on x86_64:
>>>>>
>>>>> math/test-float128-exp.out:
>>>>> Failure: exp (-0x1p-10000): Exception "Underflow" set
>>>>> Failure: exp (-0x2p-16384): Exception "Underflow" set
>>>>> ...
>>>>>
>>>>> math/test-float128-cexp.out:
>>>>> Failure: Real part of: cexp (0x2p-16384 - 0x4p-1076 i): Exception "Underflow" set
>>>>> Failure: Real part of: cexp (0x2p-16384 - 0x8p-152 i): Exception "Underflow" set
>>>>
>>>> The sysdeps/x86/fpu/fenv_private.h states:
>>>>
>>>> 296 #ifdef __x86_64__
>>>> 297 /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>>>> 298    x86_64, so that must be set for float128 computations.  */
>>>> 299 # define SET_RESTORE_ROUNDF128(RM) \
>>>> 300   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_sse, libc_feresetround_sse)
>>>>
>>>> So
>>>>
>>>>>> diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>> index 37c1538c08..104ace1690 100644
>>>>>> --- a/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>> +++ b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>> @@ -66,6 +66,7 @@
>>>>>>    #include <inttypes.h>
>>>>>>    #include <math-barriers.h>
>>>>>>    #include <math_private.h>
>>>>>> +#include <fenv_private.h>
>>>>>>    #include <math-underflow.h>
>>>>>>    #include <stdlib.h>
>>>>>>    #include "t_expl.h"
>>>>>> @@ -146,9 +147,10 @@ __ieee754_expl (_Float128 x)
>>>>>>          union ieee854_long_double ex2_u, scale_u;
>>>>>>          fenv_t oldenv;
>>>>>>    -      feholdexcept (&oldenv);
>>>>>>    #ifdef FE_TONEAREST
>>>>>> -      fesetround (FE_TONEAREST);
>>>>>> +      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);
>>>>
>>>> Should be libc_feholdexcept_setroundf128.
>>>
>>> But it does not see to help here, so I don't know what is failing as well.
>>
>> Ok, so what is happening __sfp_handle_exceptions always use 387 exception
>> mode for FP_EX_OVERFLOW and FP_EX_UNDERFLOW:
>>
>> config/i386/sfp-exceptions.c
>>
>>   79   if (_fex & FP_EX_OVERFLOW)
>>   80     {
>>   81       struct fenv temp;
>>   82       asm volatile ("fnstenv\t%0" : "=m" (temp));
>>   83       temp.__status_word |= FP_EX_OVERFLOW;
>>   84       asm volatile ("fldenv\t%0" : : "m" (temp));
>>   85       asm volatile ("fwait");
>>   86     }
>>   87   if (_fex & FP_EX_UNDERFLOW)
>>   88     {
>>   89       struct fenv temp;
>>   90       asm volatile ("fnstenv\t%0" : "=m" (temp));
>>   91       temp.__status_word |= FP_EX_UNDERFLOW;
>>   92       asm volatile ("fldenv\t%0" : : "m" (temp));
>>   93       asm volatile ("fwait");
>>   94     }
>> Yes this looks like the mentioned disassembly.
>> Different that FP_EX_INEXACT, for instance, where __SSE_MATH__ sets
>> whether SSE is used or not.
>>
>> So I think it is not safe to use the SSE variants for libc_*_testf128,
>> as for i387 we should use the default_* instead.
>>
> I've just switched to default_* in sysdeps/x86/fpu/fenv_private.h:
> -#ifdef __x86_64__
> +#if 0
>  /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>     x86_64, so that must be set for float128 computations.  */
>  # define SET_RESTORE_ROUNDF128(RM) \
> 
> But now there are 7 testfails. For some of them, the max.ulp goes up (over 10), but there are even worse cases. Here are shortened excerpts of the out files:
> - math/test-float128-clog.out:
> Failure: Test: Real part of: clog_towardzero (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>  ulp       :  162259276829213363391578010288127.0000
>  max.ulp   :  3.0000
> Failure: Test: Real part of: clog_upward (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>  ulp       :  162259276829213363391578010288128.0000
>  max.ulp   :  4.0000
> 
> - math/test-float128-clog10.out:
> Failure: Test: Real part of: clog10_downward (0x3.bea2bd62e35p-4 + 0xf.8e3d619a8d11bfd30b038eep-4 i)
>  ulp       :  4.0000
>  max.ulp   :  3.0000
> Failure: Test: Real part of: clog10_towardzero (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>  ulp       :  140936617129079063283494433422698.0000
>  max.ulp   :  4.0000
> Failure: Test: Real part of: clog10_upward (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>  ulp       :  140936617129079063283494433422698.0000
>  max.ulp   :  4.0000
> 
> - math/test-float128-jn.out
> - math/test-float128-lgamma.out
> - math/test-float128-tgamma.out:
> something like:
>  ulp       :  12.0000
>  max.ulp   :  4.0000
> 
> Failure: tgamma_upward (-0x6.ec00000000000008p+8): errno set to 0, expected 34 (ERANGE)
> 
> - math/test-float128-y1.out:
> Failure: Test: y1_downward (0x2p+0)
>  ulp       :  13.0000
>  max.ulp   :  4.0000
> Failure: Test: y1_towardzero (0x2p+0)
>  ulp       :  6.0000
>  max.ulp   :  2.0000
> Failure: Test: y1_upward (0x2p+0)
>  ulp       :  10.0000
>  max.ulp   :  5.0000
> 
> - math/test-float128-yn.out
> 

So it seems the issue is the mix on how libm fenv function, the internal
libc_fe*, and libgcc handles the exception register.  The exported fenv
operates on both i387 and SSE (since it should work on long double as well),
and the internal libc_fe* will set either SSE for float, double, and float128
and i387 for long double (as expected).

The libgcc, however, will set either SEE or i387 depending of the exception.
This broke the assumption of libc_fe* for float128 where either SSE or i387
will be used.

One option might be to force libgcc to not use its __sfp_handle_exceptions
on x86_64 and provide one that uses only SEE operations since libgcc does 
not use 'long double' on float128 operations.  The patch below does it
and applied on top your patches shows no regressions.

And I think we should fix libgcc in a similar manner, since checking on
config/i386/64/sfp-machine.h it only support SSE rounding mode.

--

diff --git a/sysdeps/x86/fpu/sfp-exceptions.c b/sysdeps/x86/fpu/sfp-exceptions.c
new file mode 100644
index 0000000000..676f396bc3
--- /dev/null
+++ b/sysdeps/x86/fpu/sfp-exceptions.c
@@ -0,0 +1,49 @@
+#include <fenv.h>
+#include <float.h>
+#include <math-barriers.h>
+
+#define FP_EX_INVALID           0x01
+#define FP_EX_DENORM            0x02
+#define FP_EX_DIVZERO           0x04
+#define FP_EX_OVERFLOW          0x08
+#define FP_EX_UNDERFLOW         0x10
+#define FP_EX_INEXACT           0x20
+#define FP_EX_ALL \
+        (FP_EX_INVALID | FP_EX_DENORM | FP_EX_DIVZERO | FP_EX_OVERFLOW \
+         | FP_EX_UNDERFLOW | FP_EX_INEXACT)
+
+void
+__sfp_handle_exceptions (int _fex)
+{
+  if (_fex & FP_EX_INVALID)
+    {
+      float f = 0.0f;
+      math_force_eval (f / f);
+    }
+  if (_fex & FP_EX_DENORM)
+    {
+      float f = FLT_MIN, g = 2.0f;
+      math_force_eval (f / g);
+    }
+  if (_fex & FP_EX_DIVZERO)
+    {
+      float f = 1.0f, g = 0.0f;
+      math_force_eval (f / g);
+    }
+  if (_fex & FP_EX_OVERFLOW)
+    {
+      float force_underflow = FLT_MAX * FLT_MAX;
+      math_force_eval (force_underflow);
+    }
+  if (_fex & FP_EX_UNDERFLOW)
+    {
+      float force_overflow = FLT_MIN * FLT_MIN;
+      math_force_eval (force_overflow);
+    }
+  if (_fex & FP_EX_INEXACT)
+    {
+      float f = 1.0f, g = 3.0f;
+      math_force_eval (f / g);
+    }
+}
+strong_alias (__sfp_handle_exceptions, __wrap___sfp_handle_exceptions)
diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
index a4ff2723a8..5becb96fa3 100644
--- a/sysdeps/x86_64/fpu/Makefile
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -25,6 +25,9 @@ endif
 
 # Variables for libmvec tests.
 ifeq ($(subdir),math)
+libm-routines += sfp-exceptions
+LDFLAGS-m.so += -Wl,--wrap=__sfp_handle_exceptions
+
 ifeq ($(build-mathvec),yes)
 libmvec-tests += double-vlen2 double-vlen4 double-vlen4-avx2 \
 		 float-vlen4 float-vlen8 float-vlen8-avx2
  
Stefan Liebler March 27, 2020, 2:23 p.m. UTC | #8
On 3/26/20 3:53 PM, Adhemerval Zanella via Libc-alpha wrote:
> 
> 
> On 26/03/2020 06:08, Stefan Liebler via Libc-alpha wrote:
>> On 3/25/20 4:42 PM, Adhemerval Zanella via Libc-alpha wrote:
>>>
>>>
>>> On 25/03/2020 12:07, Adhemerval Zanella wrote:
>>>>
>>>>
>>>> On 25/03/2020 12:00, Adhemerval Zanella wrote:
>>>>>
>>>>>
>>>>> On 25/03/2020 07:13, Stefan Liebler via Libc-alpha wrote:
>>>>>> Unfortunately, this patch is responsible for testfails on x86_64:
>>>>>>
>>>>>> math/test-float128-exp.out:
>>>>>> Failure: exp (-0x1p-10000): Exception "Underflow" set
>>>>>> Failure: exp (-0x2p-16384): Exception "Underflow" set
>>>>>> ...
>>>>>>
>>>>>> math/test-float128-cexp.out:
>>>>>> Failure: Real part of: cexp (0x2p-16384 - 0x4p-1076 i): Exception "Underflow" set
>>>>>> Failure: Real part of: cexp (0x2p-16384 - 0x8p-152 i): Exception "Underflow" set
>>>>>
>>>>> The sysdeps/x86/fpu/fenv_private.h states:
>>>>>
>>>>> 296 #ifdef __x86_64__
>>>>> 297 /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>>>>> 298    x86_64, so that must be set for float128 computations.  */
>>>>> 299 # define SET_RESTORE_ROUNDF128(RM) \
>>>>> 300   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_sse, libc_feresetround_sse)
>>>>>
>>>>> So
>>>>>
>>>>>>> diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>>> index 37c1538c08..104ace1690 100644
>>>>>>> --- a/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>>> +++ b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>>> @@ -66,6 +66,7 @@
>>>>>>>     #include <inttypes.h>
>>>>>>>     #include <math-barriers.h>
>>>>>>>     #include <math_private.h>
>>>>>>> +#include <fenv_private.h>
>>>>>>>     #include <math-underflow.h>
>>>>>>>     #include <stdlib.h>
>>>>>>>     #include "t_expl.h"
>>>>>>> @@ -146,9 +147,10 @@ __ieee754_expl (_Float128 x)
>>>>>>>           union ieee854_long_double ex2_u, scale_u;
>>>>>>>           fenv_t oldenv;
>>>>>>>     -      feholdexcept (&oldenv);
>>>>>>>     #ifdef FE_TONEAREST
>>>>>>> -      fesetround (FE_TONEAREST);
>>>>>>> +      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);
>>>>>
>>>>> Should be libc_feholdexcept_setroundf128.
>>>>
>>>> But it does not see to help here, so I don't know what is failing as well.
>>>
>>> Ok, so what is happening __sfp_handle_exceptions always use 387 exception
>>> mode for FP_EX_OVERFLOW and FP_EX_UNDERFLOW:
>>>
>>> config/i386/sfp-exceptions.c
>>>
>>>    79   if (_fex & FP_EX_OVERFLOW)
>>>    80     {
>>>    81       struct fenv temp;
>>>    82       asm volatile ("fnstenv\t%0" : "=m" (temp));
>>>    83       temp.__status_word |= FP_EX_OVERFLOW;
>>>    84       asm volatile ("fldenv\t%0" : : "m" (temp));
>>>    85       asm volatile ("fwait");
>>>    86     }
>>>    87   if (_fex & FP_EX_UNDERFLOW)
>>>    88     {
>>>    89       struct fenv temp;
>>>    90       asm volatile ("fnstenv\t%0" : "=m" (temp));
>>>    91       temp.__status_word |= FP_EX_UNDERFLOW;
>>>    92       asm volatile ("fldenv\t%0" : : "m" (temp));
>>>    93       asm volatile ("fwait");
>>>    94     }
>>> Yes this looks like the mentioned disassembly.
>>> Different that FP_EX_INEXACT, for instance, where __SSE_MATH__ sets
>>> whether SSE is used or not.
>>>
>>> So I think it is not safe to use the SSE variants for libc_*_testf128,
>>> as for i387 we should use the default_* instead.
>>>
>> I've just switched to default_* in sysdeps/x86/fpu/fenv_private.h:
>> -#ifdef __x86_64__
>> +#if 0
>>   /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>>      x86_64, so that must be set for float128 computations.  */
>>   # define SET_RESTORE_ROUNDF128(RM) \
>>
>> But now there are 7 testfails. For some of them, the max.ulp goes up (over 10), but there are even worse cases. Here are shortened excerpts of the out files:
>> - math/test-float128-clog.out:
>> Failure: Test: Real part of: clog_towardzero (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>   ulp       :  162259276829213363391578010288127.0000
>>   max.ulp   :  3.0000
>> Failure: Test: Real part of: clog_upward (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>   ulp       :  162259276829213363391578010288128.0000
>>   max.ulp   :  4.0000
>>
>> - math/test-float128-clog10.out:
>> Failure: Test: Real part of: clog10_downward (0x3.bea2bd62e35p-4 + 0xf.8e3d619a8d11bfd30b038eep-4 i)
>>   ulp       :  4.0000
>>   max.ulp   :  3.0000
>> Failure: Test: Real part of: clog10_towardzero (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>   ulp       :  140936617129079063283494433422698.0000
>>   max.ulp   :  4.0000
>> Failure: Test: Real part of: clog10_upward (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>   ulp       :  140936617129079063283494433422698.0000
>>   max.ulp   :  4.0000
>>
>> - math/test-float128-jn.out
>> - math/test-float128-lgamma.out
>> - math/test-float128-tgamma.out:
>> something like:
>>   ulp       :  12.0000
>>   max.ulp   :  4.0000
>>
>> Failure: tgamma_upward (-0x6.ec00000000000008p+8): errno set to 0, expected 34 (ERANGE)
>>
>> - math/test-float128-y1.out:
>> Failure: Test: y1_downward (0x2p+0)
>>   ulp       :  13.0000
>>   max.ulp   :  4.0000
>> Failure: Test: y1_towardzero (0x2p+0)
>>   ulp       :  6.0000
>>   max.ulp   :  2.0000
>> Failure: Test: y1_upward (0x2p+0)
>>   ulp       :  10.0000
>>   max.ulp   :  5.0000
>>
>> - math/test-float128-yn.out
>>
> 
> So it seems the issue is the mix on how libm fenv function, the internal
> libc_fe*, and libgcc handles the exception register.  The exported fenv
> operates on both i387 and SSE (since it should work on long double as well),
> and the internal libc_fe* will set either SSE for float, double, and float128
> and i387 for long double (as expected).
> 
> The libgcc, however, will set either SEE or i387 depending of the exception.
> This broke the assumption of libc_fe* for float128 where either SSE or i387
> will be used.
> 
> One option might be to force libgcc to not use its __sfp_handle_exceptions
> on x86_64 and provide one that uses only SEE operations since libgcc does
> not use 'long double' on float128 operations.  The patch below does it
> and applied on top your patches shows no regressions.

Great news. Thanks Adhemerval.
I've also successfully build and run the testsuite with your patch on 
top of mine and with only your patch without mine.

As e.g. __multf3 or __addtf3 is used in various f128 functions, can you 
please first commit your patch? Then I will add a reference to this 
commit id in the commit-message.

One other question: Why are the soft-fp functions (for add / multiply) 
called at all. Are the corresponding hardware instructions not available 
on all x86_64 machines? Or do we miss a compiler flag?
> 
> And I think we should fix libgcc in a similar manner, since checking on
> config/i386/64/sfp-machine.h it only support SSE rounding mode.
> 
> --
> 
> diff --git a/sysdeps/x86/fpu/sfp-exceptions.c b/sysdeps/x86/fpu/sfp-exceptions.c
> new file mode 100644
> index 0000000000..676f396bc3
> --- /dev/null
> +++ b/sysdeps/x86/fpu/sfp-exceptions.c
> @@ -0,0 +1,49 @@
> +#include <fenv.h>
> +#include <float.h>
> +#include <math-barriers.h>
> +
> +#define FP_EX_INVALID           0x01
> +#define FP_EX_DENORM            0x02
> +#define FP_EX_DIVZERO           0x04
> +#define FP_EX_OVERFLOW          0x08
> +#define FP_EX_UNDERFLOW         0x10
> +#define FP_EX_INEXACT           0x20
> +#define FP_EX_ALL \
> +        (FP_EX_INVALID | FP_EX_DENORM | FP_EX_DIVZERO | FP_EX_OVERFLOW \
> +         | FP_EX_UNDERFLOW | FP_EX_INEXACT)
> +
> +void
> +__sfp_handle_exceptions (int _fex)
> +{
> +  if (_fex & FP_EX_INVALID)
> +    {
> +      float f = 0.0f;
> +      math_force_eval (f / f);
> +    }
> +  if (_fex & FP_EX_DENORM)
> +    {
> +      float f = FLT_MIN, g = 2.0f;
> +      math_force_eval (f / g);
> +    }
> +  if (_fex & FP_EX_DIVZERO)
> +    {
> +      float f = 1.0f, g = 0.0f;
> +      math_force_eval (f / g);
> +    }
> +  if (_fex & FP_EX_OVERFLOW)
> +    {
> +      float force_underflow = FLT_MAX * FLT_MAX;
> +      math_force_eval (force_underflow);
> +    }
> +  if (_fex & FP_EX_UNDERFLOW)
> +    {
> +      float force_overflow = FLT_MIN * FLT_MIN;
> +      math_force_eval (force_overflow);
> +    }
> +  if (_fex & FP_EX_INEXACT)
> +    {
> +      float f = 1.0f, g = 3.0f;
> +      math_force_eval (f / g);
> +    }
> +}
> +strong_alias (__sfp_handle_exceptions, __wrap___sfp_handle_exceptions)
> diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
> index a4ff2723a8..5becb96fa3 100644
> --- a/sysdeps/x86_64/fpu/Makefile
> +++ b/sysdeps/x86_64/fpu/Makefile
> @@ -25,6 +25,9 @@ endif
> 
>   # Variables for libmvec tests.
>   ifeq ($(subdir),math)
> +libm-routines += sfp-exceptions
> +LDFLAGS-m.so += -Wl,--wrap=__sfp_handle_exceptions
> +
>   ifeq ($(build-mathvec),yes)
>   libmvec-tests += double-vlen2 double-vlen4 double-vlen4-avx2 \
>   		 float-vlen4 float-vlen8 float-vlen8-avx2
>
  
Adhemerval Zanella March 30, 2020, 6:12 p.m. UTC | #9
On 27/03/2020 11:23, Stefan Liebler via Libc-alpha wrote:
> On 3/26/20 3:53 PM, Adhemerval Zanella via Libc-alpha wrote:
>>
>>
>> On 26/03/2020 06:08, Stefan Liebler via Libc-alpha wrote:
>>> On 3/25/20 4:42 PM, Adhemerval Zanella via Libc-alpha wrote:
>>>>
>>>>
>>>> On 25/03/2020 12:07, Adhemerval Zanella wrote:
>>>>>
>>>>>
>>>>> On 25/03/2020 12:00, Adhemerval Zanella wrote:
>>>>>>
>>>>>>
>>>>>> On 25/03/2020 07:13, Stefan Liebler via Libc-alpha wrote:
>>>>>>> Unfortunately, this patch is responsible for testfails on x86_64:
>>>>>>>
>>>>>>> math/test-float128-exp.out:
>>>>>>> Failure: exp (-0x1p-10000): Exception "Underflow" set
>>>>>>> Failure: exp (-0x2p-16384): Exception "Underflow" set
>>>>>>> ...
>>>>>>>
>>>>>>> math/test-float128-cexp.out:
>>>>>>> Failure: Real part of: cexp (0x2p-16384 - 0x4p-1076 i): Exception "Underflow" set
>>>>>>> Failure: Real part of: cexp (0x2p-16384 - 0x8p-152 i): Exception "Underflow" set
>>>>>>
>>>>>> The sysdeps/x86/fpu/fenv_private.h states:
>>>>>>
>>>>>> 296 #ifdef __x86_64__
>>>>>> 297 /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>>>>>> 298    x86_64, so that must be set for float128 computations.  */
>>>>>> 299 # define SET_RESTORE_ROUNDF128(RM) \
>>>>>> 300   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_sse, libc_feresetround_sse)
>>>>>>
>>>>>> So
>>>>>>
>>>>>>>> diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>>>> index 37c1538c08..104ace1690 100644
>>>>>>>> --- a/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>>>> +++ b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>>>> @@ -66,6 +66,7 @@
>>>>>>>>     #include <inttypes.h>
>>>>>>>>     #include <math-barriers.h>
>>>>>>>>     #include <math_private.h>
>>>>>>>> +#include <fenv_private.h>
>>>>>>>>     #include <math-underflow.h>
>>>>>>>>     #include <stdlib.h>
>>>>>>>>     #include "t_expl.h"
>>>>>>>> @@ -146,9 +147,10 @@ __ieee754_expl (_Float128 x)
>>>>>>>>           union ieee854_long_double ex2_u, scale_u;
>>>>>>>>           fenv_t oldenv;
>>>>>>>>     -      feholdexcept (&oldenv);
>>>>>>>>     #ifdef FE_TONEAREST
>>>>>>>> -      fesetround (FE_TONEAREST);
>>>>>>>> +      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);
>>>>>>
>>>>>> Should be libc_feholdexcept_setroundf128.
>>>>>
>>>>> But it does not see to help here, so I don't know what is failing as well.
>>>>
>>>> Ok, so what is happening __sfp_handle_exceptions always use 387 exception
>>>> mode for FP_EX_OVERFLOW and FP_EX_UNDERFLOW:
>>>>
>>>> config/i386/sfp-exceptions.c
>>>>
>>>>    79   if (_fex & FP_EX_OVERFLOW)
>>>>    80     {
>>>>    81       struct fenv temp;
>>>>    82       asm volatile ("fnstenv\t%0" : "=m" (temp));
>>>>    83       temp.__status_word |= FP_EX_OVERFLOW;
>>>>    84       asm volatile ("fldenv\t%0" : : "m" (temp));
>>>>    85       asm volatile ("fwait");
>>>>    86     }
>>>>    87   if (_fex & FP_EX_UNDERFLOW)
>>>>    88     {
>>>>    89       struct fenv temp;
>>>>    90       asm volatile ("fnstenv\t%0" : "=m" (temp));
>>>>    91       temp.__status_word |= FP_EX_UNDERFLOW;
>>>>    92       asm volatile ("fldenv\t%0" : : "m" (temp));
>>>>    93       asm volatile ("fwait");
>>>>    94     }
>>>> Yes this looks like the mentioned disassembly.
>>>> Different that FP_EX_INEXACT, for instance, where __SSE_MATH__ sets
>>>> whether SSE is used or not.
>>>>
>>>> So I think it is not safe to use the SSE variants for libc_*_testf128,
>>>> as for i387 we should use the default_* instead.
>>>>
>>> I've just switched to default_* in sysdeps/x86/fpu/fenv_private.h:
>>> -#ifdef __x86_64__
>>> +#if 0
>>>   /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>>>      x86_64, so that must be set for float128 computations.  */
>>>   # define SET_RESTORE_ROUNDF128(RM) \
>>>
>>> But now there are 7 testfails. For some of them, the max.ulp goes up (over 10), but there are even worse cases. Here are shortened excerpts of the out files:
>>> - math/test-float128-clog.out:
>>> Failure: Test: Real part of: clog_towardzero (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>>   ulp       :  162259276829213363391578010288127.0000
>>>   max.ulp   :  3.0000
>>> Failure: Test: Real part of: clog_upward (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>>   ulp       :  162259276829213363391578010288128.0000
>>>   max.ulp   :  4.0000
>>>
>>> - math/test-float128-clog10.out:
>>> Failure: Test: Real part of: clog10_downward (0x3.bea2bd62e35p-4 + 0xf.8e3d619a8d11bfd30b038eep-4 i)
>>>   ulp       :  4.0000
>>>   max.ulp   :  3.0000
>>> Failure: Test: Real part of: clog10_towardzero (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>>   ulp       :  140936617129079063283494433422698.0000
>>>   max.ulp   :  4.0000
>>> Failure: Test: Real part of: clog10_upward (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>>   ulp       :  140936617129079063283494433422698.0000
>>>   max.ulp   :  4.0000
>>>
>>> - math/test-float128-jn.out
>>> - math/test-float128-lgamma.out
>>> - math/test-float128-tgamma.out:
>>> something like:
>>>   ulp       :  12.0000
>>>   max.ulp   :  4.0000
>>>
>>> Failure: tgamma_upward (-0x6.ec00000000000008p+8): errno set to 0, expected 34 (ERANGE)
>>>
>>> - math/test-float128-y1.out:
>>> Failure: Test: y1_downward (0x2p+0)
>>>   ulp       :  13.0000
>>>   max.ulp   :  4.0000
>>> Failure: Test: y1_towardzero (0x2p+0)
>>>   ulp       :  6.0000
>>>   max.ulp   :  2.0000
>>> Failure: Test: y1_upward (0x2p+0)
>>>   ulp       :  10.0000
>>>   max.ulp   :  5.0000
>>>
>>> - math/test-float128-yn.out
>>>
>>
>> So it seems the issue is the mix on how libm fenv function, the internal
>> libc_fe*, and libgcc handles the exception register.  The exported fenv
>> operates on both i387 and SSE (since it should work on long double as well),
>> and the internal libc_fe* will set either SSE for float, double, and float128
>> and i387 for long double (as expected).
>>
>> The libgcc, however, will set either SEE or i387 depending of the exception.
>> This broke the assumption of libc_fe* for float128 where either SSE or i387
>> will be used.
>>
>> One option might be to force libgcc to not use its __sfp_handle_exceptions
>> on x86_64 and provide one that uses only SEE operations since libgcc does
>> not use 'long double' on float128 operations.  The patch below does it
>> and applied on top your patches shows no regressions.
> 
> Great news. Thanks Adhemerval.
> I've also successfully build and run the testsuite with your patch on top of mine and with only your patch without mine.
> 
> As e.g. __multf3 or __addtf3 is used in various f128 functions, can you please first commit your patch? Then I will add a reference to this commit id in the commit-message.

I will send a RFC for this patch, we need to check with x86 maintainers
if this the desirable direction and if I got everything right. 

> 
> One other question: Why are the soft-fp functions (for add / multiply) called at all. Are the corresponding hardware instructions not available on all x86_64 machines? Or do we miss a compiler flag?

The float128 on gcc/x86_64 is implemented by soft-fp library in libgcc [1]
and its ABI passes arguments through SSE register [2].

[1] https://stackoverflow.com/questions/26639477/what-exactly-is-a-float128-if-im-using-gcc-4-9-on-x86-64
[2] https://github.com/hjl-tools/x86-psABI/wiki/X86-psABI
  
Stefan Liebler March 31, 2020, 7:39 a.m. UTC | #10
On 3/30/20 8:12 PM, Adhemerval Zanella via Libc-alpha wrote:
> 
> 
> On 27/03/2020 11:23, Stefan Liebler via Libc-alpha wrote:
>> On 3/26/20 3:53 PM, Adhemerval Zanella via Libc-alpha wrote:
>>>
>>>
>>> On 26/03/2020 06:08, Stefan Liebler via Libc-alpha wrote:
>>>> On 3/25/20 4:42 PM, Adhemerval Zanella via Libc-alpha wrote:
>>>>>
>>>>>
>>>>> On 25/03/2020 12:07, Adhemerval Zanella wrote:
>>>>>>
>>>>>>
>>>>>> On 25/03/2020 12:00, Adhemerval Zanella wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 25/03/2020 07:13, Stefan Liebler via Libc-alpha wrote:
>>>>>>>> Unfortunately, this patch is responsible for testfails on x86_64:
>>>>>>>>
>>>>>>>> math/test-float128-exp.out:
>>>>>>>> Failure: exp (-0x1p-10000): Exception "Underflow" set
>>>>>>>> Failure: exp (-0x2p-16384): Exception "Underflow" set
>>>>>>>> ...
>>>>>>>>
>>>>>>>> math/test-float128-cexp.out:
>>>>>>>> Failure: Real part of: cexp (0x2p-16384 - 0x4p-1076 i): Exception "Underflow" set
>>>>>>>> Failure: Real part of: cexp (0x2p-16384 - 0x8p-152 i): Exception "Underflow" set
>>>>>>>
>>>>>>> The sysdeps/x86/fpu/fenv_private.h states:
>>>>>>>
>>>>>>> 296 #ifdef __x86_64__
>>>>>>> 297 /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>>>>>>> 298    x86_64, so that must be set for float128 computations.  */
>>>>>>> 299 # define SET_RESTORE_ROUNDF128(RM) \
>>>>>>> 300   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_sse, libc_feresetround_sse)
>>>>>>>
>>>>>>> So
>>>>>>>
>>>>>>>>> diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>>>>> index 37c1538c08..104ace1690 100644
>>>>>>>>> --- a/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>>>>> +++ b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>>>>> @@ -66,6 +66,7 @@
>>>>>>>>>      #include <inttypes.h>
>>>>>>>>>      #include <math-barriers.h>
>>>>>>>>>      #include <math_private.h>
>>>>>>>>> +#include <fenv_private.h>
>>>>>>>>>      #include <math-underflow.h>
>>>>>>>>>      #include <stdlib.h>
>>>>>>>>>      #include "t_expl.h"
>>>>>>>>> @@ -146,9 +147,10 @@ __ieee754_expl (_Float128 x)
>>>>>>>>>            union ieee854_long_double ex2_u, scale_u;
>>>>>>>>>            fenv_t oldenv;
>>>>>>>>>      -      feholdexcept (&oldenv);
>>>>>>>>>      #ifdef FE_TONEAREST
>>>>>>>>> -      fesetround (FE_TONEAREST);
>>>>>>>>> +      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);
>>>>>>>
>>>>>>> Should be libc_feholdexcept_setroundf128.
>>>>>>
>>>>>> But it does not see to help here, so I don't know what is failing as well.
>>>>>
>>>>> Ok, so what is happening __sfp_handle_exceptions always use 387 exception
>>>>> mode for FP_EX_OVERFLOW and FP_EX_UNDERFLOW:
>>>>>
>>>>> config/i386/sfp-exceptions.c
>>>>>
>>>>>     79   if (_fex & FP_EX_OVERFLOW)
>>>>>     80     {
>>>>>     81       struct fenv temp;
>>>>>     82       asm volatile ("fnstenv\t%0" : "=m" (temp));
>>>>>     83       temp.__status_word |= FP_EX_OVERFLOW;
>>>>>     84       asm volatile ("fldenv\t%0" : : "m" (temp));
>>>>>     85       asm volatile ("fwait");
>>>>>     86     }
>>>>>     87   if (_fex & FP_EX_UNDERFLOW)
>>>>>     88     {
>>>>>     89       struct fenv temp;
>>>>>     90       asm volatile ("fnstenv\t%0" : "=m" (temp));
>>>>>     91       temp.__status_word |= FP_EX_UNDERFLOW;
>>>>>     92       asm volatile ("fldenv\t%0" : : "m" (temp));
>>>>>     93       asm volatile ("fwait");
>>>>>     94     }
>>>>> Yes this looks like the mentioned disassembly.
>>>>> Different that FP_EX_INEXACT, for instance, where __SSE_MATH__ sets
>>>>> whether SSE is used or not.
>>>>>
>>>>> So I think it is not safe to use the SSE variants for libc_*_testf128,
>>>>> as for i387 we should use the default_* instead.
>>>>>
>>>> I've just switched to default_* in sysdeps/x86/fpu/fenv_private.h:
>>>> -#ifdef __x86_64__
>>>> +#if 0
>>>>    /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>>>>       x86_64, so that must be set for float128 computations.  */
>>>>    # define SET_RESTORE_ROUNDF128(RM) \
>>>>
>>>> But now there are 7 testfails. For some of them, the max.ulp goes up (over 10), but there are even worse cases. Here are shortened excerpts of the out files:
>>>> - math/test-float128-clog.out:
>>>> Failure: Test: Real part of: clog_towardzero (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>>>    ulp       :  162259276829213363391578010288127.0000
>>>>    max.ulp   :  3.0000
>>>> Failure: Test: Real part of: clog_upward (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>>>    ulp       :  162259276829213363391578010288128.0000
>>>>    max.ulp   :  4.0000
>>>>
>>>> - math/test-float128-clog10.out:
>>>> Failure: Test: Real part of: clog10_downward (0x3.bea2bd62e35p-4 + 0xf.8e3d619a8d11bfd30b038eep-4 i)
>>>>    ulp       :  4.0000
>>>>    max.ulp   :  3.0000
>>>> Failure: Test: Real part of: clog10_towardzero (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>>>    ulp       :  140936617129079063283494433422698.0000
>>>>    max.ulp   :  4.0000
>>>> Failure: Test: Real part of: clog10_upward (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>>>>    ulp       :  140936617129079063283494433422698.0000
>>>>    max.ulp   :  4.0000
>>>>
>>>> - math/test-float128-jn.out
>>>> - math/test-float128-lgamma.out
>>>> - math/test-float128-tgamma.out:
>>>> something like:
>>>>    ulp       :  12.0000
>>>>    max.ulp   :  4.0000
>>>>
>>>> Failure: tgamma_upward (-0x6.ec00000000000008p+8): errno set to 0, expected 34 (ERANGE)
>>>>
>>>> - math/test-float128-y1.out:
>>>> Failure: Test: y1_downward (0x2p+0)
>>>>    ulp       :  13.0000
>>>>    max.ulp   :  4.0000
>>>> Failure: Test: y1_towardzero (0x2p+0)
>>>>    ulp       :  6.0000
>>>>    max.ulp   :  2.0000
>>>> Failure: Test: y1_upward (0x2p+0)
>>>>    ulp       :  10.0000
>>>>    max.ulp   :  5.0000
>>>>
>>>> - math/test-float128-yn.out
>>>>
>>>
>>> So it seems the issue is the mix on how libm fenv function, the internal
>>> libc_fe*, and libgcc handles the exception register.  The exported fenv
>>> operates on both i387 and SSE (since it should work on long double as well),
>>> and the internal libc_fe* will set either SSE for float, double, and float128
>>> and i387 for long double (as expected).
>>>
>>> The libgcc, however, will set either SEE or i387 depending of the exception.
>>> This broke the assumption of libc_fe* for float128 where either SSE or i387
>>> will be used.
>>>
>>> One option might be to force libgcc to not use its __sfp_handle_exceptions
>>> on x86_64 and provide one that uses only SEE operations since libgcc does
>>> not use 'long double' on float128 operations.  The patch below does it
>>> and applied on top your patches shows no regressions.
>>
>> Great news. Thanks Adhemerval.
>> I've also successfully build and run the testsuite with your patch on top of mine and with only your patch without mine.
>>
>> As e.g. __multf3 or __addtf3 is used in various f128 functions, can you please first commit your patch? Then I will add a reference to this commit id in the commit-message.
> 
> I will send a RFC for this patch, we need to check with x86 maintainers
> if this the desirable direction and if I got everything right.
> 
>>
>> One other question: Why are the soft-fp functions (for add / multiply) called at all. Are the corresponding hardware instructions not available on all x86_64 machines? Or do we miss a compiler flag?
> 
> The float128 on gcc/x86_64 is implemented by soft-fp library in libgcc [1]
> and its ABI passes arguments through SSE register [2].
> 
> [1] https://stackoverflow.com/questions/26639477/what-exactly-is-a-float128-if-im-using-gcc-4-9-on-x86-64
> [2] https://github.com/hjl-tools/x86-psABI/wiki/X86-psABI
> 

Thanks for the info and for working on this.
  

Patch

diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
index 37c1538c08..104ace1690 100644
--- a/sysdeps/ieee754/ldbl-128/e_expl.c
+++ b/sysdeps/ieee754/ldbl-128/e_expl.c
@@ -66,6 +66,7 @@ 
 #include <inttypes.h>
 #include <math-barriers.h>
 #include <math_private.h>
+#include <fenv_private.h>
 #include <math-underflow.h>
 #include <stdlib.h>
 #include "t_expl.h"
@@ -146,9 +147,10 @@  __ieee754_expl (_Float128 x)
       union ieee854_long_double ex2_u, scale_u;
       fenv_t oldenv;
 
-      feholdexcept (&oldenv);
 #ifdef FE_TONEAREST
-      fesetround (FE_TONEAREST);
+      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);
+#else
+      libc_feholdexceptl (&oldenv);
 #endif
 
       /* Calculate n.  */
@@ -198,7 +200,7 @@  __ieee754_expl (_Float128 x)
       math_force_eval (x22);
 
       /* Return result.  */
-      fesetenv (&oldenv);
+      libc_fesetenvl (&oldenv);
 
       result = x22 * ex2_u.d + ex2_u.d;