x86_64: Optimize sincos where sin/cos is optimized (bug 29193)

Message ID mvm35gp6e93.fsf@suse.de
State Committed
Commit dc1e5eeb25c4bcb1cc0c883a2d67cf93eb252478
Headers
Series x86_64: Optimize sincos where sin/cos is optimized (bug 29193) |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Andreas Schwab May 31, 2022, 1:08 p.m. UTC
  The compiler may substitute calls to sin or cos with calls to sincos, thus
we should have the same optimized implementations for sincos.  The
optimized implementations may produce results that differ, that also makes
sure that the sincos call aggrees with the sin and cos calls.
---
 sysdeps/ieee754/dbl-64/s_sincos.c            |  7 +++++
 sysdeps/x86_64/fpu/multiarch/Makefile        | 12 ++++++--
 sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c  |  3 ++
 sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c  |  3 ++
 sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c |  3 ++
 sysdeps/x86_64/fpu/multiarch/s_sincos.c      | 30 ++++++++++++++++++++
 6 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos.c
  

Comments

Adhemerval Zanella May 31, 2022, 4:27 p.m. UTC | #1
On 31/05/2022 10:08, Andreas Schwab via Libc-alpha wrote:
> The compiler may substitute calls to sin or cos with calls to sincos, thus
> we should have the same optimized implementations for sincos.  The
> optimized implementations may produce results that differ, that also makes
> sure that the sincos call aggrees with the sin and cos calls.

LGTM, thanks.

I think we will need to fix for i686 as well, since it provides a
similar sse2 optimized version.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> ---
>  sysdeps/ieee754/dbl-64/s_sincos.c            |  7 +++++
>  sysdeps/x86_64/fpu/multiarch/Makefile        | 12 ++++++--
>  sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c  |  3 ++
>  sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c  |  3 ++
>  sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c |  3 ++
>  sysdeps/x86_64/fpu/multiarch/s_sincos.c      | 30 ++++++++++++++++++++
>  6 files changed, 55 insertions(+), 3 deletions(-)
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos.c
> 
> diff --git a/sysdeps/ieee754/dbl-64/s_sincos.c b/sysdeps/ieee754/dbl-64/s_sincos.c
> index 12c3021e66..890c137ebb 100644
> --- a/sysdeps/ieee754/dbl-64/s_sincos.c
> +++ b/sysdeps/ieee754/dbl-64/s_sincos.c
> @@ -24,10 +24,15 @@
>  #include <math-underflow.h>
>  #include <libm-alias-double.h>
>  
> +#ifndef SECTION
> +# define SECTION
> +#endif
> +
>  #define IN_SINCOS
>  #include "s_sin.c"
>  
>  void
> +SECTION
>  __sincos (double x, double *sinx, double *cosx)
>  {
>    mynumber u;
> @@ -100,4 +105,6 @@ __sincos (double x, double *sinx, double *cosx)
>  
>    *sinx = *cosx = x / x;
>  }
> +#ifndef __sincos
>  libm_alias_double (__sincos, sincos)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
> index ec796277a5..248162525b 100644
> --- a/sysdeps/x86_64/fpu/multiarch/Makefile
> +++ b/sysdeps/x86_64/fpu/multiarch/Makefile
> @@ -10,7 +10,8 @@ libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
>  			s_trunc-sse4_1 s_truncf-sse4_1
>  
>  libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
> -			e_asin-fma e_atan2-fma s_sin-fma s_tan-fma
> +			e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \
> +			s_sincos-fma
>  
>  CFLAGS-e_asin-fma.c = -mfma -mavx2
>  CFLAGS-e_atan2-fma.c = -mfma -mavx2
> @@ -20,6 +21,7 @@ CFLAGS-e_pow-fma.c = -mfma -mavx2
>  CFLAGS-s_atan-fma.c = -mfma -mavx2
>  CFLAGS-s_sin-fma.c = -mfma -mavx2
>  CFLAGS-s_tan-fma.c = -mfma -mavx2
> +CFLAGS-s_sincos-fma.c = -mfma -mavx2
>  
>  libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
>  
> @@ -36,7 +38,8 @@ CFLAGS-s_cosf-fma.c = -mfma -mavx2
>  CFLAGS-s_sincosf-fma.c = -mfma -mavx2
>  
>  libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
> -			e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4
> +			e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
> +			s_sincos-fma4
>  
>  CFLAGS-e_asin-fma4.c = -mfma4
>  CFLAGS-e_atan2-fma4.c = -mfma4
> @@ -46,9 +49,11 @@ CFLAGS-e_pow-fma4.c = -mfma4
>  CFLAGS-s_atan-fma4.c = -mfma4
>  CFLAGS-s_sin-fma4.c = -mfma4
>  CFLAGS-s_tan-fma4.c = -mfma4
> +CFLAGS-s_sincos-fma4.c = -mfma4
>  
>  libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
> -			e_atan2-avx s_sin-avx s_tan-avx
> +			e_atan2-avx s_sin-avx s_tan-avx \
> +			s_sincos-avx
>  
>  CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX
>  CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX
> @@ -56,6 +61,7 @@ CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX
>  CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX
>  CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX
>  CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX
> +CFLAGS-s_sincos-avx.c = -msse2avx -DSSE2AVX
>  endif
>  
>  ifeq ($(subdir),mathvec)
> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
> new file mode 100644
> index 0000000000..debea0f619
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
> @@ -0,0 +1,3 @@
> +#define __sincos __sincos_avx
> +#define SECTION __attribute__ ((section (".text.avx")))
> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
> new file mode 100644
> index 0000000000..31610b3356
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
> @@ -0,0 +1,3 @@
> +#define __sincos __sincos_fma
> +#define SECTION __attribute__ ((section (".text.fma")))
> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
> new file mode 100644
> index 0000000000..7e8b8e6484
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
> @@ -0,0 +1,3 @@
> +#define __sincos __sincos_fma4
> +#define SECTION __attribute__ ((section (".text.fma4")))
> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos.c b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
> new file mode 100644
> index 0000000000..67c2817d68
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
> @@ -0,0 +1,30 @@
> +/* Multiple versions of sincos.
> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.

I think it should be only 2022 here.

> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <libm-alias-double.h>
> +
> +extern void __redirect_sincos (double, double *, double *);
> +
> +#define SYMBOL_NAME sincos
> +#include "ifunc-fma4.h"
> +
> +libc_ifunc_redirected (__redirect_sincos, __sincos, IFUNC_SELECTOR ());
> +libm_alias_double (__sincos, sincos)
> +
> +#define __sincos __sincos_sse2
> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
  
H.J. Lu May 31, 2022, 6:42 p.m. UTC | #2
On Tue, May 31, 2022 at 9:32 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> On 31/05/2022 10:08, Andreas Schwab via Libc-alpha wrote:
> > The compiler may substitute calls to sin or cos with calls to sincos, thus
> > we should have the same optimized implementations for sincos.  The
> > optimized implementations may produce results that differ, that also makes
> > sure that the sincos call aggrees with the sin and cos calls.
>
> LGTM, thanks.
>
> I think we will need to fix for i686 as well, since it provides a
> similar sse2 optimized version.

Since this is an optimization, should we leave i686 alone?

> Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
>
> > ---
> >  sysdeps/ieee754/dbl-64/s_sincos.c            |  7 +++++
> >  sysdeps/x86_64/fpu/multiarch/Makefile        | 12 ++++++--
> >  sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c  |  3 ++
> >  sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c  |  3 ++
> >  sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c |  3 ++
> >  sysdeps/x86_64/fpu/multiarch/s_sincos.c      | 30 ++++++++++++++++++++
> >  6 files changed, 55 insertions(+), 3 deletions(-)
> >  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
> >  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
> >  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
> >  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos.c
> >
> > diff --git a/sysdeps/ieee754/dbl-64/s_sincos.c b/sysdeps/ieee754/dbl-64/s_sincos.c
> > index 12c3021e66..890c137ebb 100644
> > --- a/sysdeps/ieee754/dbl-64/s_sincos.c
> > +++ b/sysdeps/ieee754/dbl-64/s_sincos.c
> > @@ -24,10 +24,15 @@
> >  #include <math-underflow.h>
> >  #include <libm-alias-double.h>
> >
> > +#ifndef SECTION
> > +# define SECTION
> > +#endif
> > +
> >  #define IN_SINCOS
> >  #include "s_sin.c"
> >
> >  void
> > +SECTION
> >  __sincos (double x, double *sinx, double *cosx)
> >  {
> >    mynumber u;
> > @@ -100,4 +105,6 @@ __sincos (double x, double *sinx, double *cosx)
> >
> >    *sinx = *cosx = x / x;
> >  }
> > +#ifndef __sincos
> >  libm_alias_double (__sincos, sincos)
> > +#endif
> > diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
> > index ec796277a5..248162525b 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/Makefile
> > +++ b/sysdeps/x86_64/fpu/multiarch/Makefile
> > @@ -10,7 +10,8 @@ libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
> >                       s_trunc-sse4_1 s_truncf-sse4_1
> >
> >  libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
> > -                     e_asin-fma e_atan2-fma s_sin-fma s_tan-fma
> > +                     e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \
> > +                     s_sincos-fma
> >
> >  CFLAGS-e_asin-fma.c = -mfma -mavx2
> >  CFLAGS-e_atan2-fma.c = -mfma -mavx2
> > @@ -20,6 +21,7 @@ CFLAGS-e_pow-fma.c = -mfma -mavx2
> >  CFLAGS-s_atan-fma.c = -mfma -mavx2
> >  CFLAGS-s_sin-fma.c = -mfma -mavx2
> >  CFLAGS-s_tan-fma.c = -mfma -mavx2
> > +CFLAGS-s_sincos-fma.c = -mfma -mavx2
> >
> >  libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
> >
> > @@ -36,7 +38,8 @@ CFLAGS-s_cosf-fma.c = -mfma -mavx2
> >  CFLAGS-s_sincosf-fma.c = -mfma -mavx2
> >
> >  libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
> > -                     e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4
> > +                     e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
> > +                     s_sincos-fma4
> >
> >  CFLAGS-e_asin-fma4.c = -mfma4
> >  CFLAGS-e_atan2-fma4.c = -mfma4
> > @@ -46,9 +49,11 @@ CFLAGS-e_pow-fma4.c = -mfma4
> >  CFLAGS-s_atan-fma4.c = -mfma4
> >  CFLAGS-s_sin-fma4.c = -mfma4
> >  CFLAGS-s_tan-fma4.c = -mfma4
> > +CFLAGS-s_sincos-fma4.c = -mfma4
> >
> >  libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
> > -                     e_atan2-avx s_sin-avx s_tan-avx
> > +                     e_atan2-avx s_sin-avx s_tan-avx \
> > +                     s_sincos-avx
> >
> >  CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX
> >  CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX
> > @@ -56,6 +61,7 @@ CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX
> >  CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX
> >  CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX
> >  CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX
> > +CFLAGS-s_sincos-avx.c = -msse2avx -DSSE2AVX
> >  endif
> >
> >  ifeq ($(subdir),mathvec)
> > diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
> > new file mode 100644
> > index 0000000000..debea0f619
> > --- /dev/null
> > +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
> > @@ -0,0 +1,3 @@
> > +#define __sincos __sincos_avx
> > +#define SECTION __attribute__ ((section (".text.avx")))
> > +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
> > diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
> > new file mode 100644
> > index 0000000000..31610b3356
> > --- /dev/null
> > +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
> > @@ -0,0 +1,3 @@
> > +#define __sincos __sincos_fma
> > +#define SECTION __attribute__ ((section (".text.fma")))
> > +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
> > diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
> > new file mode 100644
> > index 0000000000..7e8b8e6484
> > --- /dev/null
> > +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
> > @@ -0,0 +1,3 @@
> > +#define __sincos __sincos_fma4
> > +#define SECTION __attribute__ ((section (".text.fma4")))
> > +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
> > diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos.c b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
> > new file mode 100644
> > index 0000000000..67c2817d68
> > --- /dev/null
> > +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
> > @@ -0,0 +1,30 @@
> > +/* Multiple versions of sincos.
> > +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
>
> I think it should be only 2022 here.
>
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <libm-alias-double.h>
> > +
> > +extern void __redirect_sincos (double, double *, double *);
> > +
> > +#define SYMBOL_NAME sincos
> > +#include "ifunc-fma4.h"
> > +
> > +libc_ifunc_redirected (__redirect_sincos, __sincos, IFUNC_SELECTOR ());
> > +libm_alias_double (__sincos, sincos)
> > +
> > +#define __sincos __sincos_sse2
> > +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
  
Adhemerval Zanella May 31, 2022, 7:08 p.m. UTC | #3
On 31/05/2022 15:42, H.J. Lu wrote:
> On Tue, May 31, 2022 at 9:32 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>>
>>
>> On 31/05/2022 10:08, Andreas Schwab via Libc-alpha wrote:
>>> The compiler may substitute calls to sin or cos with calls to sincos, thus
>>> we should have the same optimized implementations for sincos.  The
>>> optimized implementations may produce results that differ, that also makes
>>> sure that the sincos call aggrees with the sin and cos calls.
>>
>> LGTM, thanks.
>>
>> I think we will need to fix for i686 as well, since it provides a
>> similar sse2 optimized version.
> 
> Since this is an optimization, should we leave i686 alone?

My understanding is if an architecture potentially optimizes sin or cos
and not sincos where it potentially have different results of default
implementation, gcc optimization might not hold true.

i686 has a sse2 version which is slight different than the default one
that uses the generic version.  The __cosf_ia32 does shows different
precision than __cosf_sse2:

$ cat math/test-float-cos.out
[...]
Test suite completed:
  300 test cases plus 296 tests for exception flags and
    296 tests for errno executed.
  39 errors occurred.

Also, interesting enough the generic version with -msse2 -mfpmath=sse
is slight better than the sse version we have (I had to adjust the
cosf-input to add a 'name' entry so it can show a better latency and
throughput):

# generic + -msse2 -mfpmath=sse
$ ./testrun.sh benchtests/bench-cosf
  "cosf": {
   "workload-spec2017.wrf": {
    "duration": 3.87649e+09,
    "iterations": 1.00968e+08,
    "reciprocal-throughput": 18.3391,
    "latency": 58.4474,
    "max-throughput": 5.45283e+07,
    "min-throughput": 1.71094e+07
   }
  }
  
# current assembly
$ ./testrun.sh benchtests/bench-cosf
  "cosf": {
   "workload-spec2017.wrf": {
    "duration": 3.88056e+09,
    "iterations": 1.00968e+08,
    "reciprocal-throughput": 14.8188,
    "latency": 62.0484,
    "max-throughput": 6.7482e+07,
    "min-throughput": 1.61164e+07
   }
  }

So I think we would be better to replace both i686 s_sinf-sse2.S and 
s_cosf-sse2.S with a C wrapper to use generic implementation plus 
-msse2 -mfpmath=sse, and also ass a s_sincosf optimization so gcc 
transformation does not cause any precision issue.

> 
>> Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
>>
>>> ---
>>>  sysdeps/ieee754/dbl-64/s_sincos.c            |  7 +++++
>>>  sysdeps/x86_64/fpu/multiarch/Makefile        | 12 ++++++--
>>>  sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c  |  3 ++
>>>  sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c  |  3 ++
>>>  sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c |  3 ++
>>>  sysdeps/x86_64/fpu/multiarch/s_sincos.c      | 30 ++++++++++++++++++++
>>>  6 files changed, 55 insertions(+), 3 deletions(-)
>>>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
>>>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
>>>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
>>>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos.c
>>>
>>> diff --git a/sysdeps/ieee754/dbl-64/s_sincos.c b/sysdeps/ieee754/dbl-64/s_sincos.c
>>> index 12c3021e66..890c137ebb 100644
>>> --- a/sysdeps/ieee754/dbl-64/s_sincos.c
>>> +++ b/sysdeps/ieee754/dbl-64/s_sincos.c
>>> @@ -24,10 +24,15 @@
>>>  #include <math-underflow.h>
>>>  #include <libm-alias-double.h>
>>>
>>> +#ifndef SECTION
>>> +# define SECTION
>>> +#endif
>>> +
>>>  #define IN_SINCOS
>>>  #include "s_sin.c"
>>>
>>>  void
>>> +SECTION
>>>  __sincos (double x, double *sinx, double *cosx)
>>>  {
>>>    mynumber u;
>>> @@ -100,4 +105,6 @@ __sincos (double x, double *sinx, double *cosx)
>>>
>>>    *sinx = *cosx = x / x;
>>>  }
>>> +#ifndef __sincos
>>>  libm_alias_double (__sincos, sincos)
>>> +#endif
>>> diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
>>> index ec796277a5..248162525b 100644
>>> --- a/sysdeps/x86_64/fpu/multiarch/Makefile
>>> +++ b/sysdeps/x86_64/fpu/multiarch/Makefile
>>> @@ -10,7 +10,8 @@ libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
>>>                       s_trunc-sse4_1 s_truncf-sse4_1
>>>
>>>  libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
>>> -                     e_asin-fma e_atan2-fma s_sin-fma s_tan-fma
>>> +                     e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \
>>> +                     s_sincos-fma
>>>
>>>  CFLAGS-e_asin-fma.c = -mfma -mavx2
>>>  CFLAGS-e_atan2-fma.c = -mfma -mavx2
>>> @@ -20,6 +21,7 @@ CFLAGS-e_pow-fma.c = -mfma -mavx2
>>>  CFLAGS-s_atan-fma.c = -mfma -mavx2
>>>  CFLAGS-s_sin-fma.c = -mfma -mavx2
>>>  CFLAGS-s_tan-fma.c = -mfma -mavx2
>>> +CFLAGS-s_sincos-fma.c = -mfma -mavx2
>>>
>>>  libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
>>>
>>> @@ -36,7 +38,8 @@ CFLAGS-s_cosf-fma.c = -mfma -mavx2
>>>  CFLAGS-s_sincosf-fma.c = -mfma -mavx2
>>>
>>>  libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
>>> -                     e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4
>>> +                     e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
>>> +                     s_sincos-fma4
>>>
>>>  CFLAGS-e_asin-fma4.c = -mfma4
>>>  CFLAGS-e_atan2-fma4.c = -mfma4
>>> @@ -46,9 +49,11 @@ CFLAGS-e_pow-fma4.c = -mfma4
>>>  CFLAGS-s_atan-fma4.c = -mfma4
>>>  CFLAGS-s_sin-fma4.c = -mfma4
>>>  CFLAGS-s_tan-fma4.c = -mfma4
>>> +CFLAGS-s_sincos-fma4.c = -mfma4
>>>
>>>  libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
>>> -                     e_atan2-avx s_sin-avx s_tan-avx
>>> +                     e_atan2-avx s_sin-avx s_tan-avx \
>>> +                     s_sincos-avx
>>>
>>>  CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX
>>>  CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX
>>> @@ -56,6 +61,7 @@ CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX
>>>  CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX
>>>  CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX
>>>  CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX
>>> +CFLAGS-s_sincos-avx.c = -msse2avx -DSSE2AVX
>>>  endif
>>>
>>>  ifeq ($(subdir),mathvec)
>>> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
>>> new file mode 100644
>>> index 0000000000..debea0f619
>>> --- /dev/null
>>> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
>>> @@ -0,0 +1,3 @@
>>> +#define __sincos __sincos_avx
>>> +#define SECTION __attribute__ ((section (".text.avx")))
>>> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
>>> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
>>> new file mode 100644
>>> index 0000000000..31610b3356
>>> --- /dev/null
>>> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
>>> @@ -0,0 +1,3 @@
>>> +#define __sincos __sincos_fma
>>> +#define SECTION __attribute__ ((section (".text.fma")))
>>> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
>>> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
>>> new file mode 100644
>>> index 0000000000..7e8b8e6484
>>> --- /dev/null
>>> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
>>> @@ -0,0 +1,3 @@
>>> +#define __sincos __sincos_fma4
>>> +#define SECTION __attribute__ ((section (".text.fma4")))
>>> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
>>> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos.c b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
>>> new file mode 100644
>>> index 0000000000..67c2817d68
>>> --- /dev/null
>>> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
>>> @@ -0,0 +1,30 @@
>>> +/* Multiple versions of sincos.
>>> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
>>
>> I think it should be only 2022 here.
>>
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later version.
>>> +
>>> +   The GNU C Library is distributed in the hope that it will be useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library; if not, see
>>> +   <https://www.gnu.org/licenses/>.  */
>>> +
>>> +#include <libm-alias-double.h>
>>> +
>>> +extern void __redirect_sincos (double, double *, double *);
>>> +
>>> +#define SYMBOL_NAME sincos
>>> +#include "ifunc-fma4.h"
>>> +
>>> +libc_ifunc_redirected (__redirect_sincos, __sincos, IFUNC_SELECTOR ());
>>> +libm_alias_double (__sincos, sincos)
>>> +
>>> +#define __sincos __sincos_sse2
>>> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
> 
> 
>
  
H.J. Lu May 31, 2022, 7:16 p.m. UTC | #4
On Tue, May 31, 2022 at 12:08 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 31/05/2022 15:42, H.J. Lu wrote:
> > On Tue, May 31, 2022 at 9:32 AM Adhemerval Zanella via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> >>
> >>
> >>
> >> On 31/05/2022 10:08, Andreas Schwab via Libc-alpha wrote:
> >>> The compiler may substitute calls to sin or cos with calls to sincos, thus
> >>> we should have the same optimized implementations for sincos.  The
> >>> optimized implementations may produce results that differ, that also makes
> >>> sure that the sincos call aggrees with the sin and cos calls.
> >>
> >> LGTM, thanks.
> >>
> >> I think we will need to fix for i686 as well, since it provides a
> >> similar sse2 optimized version.
> >
> > Since this is an optimization, should we leave i686 alone?
>
> My understanding is if an architecture potentially optimizes sin or cos
> and not sincos where it potentially have different results of default
> implementation, gcc optimization might not hold true.
>
> i686 has a sse2 version which is slight different than the default one
> that uses the generic version.  The __cosf_ia32 does shows different
> precision than __cosf_sse2:
>
> $ cat math/test-float-cos.out
> [...]
> Test suite completed:
>   300 test cases plus 296 tests for exception flags and
>     296 tests for errno executed.
>   39 errors occurred.

So, SSE2 version has higher accuracy.  If it is the case, the benefit
of generic implementation may be limited.

> Also, interesting enough the generic version with -msse2 -mfpmath=sse
> is slight better than the sse version we have (I had to adjust the
> cosf-input to add a 'name' entry so it can show a better latency and
> throughput):
>
> # generic + -msse2 -mfpmath=sse
> $ ./testrun.sh benchtests/bench-cosf
>   "cosf": {
>    "workload-spec2017.wrf": {
>     "duration": 3.87649e+09,
>     "iterations": 1.00968e+08,
>     "reciprocal-throughput": 18.3391,
>     "latency": 58.4474,
>     "max-throughput": 5.45283e+07,
>     "min-throughput": 1.71094e+07
>    }
>   }
>
> # current assembly
> $ ./testrun.sh benchtests/bench-cosf
>   "cosf": {
>    "workload-spec2017.wrf": {
>     "duration": 3.88056e+09,
>     "iterations": 1.00968e+08,
>     "reciprocal-throughput": 14.8188,
>     "latency": 62.0484,
>     "max-throughput": 6.7482e+07,
>     "min-throughput": 1.61164e+07
>    }
>   }
>
> So I think we would be better to replace both i686 s_sinf-sse2.S and
> s_cosf-sse2.S with a C wrapper to use generic implementation plus
> -msse2 -mfpmath=sse, and also ass a s_sincosf optimization so gcc
> transformation does not cause any precision issue.
>
> >
> >> Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
> >>
> >>> ---
> >>>  sysdeps/ieee754/dbl-64/s_sincos.c            |  7 +++++
> >>>  sysdeps/x86_64/fpu/multiarch/Makefile        | 12 ++++++--
> >>>  sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c  |  3 ++
> >>>  sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c  |  3 ++
> >>>  sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c |  3 ++
> >>>  sysdeps/x86_64/fpu/multiarch/s_sincos.c      | 30 ++++++++++++++++++++
> >>>  6 files changed, 55 insertions(+), 3 deletions(-)
> >>>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
> >>>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
> >>>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
> >>>  create mode 100644 sysdeps/x86_64/fpu/multiarch/s_sincos.c
> >>>
> >>> diff --git a/sysdeps/ieee754/dbl-64/s_sincos.c b/sysdeps/ieee754/dbl-64/s_sincos.c
> >>> index 12c3021e66..890c137ebb 100644
> >>> --- a/sysdeps/ieee754/dbl-64/s_sincos.c
> >>> +++ b/sysdeps/ieee754/dbl-64/s_sincos.c
> >>> @@ -24,10 +24,15 @@
> >>>  #include <math-underflow.h>
> >>>  #include <libm-alias-double.h>
> >>>
> >>> +#ifndef SECTION
> >>> +# define SECTION
> >>> +#endif
> >>> +
> >>>  #define IN_SINCOS
> >>>  #include "s_sin.c"
> >>>
> >>>  void
> >>> +SECTION
> >>>  __sincos (double x, double *sinx, double *cosx)
> >>>  {
> >>>    mynumber u;
> >>> @@ -100,4 +105,6 @@ __sincos (double x, double *sinx, double *cosx)
> >>>
> >>>    *sinx = *cosx = x / x;
> >>>  }
> >>> +#ifndef __sincos
> >>>  libm_alias_double (__sincos, sincos)
> >>> +#endif
> >>> diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
> >>> index ec796277a5..248162525b 100644
> >>> --- a/sysdeps/x86_64/fpu/multiarch/Makefile
> >>> +++ b/sysdeps/x86_64/fpu/multiarch/Makefile
> >>> @@ -10,7 +10,8 @@ libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
> >>>                       s_trunc-sse4_1 s_truncf-sse4_1
> >>>
> >>>  libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
> >>> -                     e_asin-fma e_atan2-fma s_sin-fma s_tan-fma
> >>> +                     e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \
> >>> +                     s_sincos-fma
> >>>
> >>>  CFLAGS-e_asin-fma.c = -mfma -mavx2
> >>>  CFLAGS-e_atan2-fma.c = -mfma -mavx2
> >>> @@ -20,6 +21,7 @@ CFLAGS-e_pow-fma.c = -mfma -mavx2
> >>>  CFLAGS-s_atan-fma.c = -mfma -mavx2
> >>>  CFLAGS-s_sin-fma.c = -mfma -mavx2
> >>>  CFLAGS-s_tan-fma.c = -mfma -mavx2
> >>> +CFLAGS-s_sincos-fma.c = -mfma -mavx2
> >>>
> >>>  libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
> >>>
> >>> @@ -36,7 +38,8 @@ CFLAGS-s_cosf-fma.c = -mfma -mavx2
> >>>  CFLAGS-s_sincosf-fma.c = -mfma -mavx2
> >>>
> >>>  libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
> >>> -                     e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4
> >>> +                     e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
> >>> +                     s_sincos-fma4
> >>>
> >>>  CFLAGS-e_asin-fma4.c = -mfma4
> >>>  CFLAGS-e_atan2-fma4.c = -mfma4
> >>> @@ -46,9 +49,11 @@ CFLAGS-e_pow-fma4.c = -mfma4
> >>>  CFLAGS-s_atan-fma4.c = -mfma4
> >>>  CFLAGS-s_sin-fma4.c = -mfma4
> >>>  CFLAGS-s_tan-fma4.c = -mfma4
> >>> +CFLAGS-s_sincos-fma4.c = -mfma4
> >>>
> >>>  libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
> >>> -                     e_atan2-avx s_sin-avx s_tan-avx
> >>> +                     e_atan2-avx s_sin-avx s_tan-avx \
> >>> +                     s_sincos-avx
> >>>
> >>>  CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX
> >>>  CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX
> >>> @@ -56,6 +61,7 @@ CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX
> >>>  CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX
> >>>  CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX
> >>>  CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX
> >>> +CFLAGS-s_sincos-avx.c = -msse2avx -DSSE2AVX
> >>>  endif
> >>>
> >>>  ifeq ($(subdir),mathvec)
> >>> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
> >>> new file mode 100644
> >>> index 0000000000..debea0f619
> >>> --- /dev/null
> >>> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
> >>> @@ -0,0 +1,3 @@
> >>> +#define __sincos __sincos_avx
> >>> +#define SECTION __attribute__ ((section (".text.avx")))
> >>> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
> >>> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
> >>> new file mode 100644
> >>> index 0000000000..31610b3356
> >>> --- /dev/null
> >>> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
> >>> @@ -0,0 +1,3 @@
> >>> +#define __sincos __sincos_fma
> >>> +#define SECTION __attribute__ ((section (".text.fma")))
> >>> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
> >>> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
> >>> new file mode 100644
> >>> index 0000000000..7e8b8e6484
> >>> --- /dev/null
> >>> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
> >>> @@ -0,0 +1,3 @@
> >>> +#define __sincos __sincos_fma4
> >>> +#define SECTION __attribute__ ((section (".text.fma4")))
> >>> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
> >>> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos.c b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
> >>> new file mode 100644
> >>> index 0000000000..67c2817d68
> >>> --- /dev/null
> >>> +++ b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
> >>> @@ -0,0 +1,30 @@
> >>> +/* Multiple versions of sincos.
> >>> +   Copyright (C) 2017-2022 Free Software Foundation, Inc.
> >>
> >> I think it should be only 2022 here.
> >>
> >>> +   This file is part of the GNU C Library.
> >>> +
> >>> +   The GNU C Library is free software; you can redistribute it and/or
> >>> +   modify it under the terms of the GNU Lesser General Public
> >>> +   License as published by the Free Software Foundation; either
> >>> +   version 2.1 of the License, or (at your option) any later version.
> >>> +
> >>> +   The GNU C Library is distributed in the hope that it will be useful,
> >>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >>> +   Lesser General Public License for more details.
> >>> +
> >>> +   You should have received a copy of the GNU Lesser General Public
> >>> +   License along with the GNU C Library; if not, see
> >>> +   <https://www.gnu.org/licenses/>.  */
> >>> +
> >>> +#include <libm-alias-double.h>
> >>> +
> >>> +extern void __redirect_sincos (double, double *, double *);
> >>> +
> >>> +#define SYMBOL_NAME sincos
> >>> +#include "ifunc-fma4.h"
> >>> +
> >>> +libc_ifunc_redirected (__redirect_sincos, __sincos, IFUNC_SELECTOR ());
> >>> +libm_alias_double (__sincos, sincos)
> >>> +
> >>> +#define __sincos __sincos_sse2
> >>> +#include <sysdeps/ieee754/dbl-64/s_sincos.c>
> >
> >
> >
  
Adhemerval Zanella May 31, 2022, 7:20 p.m. UTC | #5
On 31/05/2022 16:16, H.J. Lu wrote:
> On Tue, May 31, 2022 at 12:08 PM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 31/05/2022 15:42, H.J. Lu wrote:
>>> On Tue, May 31, 2022 at 9:32 AM Adhemerval Zanella via Libc-alpha
>>> <libc-alpha@sourceware.org> wrote:
>>>>
>>>>
>>>>
>>>> On 31/05/2022 10:08, Andreas Schwab via Libc-alpha wrote:
>>>>> The compiler may substitute calls to sin or cos with calls to sincos, thus
>>>>> we should have the same optimized implementations for sincos.  The
>>>>> optimized implementations may produce results that differ, that also makes
>>>>> sure that the sincos call aggrees with the sin and cos calls.
>>>>
>>>> LGTM, thanks.
>>>>
>>>> I think we will need to fix for i686 as well, since it provides a
>>>> similar sse2 optimized version.
>>>
>>> Since this is an optimization, should we leave i686 alone?
>>
>> My understanding is if an architecture potentially optimizes sin or cos
>> and not sincos where it potentially have different results of default
>> implementation, gcc optimization might not hold true.
>>
>> i686 has a sse2 version which is slight different than the default one
>> that uses the generic version.  The __cosf_ia32 does shows different
>> precision than __cosf_sse2:
>>
>> $ cat math/test-float-cos.out
>> [...]
>> Test suite completed:
>>   300 test cases plus 296 tests for exception flags and
>>     296 tests for errno executed.
>>   39 errors occurred.
> 
> So, SSE2 version has higher accuracy.  If it is the case, the benefit
> of generic implementation may be limited.

Yes, but it was an acceptable change when the generic implementation was
replaced on 2.28.  I think moving from assembly routines, specially when
resulting performance is similar, is desirable.
  
H.J. Lu May 31, 2022, 7:28 p.m. UTC | #6
On Tue, May 31, 2022 at 12:20 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 31/05/2022 16:16, H.J. Lu wrote:
> > On Tue, May 31, 2022 at 12:08 PM Adhemerval Zanella
> > <adhemerval.zanella@linaro.org> wrote:
> >>
> >>
> >>
> >> On 31/05/2022 15:42, H.J. Lu wrote:
> >>> On Tue, May 31, 2022 at 9:32 AM Adhemerval Zanella via Libc-alpha
> >>> <libc-alpha@sourceware.org> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 31/05/2022 10:08, Andreas Schwab via Libc-alpha wrote:
> >>>>> The compiler may substitute calls to sin or cos with calls to sincos, thus
> >>>>> we should have the same optimized implementations for sincos.  The
> >>>>> optimized implementations may produce results that differ, that also makes
> >>>>> sure that the sincos call aggrees with the sin and cos calls.
> >>>>
> >>>> LGTM, thanks.
> >>>>
> >>>> I think we will need to fix for i686 as well, since it provides a
> >>>> similar sse2 optimized version.
> >>>
> >>> Since this is an optimization, should we leave i686 alone?
> >>
> >> My understanding is if an architecture potentially optimizes sin or cos
> >> and not sincos where it potentially have different results of default
> >> implementation, gcc optimization might not hold true.
> >>
> >> i686 has a sse2 version which is slight different than the default one
> >> that uses the generic version.  The __cosf_ia32 does shows different
> >> precision than __cosf_sse2:
> >>
> >> $ cat math/test-float-cos.out
> >> [...]
> >> Test suite completed:
> >>   300 test cases plus 296 tests for exception flags and
> >>     296 tests for errno executed.
> >>   39 errors occurred.
> >
> > So, SSE2 version has higher accuracy.  If it is the case, the benefit
> > of generic implementation may be limited.
>
> Yes, but it was an acceptable change when the generic implementation was
> replaced on 2.28.  I think moving from assembly routines, specially when
> resulting performance is similar, is desirable.

I am not against removing SSE2 implementation.  It is just that user
benefits are limited.
  
Adhemerval Zanella May 31, 2022, 7:45 p.m. UTC | #7
On 31/05/2022 16:28, H.J. Lu wrote:
> On Tue, May 31, 2022 at 12:20 PM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 31/05/2022 16:16, H.J. Lu wrote:
>>> On Tue, May 31, 2022 at 12:08 PM Adhemerval Zanella
>>> <adhemerval.zanella@linaro.org> wrote:
>>>>
>>>>
>>>>
>>>> On 31/05/2022 15:42, H.J. Lu wrote:
>>>>> On Tue, May 31, 2022 at 9:32 AM Adhemerval Zanella via Libc-alpha
>>>>> <libc-alpha@sourceware.org> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 31/05/2022 10:08, Andreas Schwab via Libc-alpha wrote:
>>>>>>> The compiler may substitute calls to sin or cos with calls to sincos, thus
>>>>>>> we should have the same optimized implementations for sincos.  The
>>>>>>> optimized implementations may produce results that differ, that also makes
>>>>>>> sure that the sincos call aggrees with the sin and cos calls.
>>>>>>
>>>>>> LGTM, thanks.
>>>>>>
>>>>>> I think we will need to fix for i686 as well, since it provides a
>>>>>> similar sse2 optimized version.
>>>>>
>>>>> Since this is an optimization, should we leave i686 alone?
>>>>
>>>> My understanding is if an architecture potentially optimizes sin or cos
>>>> and not sincos where it potentially have different results of default
>>>> implementation, gcc optimization might not hold true.
>>>>
>>>> i686 has a sse2 version which is slight different than the default one
>>>> that uses the generic version.  The __cosf_ia32 does shows different
>>>> precision than __cosf_sse2:
>>>>
>>>> $ cat math/test-float-cos.out
>>>> [...]
>>>> Test suite completed:
>>>>   300 test cases plus 296 tests for exception flags and
>>>>     296 tests for errno executed.
>>>>   39 errors occurred.
>>>
>>> So, SSE2 version has higher accuracy.  If it is the case, the benefit
>>> of generic implementation may be limited.
>>
>> Yes, but it was an acceptable change when the generic implementation was
>> replaced on 2.28.  I think moving from assembly routines, specially when
>> resulting performance is similar, is desirable.
> 
> I am not against removing SSE2 implementation.  It is just that user
> benefits are limited.
> 

At least with recent GCC, generic shows slight lower latency (58.04 vs 62.04)
and better throughput (18.33 vs 14.81).  I tested with gcc 11.2.1 on a 
Ryzen 9 5900X, but I think it would show similar results on recent Intel
chips.
  
Adhemerval Zanella May 31, 2022, 7:45 p.m. UTC | #8
On 31/05/2022 16:28, H.J. Lu wrote:
> On Tue, May 31, 2022 at 12:20 PM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 31/05/2022 16:16, H.J. Lu wrote:
>>> On Tue, May 31, 2022 at 12:08 PM Adhemerval Zanella
>>> <adhemerval.zanella@linaro.org> wrote:
>>>>
>>>>
>>>>
>>>> On 31/05/2022 15:42, H.J. Lu wrote:
>>>>> On Tue, May 31, 2022 at 9:32 AM Adhemerval Zanella via Libc-alpha
>>>>> <libc-alpha@sourceware.org> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 31/05/2022 10:08, Andreas Schwab via Libc-alpha wrote:
>>>>>>> The compiler may substitute calls to sin or cos with calls to sincos, thus
>>>>>>> we should have the same optimized implementations for sincos.  The
>>>>>>> optimized implementations may produce results that differ, that also makes
>>>>>>> sure that the sincos call aggrees with the sin and cos calls.
>>>>>>
>>>>>> LGTM, thanks.
>>>>>>
>>>>>> I think we will need to fix for i686 as well, since it provides a
>>>>>> similar sse2 optimized version.
>>>>>
>>>>> Since this is an optimization, should we leave i686 alone?
>>>>
>>>> My understanding is if an architecture potentially optimizes sin or cos
>>>> and not sincos where it potentially have different results of default
>>>> implementation, gcc optimization might not hold true.
>>>>
>>>> i686 has a sse2 version which is slight different than the default one
>>>> that uses the generic version.  The __cosf_ia32 does shows different
>>>> precision than __cosf_sse2:
>>>>
>>>> $ cat math/test-float-cos.out
>>>> [...]
>>>> Test suite completed:
>>>>   300 test cases plus 296 tests for exception flags and
>>>>     296 tests for errno executed.
>>>>   39 errors occurred.
>>>
>>> So, SSE2 version has higher accuracy.  If it is the case, the benefit
>>> of generic implementation may be limited.
>>
>> Yes, but it was an acceptable change when the generic implementation was
>> replaced on 2.28.  I think moving from assembly routines, specially when
>> resulting performance is similar, is desirable.
> 
> I am not against removing SSE2 implementation.  It is just that user
> benefits are limited.
> 

At least with recent GCC, generic shows slight lower latency (58.04 vs 62.04)
and better throughput (18.33 vs 14.81).  I tested with gcc 11.2.1 on a 
Ryzen 9 5900X, but I think it would show similar results on recent Intel
chips.
  
Adhemerval Zanella May 31, 2022, 9:15 p.m. UTC | #9
On 31/05/2022 13:27, Adhemerval Zanella wrote:
> 
> 
> On 31/05/2022 10:08, Andreas Schwab via Libc-alpha wrote:
>> The compiler may substitute calls to sin or cos with calls to sincos, thus
>> we should have the same optimized implementations for sincos.  The
>> optimized implementations may produce results that differ, that also makes
>> sure that the sincos call aggrees with the sin and cos calls.
> 
> LGTM, thanks.
> 
> I think we will need to fix for i686 as well, since it provides a
> similar sse2 optimized version.

i686 seems to use the same algorithm for cosf, sinf, and sincosf.
  

Patch

diff --git a/sysdeps/ieee754/dbl-64/s_sincos.c b/sysdeps/ieee754/dbl-64/s_sincos.c
index 12c3021e66..890c137ebb 100644
--- a/sysdeps/ieee754/dbl-64/s_sincos.c
+++ b/sysdeps/ieee754/dbl-64/s_sincos.c
@@ -24,10 +24,15 @@ 
 #include <math-underflow.h>
 #include <libm-alias-double.h>
 
+#ifndef SECTION
+# define SECTION
+#endif
+
 #define IN_SINCOS
 #include "s_sin.c"
 
 void
+SECTION
 __sincos (double x, double *sinx, double *cosx)
 {
   mynumber u;
@@ -100,4 +105,6 @@  __sincos (double x, double *sinx, double *cosx)
 
   *sinx = *cosx = x / x;
 }
+#ifndef __sincos
 libm_alias_double (__sincos, sincos)
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index ec796277a5..248162525b 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -10,7 +10,8 @@  libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
 			s_trunc-sse4_1 s_truncf-sse4_1
 
 libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
-			e_asin-fma e_atan2-fma s_sin-fma s_tan-fma
+			e_asin-fma e_atan2-fma s_sin-fma s_tan-fma \
+			s_sincos-fma
 
 CFLAGS-e_asin-fma.c = -mfma -mavx2
 CFLAGS-e_atan2-fma.c = -mfma -mavx2
@@ -20,6 +21,7 @@  CFLAGS-e_pow-fma.c = -mfma -mavx2
 CFLAGS-s_atan-fma.c = -mfma -mavx2
 CFLAGS-s_sin-fma.c = -mfma -mavx2
 CFLAGS-s_tan-fma.c = -mfma -mavx2
+CFLAGS-s_sincos-fma.c = -mfma -mavx2
 
 libm-sysdep_routines += s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
 
@@ -36,7 +38,8 @@  CFLAGS-s_cosf-fma.c = -mfma -mavx2
 CFLAGS-s_sincosf-fma.c = -mfma -mavx2
 
 libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
-			e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4
+			e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
+			s_sincos-fma4
 
 CFLAGS-e_asin-fma4.c = -mfma4
 CFLAGS-e_atan2-fma4.c = -mfma4
@@ -46,9 +49,11 @@  CFLAGS-e_pow-fma4.c = -mfma4
 CFLAGS-s_atan-fma4.c = -mfma4
 CFLAGS-s_sin-fma4.c = -mfma4
 CFLAGS-s_tan-fma4.c = -mfma4
+CFLAGS-s_sincos-fma4.c = -mfma4
 
 libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
-			e_atan2-avx s_sin-avx s_tan-avx
+			e_atan2-avx s_sin-avx s_tan-avx \
+			s_sincos-avx
 
 CFLAGS-e_atan2-avx.c = -msse2avx -DSSE2AVX
 CFLAGS-e_exp-avx.c = -msse2avx -DSSE2AVX
@@ -56,6 +61,7 @@  CFLAGS-e_log-avx.c = -msse2avx -DSSE2AVX
 CFLAGS-s_atan-avx.c = -msse2avx -DSSE2AVX
 CFLAGS-s_sin-avx.c = -msse2avx -DSSE2AVX
 CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX
+CFLAGS-s_sincos-avx.c = -msse2avx -DSSE2AVX
 endif
 
 ifeq ($(subdir),mathvec)
diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
new file mode 100644
index 0000000000..debea0f619
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-avx.c
@@ -0,0 +1,3 @@ 
+#define __sincos __sincos_avx
+#define SECTION __attribute__ ((section (".text.avx")))
+#include <sysdeps/ieee754/dbl-64/s_sincos.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
new file mode 100644
index 0000000000..31610b3356
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma.c
@@ -0,0 +1,3 @@ 
+#define __sincos __sincos_fma
+#define SECTION __attribute__ ((section (".text.fma")))
+#include <sysdeps/ieee754/dbl-64/s_sincos.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
new file mode 100644
index 0000000000..7e8b8e6484
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_sincos-fma4.c
@@ -0,0 +1,3 @@ 
+#define __sincos __sincos_fma4
+#define SECTION __attribute__ ((section (".text.fma4")))
+#include <sysdeps/ieee754/dbl-64/s_sincos.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_sincos.c b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
new file mode 100644
index 0000000000..67c2817d68
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_sincos.c
@@ -0,0 +1,30 @@ 
+/* Multiple versions of sincos.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <libm-alias-double.h>
+
+extern void __redirect_sincos (double, double *, double *);
+
+#define SYMBOL_NAME sincos
+#include "ifunc-fma4.h"
+
+libc_ifunc_redirected (__redirect_sincos, __sincos, IFUNC_SELECTOR ());
+libm_alias_double (__sincos, sincos)
+
+#define __sincos __sincos_sse2
+#include <sysdeps/ieee754/dbl-64/s_sincos.c>