x86-64: Optimize load of all bits set into ZMM register [BZ #28252]

Message ID 20210821163631.138482-1-hjl.tools@gmail.com
State Committed
Headers
Series x86-64: Optimize load of all bits set into ZMM register [BZ #28252] |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

H.J. Lu Aug. 21, 2021, 4:36 p.m. UTC
  Optimize loads of all bits set into ZMM register in AVX512 SVML codes
by replacing

	vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX

and

	vmovups   .L_2il0floatpacket.13(%rip), %zmmX

with
	vpternlogd $0xff, %zmmX, %zmmX, %zmmX

This fixes BZ #28252.
---
 .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S   |  7 +------
 .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S   |  7 +------
 .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S   |  7 +------
 .../fpu/multiarch/svml_d_sincos8_core_avx512.S       |  7 +------
 .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S |  7 +------
 .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S |  7 +------
 .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S |  7 +------
 .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
 .../fpu/multiarch/svml_s_sincosf16_core_avx512.S     |  7 +------
 .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S |  7 +------
 10 files changed, 11 insertions(+), 64 deletions(-)
  

Comments

Noah Goldstein Aug. 21, 2021, 5:49 p.m. UTC | #1
On Sat, Aug 21, 2021 at 12:36 PM H.J. Lu via Libc-alpha <
libc-alpha@sourceware.org> wrote:

> Optimize loads of all bits set into ZMM register in AVX512 SVML codes
> by replacing
>
>         vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
>
> and
>
>         vmovups   .L_2il0floatpacket.13(%rip), %zmmX
>
> with
>         vpternlogd $0xff, %zmmX, %zmmX, %zmmX
>
> This fixes BZ #28252.
> ---
>  .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S   |  7 +------
>  .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S   |  7 +------
>  .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S   |  7 +------
>  .../fpu/multiarch/svml_d_sincos8_core_avx512.S       |  7 +------
>  .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S |  7 +------
>  .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S |  7 +------
>  .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S |  7 +------
>  .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
>  .../fpu/multiarch/svml_s_sincosf16_core_avx512.S     |  7 +------
>  .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S |  7 +------
>  10 files changed, 11 insertions(+), 64 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> index c2cf007904..0fcb912557 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx)
>          vmovaps   %zmm0, %zmm8
>
>  /* Check for large arguments path */
> -        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
> +        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
>
Looking at the code it seems like this is used later by

        vpandnq   %zmm1, %zmm1, %zmm2{%k1}

AFAICT you can make the vpternlogd down there and just use

vpternlogq $0xff, %zmm1, %zmm1, %zmm2{%k1}{z}

>
>  /*
>    ARGUMENT RANGE REDUCTION:
> @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx)
>          vmovsd    %xmm0, 1216(%rsp,%r15)
>          jmp       .LBL_2_7
>  END (_ZGVeN8v_cos_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.16:
> -       .long   0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.16,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> index e9a5d00992..5596c950ce 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx)
>
>  /* preserve mantissa, set input exponent to 2^(-10) */
>          vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
> -        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
> +        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
>

Earlier in the function there is a dependency breaking

kxnorw    %k3, %k3, %k3

so I think you can accomplish the same thing but breaking
some unlucky dep chain with:

vpmovm2d    %k3, %zmm2

         vpsrlq    $32, %zmm4, %zmm6
>
>  /* reciprocal approximation good to at least 11 bits */
> @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx)
>          vmovsd    %xmm0, 1216(%rsp,%r15)
>          jmp       .LBL_2_7
>  END (_ZGVeN8v_log_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.12:
> -       .long   0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.12,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> index 508da563fe..2981f1582e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx)
>          andq      $-64, %rsp
>          subq      $1280, %rsp
>          movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
> -        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
> +        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
>
This one also seems to just be used by an vpandn later on:

vpandnq   %zmm13, %zmm13, %zmm14{%k1}

so maybe:
vpternlogq $0xff, %zmm13, %zmm13, %zmm14{%k1}{z}
 instead of the vpandn.

         vmovups __dAbsMask(%rax), %zmm7
>          vmovups __dInvPI(%rax), %zmm2
>          vmovups __dRShifter(%rax), %zmm1
> @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx)
>          vmovsd    %xmm0, 1216(%rsp,%r15)
>          jmp       .LBL_2_7
>  END (_ZGVeN8v_sin_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.14:
> -       .long   0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.14,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> index 965415f2bd..4ad366373b 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx)
>
>  /* SinPoly = SinR*SinPoly */
>          vfmadd213pd %zmm5, %zmm5, %zmm4
> -        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
> +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
>

Also vpandn below:
vpandnq   %zmm7, %zmm7, %zmm3{%k1}


>  /* Update Cos result's sign */
>          vxorpd    %zmm2, %zmm1, %zmm1
> @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl)
>  ENTRY (_ZGVeN8vvv_sincos_skx)
>  WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
>  END (_ZGVeN8vvv_sincos_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.15:
> -       .long   0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.15,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> index cdcb16087d..b7d79efb54 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx)
>    X = X - Y*PI1 - Y*PI2 - Y*PI3
>   */
>          vmovaps   %zmm0, %zmm6
> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
> +        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
>

Also vpandn below:
vpandnd   %zmm1, %zmm1, %zmm12{%k1}


>          vmovups __sRShifter(%rax), %zmm3
>          vmovups __sPI1_FMA(%rax), %zmm5
>          vmovups __sA9_FMA(%rax), %zmm9
> @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx)
>          vmovss    %xmm0, 1216(%rsp,%r15,8)
>          jmp       .LBL_2_7
>  END (_ZGVeN16v_cosf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.13:
> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> index 1b09909344..9f03b9b780 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx)
>          vmovaps   %zmm0, %zmm7
>
>  /* compare against threshold */
> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
> +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
>

Also below:
vpandnd   %zmm2, %zmm2, %zmm3{%k1}

>          vmovups __sInvLn2(%rax), %zmm4
>          vmovups __sShifter(%rax), %zmm1
>          vmovups __sLn2hi(%rax), %zmm6
> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx)
>          jmp       .LBL_2_7
>
>  END (_ZGVeN16v_expf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.13:
> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> index 4a7b2adbbf..2ba38b0f33 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx)
>          andq      $-64, %rsp
>          subq      $1280, %rsp
>          movq      __svml_slog_data@GOTPCREL(%rip), %rax
> -        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
> +        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
>
Also below:
vpandnd   %zmm1, %zmm1, %zmm6{%k1}


>          vmovups _iBrkValue(%rax), %zmm4
>          vmovups _sPoly_7(%rax), %zmm8
>
> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx)
>          jmp       .LBL_2_7
>
>  END (_ZGVeN16v_logf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.7:
> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.7,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> index 7f906622a5..7f0272c809 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
>          vpsrlq    $32, %zmm3, %zmm2
>          vpmovqd   %zmm2, %ymm11
>          vcvtps2pd %ymm14, %zmm13
> -        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
earlier
kxnorw    %k3, %k3, %k3
can be used to get a dependency break with this:
vpmovm2d    %k3, %zmm14

>          vmovaps   %zmm14, %zmm26
>          vpandd _ABSMASK(%rax), %zmm1, %zmm8
>          vpcmpd    $1, _INF(%rax), %zmm8, %k2
> @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
>          vpmovqd   %zmm11, %ymm5
>          vpxord    %zmm10, %zmm10, %zmm10
>          vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
> -        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
> +        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
>          vpxord    %zmm11, %zmm11, %zmm11
>          vcvtdq2pd %ymm7, %zmm7
>          vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
> @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx)
>          vmovss    %xmm0, 1216(%rsp,%r15,8)
>          jmp       .LBL_2_7
>  END (_ZGVeN16vv_powf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.23:
> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.23,@object
> -.L_2il0floatpacket.24:
> -       .long   0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.24,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> index 54cee3a537..e1d0154441 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx)
>
>  /* Result sign calculations */
>          vpternlogd $150, %zmm0, %zmm14, %zmm1
> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
>  /* Add correction term 0.5 for cos() part */
>          vaddps    %zmm8, %zmm5, %zmm15
> @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl)
>  ENTRY (_ZGVeN16vvv_sincosf_skx)
>  WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
>  END (_ZGVeN16vvv_sincosf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.13:
> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> index ec65ffdce5..bcb76ff756 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx)
>          movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
>
>  /* Check for large and special values */
> -        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
Also below:
vpandnd   %zmm2, %zmm2, %zmm14{%k1}

>          vmovups __sAbsMask(%rax), %zmm5
>          vmovups __sInvPI(%rax), %zmm1
>          vmovups __sRShifter(%rax), %zmm2
> @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx)
>          vmovss    %xmm0, 1216(%rsp,%r15,8)
>          jmp       .LBL_2_7
>  END (_ZGVeN16v_sinf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.11:
> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.11,@object
> --
> 2.31.1
>
>
  
H.J. Lu Aug. 21, 2021, 6:08 p.m. UTC | #2
On Sat, Aug 21, 2021 at 10:49 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Sat, Aug 21, 2021 at 12:36 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
>>
>> Optimize loads of all bits set into ZMM register in AVX512 SVML codes
>> by replacing
>>
>>         vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
>>
>> and
>>
>>         vmovups   .L_2il0floatpacket.13(%rip), %zmmX
>>
>> with
>>         vpternlogd $0xff, %zmmX, %zmmX, %zmmX
>>
>> This fixes BZ #28252.
>> ---
>>  .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S   |  7 +------
>>  .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S   |  7 +------
>>  .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S   |  7 +------
>>  .../fpu/multiarch/svml_d_sincos8_core_avx512.S       |  7 +------
>>  .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S |  7 +------
>>  .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S |  7 +------
>>  .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S |  7 +------
>>  .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
>>  .../fpu/multiarch/svml_s_sincosf16_core_avx512.S     |  7 +------
>>  .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S |  7 +------
>>  10 files changed, 11 insertions(+), 64 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
>> index c2cf007904..0fcb912557 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
>> @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx)
>>          vmovaps   %zmm0, %zmm8
>>
>>  /* Check for large arguments path */
>> -        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
>> +        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
>
> Looking at the code it seems like this is used later by
>
>         vpandnq   %zmm1, %zmm1, %zmm2{%k1}
>
> AFAICT you can make the vpternlogd down there and just use
>
> vpternlogq $0xff, %zmm1, %zmm1, %zmm2{%k1}{z}
>>
>>
>>  /*
>>    ARGUMENT RANGE REDUCTION:
>> @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx)
>>          vmovsd    %xmm0, 1216(%rsp,%r15)
>>          jmp       .LBL_2_7
>>  END (_ZGVeN8v_cos_skx)
>> -
>> -       .section .rodata, "a"
>> -.L_2il0floatpacket.16:
>> -       .long   0xffffffff,0xffffffff
>> -       .type   .L_2il0floatpacket.16,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
>> index e9a5d00992..5596c950ce 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
>> @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx)
>>
>>  /* preserve mantissa, set input exponent to 2^(-10) */
>>          vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
>> -        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
>> +        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
>
>
> Earlier in the function there is a dependency breaking
>
> kxnorw    %k3, %k3, %k3
>
> so I think you can accomplish the same thing but breaking
> some unlucky dep chain with:
>
> vpmovm2d    %k3, %zmm2
>
>>          vpsrlq    $32, %zmm4, %zmm6
>>
>>  /* reciprocal approximation good to at least 11 bits */
>> @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx)
>>          vmovsd    %xmm0, 1216(%rsp,%r15)
>>          jmp       .LBL_2_7
>>  END (_ZGVeN8v_log_skx)
>> -
>> -       .section .rodata, "a"
>> -.L_2il0floatpacket.12:
>> -       .long   0xffffffff,0xffffffff
>> -       .type   .L_2il0floatpacket.12,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
>> index 508da563fe..2981f1582e 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
>> @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx)
>>          andq      $-64, %rsp
>>          subq      $1280, %rsp
>>          movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
>> -        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
>> +        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
>
> This one also seems to just be used by an vpandn later on:
>
> vpandnq   %zmm13, %zmm13, %zmm14{%k1}
>
> so maybe:
> vpternlogq $0xff, %zmm13, %zmm13, %zmm14{%k1}{z}
>  instead of the vpandn.
>
>>          vmovups __dAbsMask(%rax), %zmm7
>>          vmovups __dInvPI(%rax), %zmm2
>>          vmovups __dRShifter(%rax), %zmm1
>> @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx)
>>          vmovsd    %xmm0, 1216(%rsp,%r15)
>>          jmp       .LBL_2_7
>>  END (_ZGVeN8v_sin_skx)
>> -
>> -       .section .rodata, "a"
>> -.L_2il0floatpacket.14:
>> -       .long   0xffffffff,0xffffffff
>> -       .type   .L_2il0floatpacket.14,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
>> index 965415f2bd..4ad366373b 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
>> @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx)
>>
>>  /* SinPoly = SinR*SinPoly */
>>          vfmadd213pd %zmm5, %zmm5, %zmm4
>> -        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
>> +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
>
>
> Also vpandn below:
> vpandnq   %zmm7, %zmm7, %zmm3{%k1}
>
>>
>>  /* Update Cos result's sign */
>>          vxorpd    %zmm2, %zmm1, %zmm1
>> @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl)
>>  ENTRY (_ZGVeN8vvv_sincos_skx)
>>  WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
>>  END (_ZGVeN8vvv_sincos_skx)
>> -
>> -       .section .rodata, "a"
>> -.L_2il0floatpacket.15:
>> -       .long   0xffffffff,0xffffffff
>> -       .type   .L_2il0floatpacket.15,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
>> index cdcb16087d..b7d79efb54 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
>> @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx)
>>    X = X - Y*PI1 - Y*PI2 - Y*PI3
>>   */
>>          vmovaps   %zmm0, %zmm6
>> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
>> +        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
>
>
> Also vpandn below:
> vpandnd   %zmm1, %zmm1, %zmm12{%k1}
>
>>
>>          vmovups __sRShifter(%rax), %zmm3
>>          vmovups __sPI1_FMA(%rax), %zmm5
>>          vmovups __sA9_FMA(%rax), %zmm9
>> @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx)
>>          vmovss    %xmm0, 1216(%rsp,%r15,8)
>>          jmp       .LBL_2_7
>>  END (_ZGVeN16v_cosf_skx)
>> -
>> -       .section .rodata, "a"
>> -.L_2il0floatpacket.13:
>> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> -       .type   .L_2il0floatpacket.13,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
>> index 1b09909344..9f03b9b780 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
>> @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx)
>>          vmovaps   %zmm0, %zmm7
>>
>>  /* compare against threshold */
>> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
>> +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
>
>
> Also below:
> vpandnd   %zmm2, %zmm2, %zmm3{%k1}
>>
>>          vmovups __sInvLn2(%rax), %zmm4
>>          vmovups __sShifter(%rax), %zmm1
>>          vmovups __sLn2hi(%rax), %zmm6
>> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx)
>>          jmp       .LBL_2_7
>>
>>  END (_ZGVeN16v_expf_skx)
>> -
>> -       .section .rodata, "a"
>> -.L_2il0floatpacket.13:
>> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> -       .type   .L_2il0floatpacket.13,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
>> index 4a7b2adbbf..2ba38b0f33 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
>> @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx)
>>          andq      $-64, %rsp
>>          subq      $1280, %rsp
>>          movq      __svml_slog_data@GOTPCREL(%rip), %rax
>> -        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
>> +        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
>
> Also below:
> vpandnd   %zmm1, %zmm1, %zmm6{%k1}
>
>>
>>          vmovups _iBrkValue(%rax), %zmm4
>>          vmovups _sPoly_7(%rax), %zmm8
>>
>> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx)
>>          jmp       .LBL_2_7
>>
>>  END (_ZGVeN16v_logf_skx)
>> -
>> -       .section .rodata, "a"
>> -.L_2il0floatpacket.7:
>> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> -       .type   .L_2il0floatpacket.7,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
>> index 7f906622a5..7f0272c809 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
>> @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
>>          vpsrlq    $32, %zmm3, %zmm2
>>          vpmovqd   %zmm2, %ymm11
>>          vcvtps2pd %ymm14, %zmm13
>> -        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
>> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
> earlier
> kxnorw    %k3, %k3, %k3
> can be used to get a dependency break with this:
> vpmovm2d    %k3, %zmm14

The SVML codes can use some improvements.   Can you
open a separate glibc bug?  I'd like to address only the all-1s
load here to avoid more complexity.

Thanks.

>>          vmovaps   %zmm14, %zmm26
>>          vpandd _ABSMASK(%rax), %zmm1, %zmm8
>>          vpcmpd    $1, _INF(%rax), %zmm8, %k2
>> @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
>>          vpmovqd   %zmm11, %ymm5
>>          vpxord    %zmm10, %zmm10, %zmm10
>>          vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
>> -        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
>> +        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
>>          vpxord    %zmm11, %zmm11, %zmm11
>>          vcvtdq2pd %ymm7, %zmm7
>>          vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
>> @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx)
>>          vmovss    %xmm0, 1216(%rsp,%r15,8)
>>          jmp       .LBL_2_7
>>  END (_ZGVeN16vv_powf_skx)
>> -
>> -       .section .rodata, "a"
>> -.L_2il0floatpacket.23:
>> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> -       .type   .L_2il0floatpacket.23,@object
>> -.L_2il0floatpacket.24:
>> -       .long   0xffffffff,0xffffffff
>> -       .type   .L_2il0floatpacket.24,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
>> index 54cee3a537..e1d0154441 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
>> @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx)
>>
>>  /* Result sign calculations */
>>          vpternlogd $150, %zmm0, %zmm14, %zmm1
>> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
>> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>>
>>  /* Add correction term 0.5 for cos() part */
>>          vaddps    %zmm8, %zmm5, %zmm15
>> @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl)
>>  ENTRY (_ZGVeN16vvv_sincosf_skx)
>>  WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
>>  END (_ZGVeN16vvv_sincosf_skx)
>> -
>> -       .section .rodata, "a"
>> -.L_2il0floatpacket.13:
>> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> -       .type   .L_2il0floatpacket.13,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
>> index ec65ffdce5..bcb76ff756 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
>> @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx)
>>          movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
>>
>>  /* Check for large and special values */
>> -        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
>> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
> Also below:
> vpandnd   %zmm2, %zmm2, %zmm14{%k1}
>>
>>          vmovups __sAbsMask(%rax), %zmm5
>>          vmovups __sInvPI(%rax), %zmm1
>>          vmovups __sRShifter(%rax), %zmm2
>> @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx)
>>          vmovss    %xmm0, 1216(%rsp,%r15,8)
>>          jmp       .LBL_2_7
>>  END (_ZGVeN16v_sinf_skx)
>> -
>> -       .section .rodata, "a"
>> -.L_2il0floatpacket.11:
>> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> -       .type   .L_2il0floatpacket.11,@object
>> --
>> 2.31.1
>>
  
Noah Goldstein Aug. 21, 2021, 11:30 p.m. UTC | #3
On Sat, Aug 21, 2021 at 2:09 PM H.J. Lu <hjl.tools@gmail.com> wrote:

> On Sat, Aug 21, 2021 at 10:49 AM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> >
> >
> > On Sat, Aug 21, 2021 at 12:36 PM H.J. Lu via Libc-alpha <
> libc-alpha@sourceware.org> wrote:
> >>
> >> Optimize loads of all bits set into ZMM register in AVX512 SVML codes
> >> by replacing
> >>
> >>         vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
> >>
> >> and
> >>
> >>         vmovups   .L_2il0floatpacket.13(%rip), %zmmX
> >>
> >> with
> >>         vpternlogd $0xff, %zmmX, %zmmX, %zmmX
> >>
> >> This fixes BZ #28252.
> >> ---
> >>  .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S   |  7 +------
> >>  .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S   |  7 +------
> >>  .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S   |  7 +------
> >>  .../fpu/multiarch/svml_d_sincos8_core_avx512.S       |  7 +------
> >>  .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S |  7 +------
> >>  .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S |  7 +------
> >>  .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S |  7 +------
> >>  .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
> >>  .../fpu/multiarch/svml_s_sincosf16_core_avx512.S     |  7 +------
> >>  .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S |  7 +------
> >>  10 files changed, 11 insertions(+), 64 deletions(-)
> >>
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> >> index c2cf007904..0fcb912557 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> >> @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx)
> >>          vmovaps   %zmm0, %zmm8
> >>
> >>  /* Check for large arguments path */
> >> -        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
> >> +        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
> >
> > Looking at the code it seems like this is used later by
> >
> >         vpandnq   %zmm1, %zmm1, %zmm2{%k1}
> >
> > AFAICT you can make the vpternlogd down there and just use
> >
> > vpternlogq $0xff, %zmm1, %zmm1, %zmm2{%k1}{z}
> >>
> >>
> >>  /*
> >>    ARGUMENT RANGE REDUCTION:
> >> @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx)
> >>          vmovsd    %xmm0, 1216(%rsp,%r15)
> >>          jmp       .LBL_2_7
> >>  END (_ZGVeN8v_cos_skx)
> >> -
> >> -       .section .rodata, "a"
> >> -.L_2il0floatpacket.16:
> >> -       .long   0xffffffff,0xffffffff
> >> -       .type   .L_2il0floatpacket.16,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> >> index e9a5d00992..5596c950ce 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> >> @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx)
> >>
> >>  /* preserve mantissa, set input exponent to 2^(-10) */
> >>          vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
> >> -        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
> >> +        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
> >
> >
> > Earlier in the function there is a dependency breaking
> >
> > kxnorw    %k3, %k3, %k3
> >
> > so I think you can accomplish the same thing but breaking
> > some unlucky dep chain with:
> >
> > vpmovm2d    %k3, %zmm2
> >
> >>          vpsrlq    $32, %zmm4, %zmm6
> >>
> >>  /* reciprocal approximation good to at least 11 bits */
> >> @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx)
> >>          vmovsd    %xmm0, 1216(%rsp,%r15)
> >>          jmp       .LBL_2_7
> >>  END (_ZGVeN8v_log_skx)
> >> -
> >> -       .section .rodata, "a"
> >> -.L_2il0floatpacket.12:
> >> -       .long   0xffffffff,0xffffffff
> >> -       .type   .L_2il0floatpacket.12,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> >> index 508da563fe..2981f1582e 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> >> @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx)
> >>          andq      $-64, %rsp
> >>          subq      $1280, %rsp
> >>          movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
> >> -        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
> >> +        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
> >
> > This one also seems to just be used by an vpandn later on:
> >
> > vpandnq   %zmm13, %zmm13, %zmm14{%k1}
> >
> > so maybe:
> > vpternlogq $0xff, %zmm13, %zmm13, %zmm14{%k1}{z}
> >  instead of the vpandn.
> >
> >>          vmovups __dAbsMask(%rax), %zmm7
> >>          vmovups __dInvPI(%rax), %zmm2
> >>          vmovups __dRShifter(%rax), %zmm1
> >> @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx)
> >>          vmovsd    %xmm0, 1216(%rsp,%r15)
> >>          jmp       .LBL_2_7
> >>  END (_ZGVeN8v_sin_skx)
> >> -
> >> -       .section .rodata, "a"
> >> -.L_2il0floatpacket.14:
> >> -       .long   0xffffffff,0xffffffff
> >> -       .type   .L_2il0floatpacket.14,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> >> index 965415f2bd..4ad366373b 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> >> @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx)
> >>
> >>  /* SinPoly = SinR*SinPoly */
> >>          vfmadd213pd %zmm5, %zmm5, %zmm4
> >> -        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
> >> +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
> >
> >
> > Also vpandn below:
> > vpandnq   %zmm7, %zmm7, %zmm3{%k1}
> >
> >>
> >>  /* Update Cos result's sign */
> >>          vxorpd    %zmm2, %zmm1, %zmm1
> >> @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl)
> >>  ENTRY (_ZGVeN8vvv_sincos_skx)
> >>  WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
> >>  END (_ZGVeN8vvv_sincos_skx)
> >> -
> >> -       .section .rodata, "a"
> >> -.L_2il0floatpacket.15:
> >> -       .long   0xffffffff,0xffffffff
> >> -       .type   .L_2il0floatpacket.15,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> >> index cdcb16087d..b7d79efb54 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> >> @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx)
> >>    X = X - Y*PI1 - Y*PI2 - Y*PI3
> >>   */
> >>          vmovaps   %zmm0, %zmm6
> >> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
> >> +        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
> >
> >
> > Also vpandn below:
> > vpandnd   %zmm1, %zmm1, %zmm12{%k1}
> >
> >>
> >>          vmovups __sRShifter(%rax), %zmm3
> >>          vmovups __sPI1_FMA(%rax), %zmm5
> >>          vmovups __sA9_FMA(%rax), %zmm9
> >> @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx)
> >>          vmovss    %xmm0, 1216(%rsp,%r15,8)
> >>          jmp       .LBL_2_7
> >>  END (_ZGVeN16v_cosf_skx)
> >> -
> >> -       .section .rodata, "a"
> >> -.L_2il0floatpacket.13:
> >> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> -       .type   .L_2il0floatpacket.13,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> >> index 1b09909344..9f03b9b780 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> >> @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx)
> >>          vmovaps   %zmm0, %zmm7
> >>
> >>  /* compare against threshold */
> >> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
> >> +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
> >
> >
> > Also below:
> > vpandnd   %zmm2, %zmm2, %zmm3{%k1}
> >>
> >>          vmovups __sInvLn2(%rax), %zmm4
> >>          vmovups __sShifter(%rax), %zmm1
> >>          vmovups __sLn2hi(%rax), %zmm6
> >> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx)
> >>          jmp       .LBL_2_7
> >>
> >>  END (_ZGVeN16v_expf_skx)
> >> -
> >> -       .section .rodata, "a"
> >> -.L_2il0floatpacket.13:
> >> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> -       .type   .L_2il0floatpacket.13,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> >> index 4a7b2adbbf..2ba38b0f33 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> >> @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx)
> >>          andq      $-64, %rsp
> >>          subq      $1280, %rsp
> >>          movq      __svml_slog_data@GOTPCREL(%rip), %rax
> >> -        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
> >> +        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
> >
> > Also below:
> > vpandnd   %zmm1, %zmm1, %zmm6{%k1}
> >
> >>
> >>          vmovups _iBrkValue(%rax), %zmm4
> >>          vmovups _sPoly_7(%rax), %zmm8
> >>
> >> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx)
> >>          jmp       .LBL_2_7
> >>
> >>  END (_ZGVeN16v_logf_skx)
> >> -
> >> -       .section .rodata, "a"
> >> -.L_2il0floatpacket.7:
> >> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> -       .type   .L_2il0floatpacket.7,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> >> index 7f906622a5..7f0272c809 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> >> @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> >>          vpsrlq    $32, %zmm3, %zmm2
> >>          vpmovqd   %zmm2, %ymm11
> >>          vcvtps2pd %ymm14, %zmm13
> >> -        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
> >> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> >
> > earlier
> > kxnorw    %k3, %k3, %k3
> > can be used to get a dependency break with this:
> > vpmovm2d    %k3, %zmm14
>
> The SVML codes can use some improvements.   Can you
> open a separate glibc bug?  I'd like to address only the all-1s
> load here to avoid more complexity.
>
> Thanks.
>

Alright.

Okay with this patch.


>
> >>          vmovaps   %zmm14, %zmm26
> >>          vpandd _ABSMASK(%rax), %zmm1, %zmm8
> >>          vpcmpd    $1, _INF(%rax), %zmm8, %k2
> >> @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> >>          vpmovqd   %zmm11, %ymm5
> >>          vpxord    %zmm10, %zmm10, %zmm10
> >>          vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
> >> -        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
> >> +        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
> >>          vpxord    %zmm11, %zmm11, %zmm11
> >>          vcvtdq2pd %ymm7, %zmm7
> >>          vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
> >> @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx)
> >>          vmovss    %xmm0, 1216(%rsp,%r15,8)
> >>          jmp       .LBL_2_7
> >>  END (_ZGVeN16vv_powf_skx)
> >> -
> >> -       .section .rodata, "a"
> >> -.L_2il0floatpacket.23:
> >> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> -       .type   .L_2il0floatpacket.23,@object
> >> -.L_2il0floatpacket.24:
> >> -       .long   0xffffffff,0xffffffff
> >> -       .type   .L_2il0floatpacket.24,@object
> >> diff --git
> a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> >> index 54cee3a537..e1d0154441 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> >> @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx)
> >>
> >>  /* Result sign calculations */
> >>          vpternlogd $150, %zmm0, %zmm14, %zmm1
> >> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
> >> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> >>
> >>  /* Add correction term 0.5 for cos() part */
> >>          vaddps    %zmm8, %zmm5, %zmm15
> >> @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl)
> >>  ENTRY (_ZGVeN16vvv_sincosf_skx)
> >>  WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
> >>  END (_ZGVeN16vvv_sincosf_skx)
> >> -
> >> -       .section .rodata, "a"
> >> -.L_2il0floatpacket.13:
> >> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> -       .type   .L_2il0floatpacket.13,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> >> index ec65ffdce5..bcb76ff756 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> >> @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx)
> >>          movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
> >>
> >>  /* Check for large and special values */
> >> -        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
> >> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> >
> > Also below:
> > vpandnd   %zmm2, %zmm2, %zmm14{%k1}
> >>
> >>          vmovups __sAbsMask(%rax), %zmm5
> >>          vmovups __sInvPI(%rax), %zmm1
> >>          vmovups __sRShifter(%rax), %zmm2
> >> @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx)
> >>          vmovss    %xmm0, 1216(%rsp,%r15,8)
> >>          jmp       .LBL_2_7
> >>  END (_ZGVeN16v_sinf_skx)
> >> -
> >> -       .section .rodata, "a"
> >> -.L_2il0floatpacket.11:
> >> -       .long
>  0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> -       .type   .L_2il0floatpacket.11,@object
> >> --
> >> 2.31.1
> >>
>
>
> --
> H.J.
>
  
Sunil Pandey April 22, 2022, 9:39 p.m. UTC | #4
On Sat, Aug 21, 2021 at 9:37 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Optimize loads of all bits set into ZMM register in AVX512 SVML codes
> by replacing
>
>         vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
>
> and
>
>         vmovups   .L_2il0floatpacket.13(%rip), %zmmX
>
> with
>         vpternlogd $0xff, %zmmX, %zmmX, %zmmX
>
> This fixes BZ #28252.
> ---
>  .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S   |  7 +------
>  .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S   |  7 +------
>  .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S   |  7 +------
>  .../fpu/multiarch/svml_d_sincos8_core_avx512.S       |  7 +------
>  .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S |  7 +------
>  .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S |  7 +------
>  .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S |  7 +------
>  .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
>  .../fpu/multiarch/svml_s_sincosf16_core_avx512.S     |  7 +------
>  .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S |  7 +------
>  10 files changed, 11 insertions(+), 64 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> index c2cf007904..0fcb912557 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx)
>          vmovaps   %zmm0, %zmm8
>
>  /* Check for large arguments path */
> -        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
> +        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
>
>  /*
>    ARGUMENT RANGE REDUCTION:
> @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx)
>          vmovsd    %xmm0, 1216(%rsp,%r15)
>          jmp       .LBL_2_7
>  END (_ZGVeN8v_cos_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.16:
> -       .long   0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.16,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> index e9a5d00992..5596c950ce 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx)
>
>  /* preserve mantissa, set input exponent to 2^(-10) */
>          vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
> -        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
> +        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
>          vpsrlq    $32, %zmm4, %zmm6
>
>  /* reciprocal approximation good to at least 11 bits */
> @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx)
>          vmovsd    %xmm0, 1216(%rsp,%r15)
>          jmp       .LBL_2_7
>  END (_ZGVeN8v_log_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.12:
> -       .long   0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.12,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> index 508da563fe..2981f1582e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx)
>          andq      $-64, %rsp
>          subq      $1280, %rsp
>          movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
> -        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
> +        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
>          vmovups __dAbsMask(%rax), %zmm7
>          vmovups __dInvPI(%rax), %zmm2
>          vmovups __dRShifter(%rax), %zmm1
> @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx)
>          vmovsd    %xmm0, 1216(%rsp,%r15)
>          jmp       .LBL_2_7
>  END (_ZGVeN8v_sin_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.14:
> -       .long   0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.14,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> index 965415f2bd..4ad366373b 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx)
>
>  /* SinPoly = SinR*SinPoly */
>          vfmadd213pd %zmm5, %zmm5, %zmm4
> -        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
> +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
>
>  /* Update Cos result's sign */
>          vxorpd    %zmm2, %zmm1, %zmm1
> @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl)
>  ENTRY (_ZGVeN8vvv_sincos_skx)
>  WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
>  END (_ZGVeN8vvv_sincos_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.15:
> -       .long   0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.15,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> index cdcb16087d..b7d79efb54 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx)
>    X = X - Y*PI1 - Y*PI2 - Y*PI3
>   */
>          vmovaps   %zmm0, %zmm6
> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
> +        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
>          vmovups __sRShifter(%rax), %zmm3
>          vmovups __sPI1_FMA(%rax), %zmm5
>          vmovups __sA9_FMA(%rax), %zmm9
> @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx)
>          vmovss    %xmm0, 1216(%rsp,%r15,8)
>          jmp       .LBL_2_7
>  END (_ZGVeN16v_cosf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.13:
> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> index 1b09909344..9f03b9b780 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx)
>          vmovaps   %zmm0, %zmm7
>
>  /* compare against threshold */
> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
> +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
>          vmovups __sInvLn2(%rax), %zmm4
>          vmovups __sShifter(%rax), %zmm1
>          vmovups __sLn2hi(%rax), %zmm6
> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx)
>          jmp       .LBL_2_7
>
>  END (_ZGVeN16v_expf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.13:
> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> index 4a7b2adbbf..2ba38b0f33 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx)
>          andq      $-64, %rsp
>          subq      $1280, %rsp
>          movq      __svml_slog_data@GOTPCREL(%rip), %rax
> -        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
> +        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
>          vmovups _iBrkValue(%rax), %zmm4
>          vmovups _sPoly_7(%rax), %zmm8
>
> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx)
>          jmp       .LBL_2_7
>
>  END (_ZGVeN16v_logf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.7:
> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.7,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> index 7f906622a5..7f0272c809 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
>          vpsrlq    $32, %zmm3, %zmm2
>          vpmovqd   %zmm2, %ymm11
>          vcvtps2pd %ymm14, %zmm13
> -        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>          vmovaps   %zmm14, %zmm26
>          vpandd _ABSMASK(%rax), %zmm1, %zmm8
>          vpcmpd    $1, _INF(%rax), %zmm8, %k2
> @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
>          vpmovqd   %zmm11, %ymm5
>          vpxord    %zmm10, %zmm10, %zmm10
>          vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
> -        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
> +        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
>          vpxord    %zmm11, %zmm11, %zmm11
>          vcvtdq2pd %ymm7, %zmm7
>          vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
> @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx)
>          vmovss    %xmm0, 1216(%rsp,%r15,8)
>          jmp       .LBL_2_7
>  END (_ZGVeN16vv_powf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.23:
> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.23,@object
> -.L_2il0floatpacket.24:
> -       .long   0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.24,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> index 54cee3a537..e1d0154441 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx)
>
>  /* Result sign calculations */
>          vpternlogd $150, %zmm0, %zmm14, %zmm1
> -        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
>  /* Add correction term 0.5 for cos() part */
>          vaddps    %zmm8, %zmm5, %zmm15
> @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl)
>  ENTRY (_ZGVeN16vvv_sincosf_skx)
>  WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
>  END (_ZGVeN16vvv_sincosf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.13:
> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> index ec65ffdce5..bcb76ff756 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx)
>          movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
>
>  /* Check for large and special values */
> -        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
> +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>          vmovups __sAbsMask(%rax), %zmm5
>          vmovups __sInvPI(%rax), %zmm1
>          vmovups __sRShifter(%rax), %zmm2
> @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx)
>          vmovss    %xmm0, 1216(%rsp,%r15,8)
>          jmp       .LBL_2_7
>  END (_ZGVeN16v_sinf_skx)
> -
> -       .section .rodata, "a"
> -.L_2il0floatpacket.11:
> -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> -       .type   .L_2il0floatpacket.11,@object
> --
> 2.31.1
>

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
  
Noah Goldstein April 22, 2022, 9:42 p.m. UTC | #5
On Fri, Apr 22, 2022 at 4:40 PM Sunil Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Sat, Aug 21, 2021 at 9:37 AM H.J. Lu via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > Optimize loads of all bits set into ZMM register in AVX512 SVML codes
> > by replacing
> >
> >         vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
> >
> > and
> >
> >         vmovups   .L_2il0floatpacket.13(%rip), %zmmX
> >
> > with
> >         vpternlogd $0xff, %zmmX, %zmmX, %zmmX
> >
> > This fixes BZ #28252.
> > ---
> >  .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S   |  7 +------
> >  .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S   |  7 +------
> >  .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S   |  7 +------
> >  .../fpu/multiarch/svml_d_sincos8_core_avx512.S       |  7 +------
> >  .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S |  7 +------
> >  .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S |  7 +------
> >  .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S |  7 +------
> >  .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
> >  .../fpu/multiarch/svml_s_sincosf16_core_avx512.S     |  7 +------
> >  .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S |  7 +------
> >  10 files changed, 11 insertions(+), 64 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> > index c2cf007904..0fcb912557 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> > @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx)
> >          vmovaps   %zmm0, %zmm8
> >
> >  /* Check for large arguments path */
> > -        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
> > +        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
> >
> >  /*
> >    ARGUMENT RANGE REDUCTION:
> > @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx)
> >          vmovsd    %xmm0, 1216(%rsp,%r15)
> >          jmp       .LBL_2_7
> >  END (_ZGVeN8v_cos_skx)
> > -
> > -       .section .rodata, "a"
> > -.L_2il0floatpacket.16:
> > -       .long   0xffffffff,0xffffffff
> > -       .type   .L_2il0floatpacket.16,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> > index e9a5d00992..5596c950ce 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> > @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx)
> >
> >  /* preserve mantissa, set input exponent to 2^(-10) */
> >          vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
> > -        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
> > +        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
> >          vpsrlq    $32, %zmm4, %zmm6
> >
> >  /* reciprocal approximation good to at least 11 bits */
> > @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx)
> >          vmovsd    %xmm0, 1216(%rsp,%r15)
> >          jmp       .LBL_2_7
> >  END (_ZGVeN8v_log_skx)
> > -
> > -       .section .rodata, "a"
> > -.L_2il0floatpacket.12:
> > -       .long   0xffffffff,0xffffffff
> > -       .type   .L_2il0floatpacket.12,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> > index 508da563fe..2981f1582e 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> > @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx)
> >          andq      $-64, %rsp
> >          subq      $1280, %rsp
> >          movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
> > -        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
> > +        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
> >          vmovups __dAbsMask(%rax), %zmm7
> >          vmovups __dInvPI(%rax), %zmm2
> >          vmovups __dRShifter(%rax), %zmm1
> > @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx)
> >          vmovsd    %xmm0, 1216(%rsp,%r15)
> >          jmp       .LBL_2_7
> >  END (_ZGVeN8v_sin_skx)
> > -
> > -       .section .rodata, "a"
> > -.L_2il0floatpacket.14:
> > -       .long   0xffffffff,0xffffffff
> > -       .type   .L_2il0floatpacket.14,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> > index 965415f2bd..4ad366373b 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> > @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx)
> >
> >  /* SinPoly = SinR*SinPoly */
> >          vfmadd213pd %zmm5, %zmm5, %zmm4
> > -        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
> > +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
> >
> >  /* Update Cos result's sign */
> >          vxorpd    %zmm2, %zmm1, %zmm1
> > @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl)
> >  ENTRY (_ZGVeN8vvv_sincos_skx)
> >  WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
> >  END (_ZGVeN8vvv_sincos_skx)
> > -
> > -       .section .rodata, "a"
> > -.L_2il0floatpacket.15:
> > -       .long   0xffffffff,0xffffffff
> > -       .type   .L_2il0floatpacket.15,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> > index cdcb16087d..b7d79efb54 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> > @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx)
> >    X = X - Y*PI1 - Y*PI2 - Y*PI3
> >   */
> >          vmovaps   %zmm0, %zmm6
> > -        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
> > +        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
> >          vmovups __sRShifter(%rax), %zmm3
> >          vmovups __sPI1_FMA(%rax), %zmm5
> >          vmovups __sA9_FMA(%rax), %zmm9
> > @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx)
> >          vmovss    %xmm0, 1216(%rsp,%r15,8)
> >          jmp       .LBL_2_7
> >  END (_ZGVeN16v_cosf_skx)
> > -
> > -       .section .rodata, "a"
> > -.L_2il0floatpacket.13:
> > -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > -       .type   .L_2il0floatpacket.13,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> > index 1b09909344..9f03b9b780 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> > @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx)
> >          vmovaps   %zmm0, %zmm7
> >
> >  /* compare against threshold */
> > -        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
> > +        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
> >          vmovups __sInvLn2(%rax), %zmm4
> >          vmovups __sShifter(%rax), %zmm1
> >          vmovups __sLn2hi(%rax), %zmm6
> > @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx)
> >          jmp       .LBL_2_7
> >
> >  END (_ZGVeN16v_expf_skx)
> > -
> > -       .section .rodata, "a"
> > -.L_2il0floatpacket.13:
> > -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > -       .type   .L_2il0floatpacket.13,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> > index 4a7b2adbbf..2ba38b0f33 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> > @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx)
> >          andq      $-64, %rsp
> >          subq      $1280, %rsp
> >          movq      __svml_slog_data@GOTPCREL(%rip), %rax
> > -        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
> > +        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
> >          vmovups _iBrkValue(%rax), %zmm4
> >          vmovups _sPoly_7(%rax), %zmm8
> >
> > @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx)
> >          jmp       .LBL_2_7
> >
> >  END (_ZGVeN16v_logf_skx)
> > -
> > -       .section .rodata, "a"
> > -.L_2il0floatpacket.7:
> > -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > -       .type   .L_2il0floatpacket.7,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> > index 7f906622a5..7f0272c809 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> > @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> >          vpsrlq    $32, %zmm3, %zmm2
> >          vpmovqd   %zmm2, %ymm11
> >          vcvtps2pd %ymm14, %zmm13
> > -        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
> > +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> >          vmovaps   %zmm14, %zmm26
> >          vpandd _ABSMASK(%rax), %zmm1, %zmm8
> >          vpcmpd    $1, _INF(%rax), %zmm8, %k2
> > @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> >          vpmovqd   %zmm11, %ymm5
> >          vpxord    %zmm10, %zmm10, %zmm10
> >          vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
> > -        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
> > +        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
> >          vpxord    %zmm11, %zmm11, %zmm11
> >          vcvtdq2pd %ymm7, %zmm7
> >          vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
> > @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx)
> >          vmovss    %xmm0, 1216(%rsp,%r15,8)
> >          jmp       .LBL_2_7
> >  END (_ZGVeN16vv_powf_skx)
> > -
> > -       .section .rodata, "a"
> > -.L_2il0floatpacket.23:
> > -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > -       .type   .L_2il0floatpacket.23,@object
> > -.L_2il0floatpacket.24:
> > -       .long   0xffffffff,0xffffffff
> > -       .type   .L_2il0floatpacket.24,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> > index 54cee3a537..e1d0154441 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> > @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx)
> >
> >  /* Result sign calculations */
> >          vpternlogd $150, %zmm0, %zmm14, %zmm1
> > -        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
> > +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> >
> >  /* Add correction term 0.5 for cos() part */
> >          vaddps    %zmm8, %zmm5, %zmm15
> > @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl)
> >  ENTRY (_ZGVeN16vvv_sincosf_skx)
> >  WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
> >  END (_ZGVeN16vvv_sincosf_skx)
> > -
> > -       .section .rodata, "a"
> > -.L_2il0floatpacket.13:
> > -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > -       .type   .L_2il0floatpacket.13,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> > index ec65ffdce5..bcb76ff756 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> > @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx)
> >          movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
> >
> >  /* Check for large and special values */
> > -        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
> > +        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> >          vmovups __sAbsMask(%rax), %zmm5
> >          vmovups __sInvPI(%rax), %zmm1
> >          vmovups __sRShifter(%rax), %zmm2
> > @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx)
> >          vmovss    %xmm0, 1216(%rsp,%r15,8)
> >          jmp       .LBL_2_7
> >  END (_ZGVeN16v_sinf_skx)
> > -
> > -       .section .rodata, "a"
> > -.L_2il0floatpacket.11:
> > -       .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > -       .type   .L_2il0floatpacket.11,@object
> > --
> > 2.31.1
> >
>
> I would like to backport this patch to release branches.
> Any comments or objections?
None by me
>
> --Sunil
  

Patch

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
index c2cf007904..0fcb912557 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
@@ -258,7 +258,7 @@  ENTRY (_ZGVeN8v_cos_skx)
         vmovaps   %zmm0, %zmm8
 
 /* Check for large arguments path */
-        vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
+        vpternlogd $0xff, %zmm2, %zmm2, %zmm2
 
 /*
   ARGUMENT RANGE REDUCTION:
@@ -448,8 +448,3 @@  ENTRY (_ZGVeN8v_cos_skx)
         vmovsd    %xmm0, 1216(%rsp,%r15)
         jmp       .LBL_2_7
 END (_ZGVeN8v_cos_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.16:
-	.long	0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.16,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
index e9a5d00992..5596c950ce 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
@@ -267,7 +267,7 @@  ENTRY (_ZGVeN8v_log_skx)
 
 /* preserve mantissa, set input exponent to 2^(-10) */
         vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
-        vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
+        vpternlogd $0xff, %zmm1, %zmm1, %zmm1
         vpsrlq    $32, %zmm4, %zmm6
 
 /* reciprocal approximation good to at least 11 bits */
@@ -453,8 +453,3 @@  ENTRY (_ZGVeN8v_log_skx)
         vmovsd    %xmm0, 1216(%rsp,%r15)
         jmp       .LBL_2_7
 END (_ZGVeN8v_log_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.12:
-	.long	0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.12,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
index 508da563fe..2981f1582e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
@@ -254,7 +254,7 @@  ENTRY (_ZGVeN8v_sin_skx)
         andq      $-64, %rsp
         subq      $1280, %rsp
         movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
-        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
+        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
         vmovups __dAbsMask(%rax), %zmm7
         vmovups __dInvPI(%rax), %zmm2
         vmovups __dRShifter(%rax), %zmm1
@@ -450,8 +450,3 @@  ENTRY (_ZGVeN8v_sin_skx)
         vmovsd    %xmm0, 1216(%rsp,%r15)
         jmp       .LBL_2_7
 END (_ZGVeN8v_sin_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.14:
-	.long	0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.14,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
index 965415f2bd..4ad366373b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -423,7 +423,7 @@  ENTRY (_ZGVeN8vl8l8_sincos_skx)
 
 /* SinPoly = SinR*SinPoly */
         vfmadd213pd %zmm5, %zmm5, %zmm4
-        vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
+        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
 
 /* Update Cos result's sign */
         vxorpd    %zmm2, %zmm1, %zmm1
@@ -733,8 +733,3 @@  END (_ZGVeN8vvv_sincos_knl)
 ENTRY (_ZGVeN8vvv_sincos_skx)
 WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
 END (_ZGVeN8vvv_sincos_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.15:
-	.long	0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.15,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
index cdcb16087d..b7d79efb54 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
@@ -271,7 +271,7 @@  ENTRY (_ZGVeN16v_cosf_skx)
   X = X - Y*PI1 - Y*PI2 - Y*PI3
  */
         vmovaps   %zmm0, %zmm6
-        vmovups   .L_2il0floatpacket.13(%rip), %zmm12
+        vpternlogd $0xff, %zmm12, %zmm12, %zmm12
         vmovups __sRShifter(%rax), %zmm3
         vmovups __sPI1_FMA(%rax), %zmm5
         vmovups __sA9_FMA(%rax), %zmm9
@@ -445,8 +445,3 @@  ENTRY (_ZGVeN16v_cosf_skx)
         vmovss    %xmm0, 1216(%rsp,%r15,8)
         jmp       .LBL_2_7
 END (_ZGVeN16v_cosf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.13:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index 1b09909344..9f03b9b780 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -257,7 +257,7 @@  ENTRY (_ZGVeN16v_expf_skx)
         vmovaps   %zmm0, %zmm7
 
 /* compare against threshold */
-        vmovups   .L_2il0floatpacket.13(%rip), %zmm3
+        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
         vmovups __sInvLn2(%rax), %zmm4
         vmovups __sShifter(%rax), %zmm1
         vmovups __sLn2hi(%rax), %zmm6
@@ -432,8 +432,3 @@  ENTRY (_ZGVeN16v_expf_skx)
         jmp       .LBL_2_7
 
 END (_ZGVeN16v_expf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.13:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
index 4a7b2adbbf..2ba38b0f33 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
@@ -228,7 +228,7 @@  ENTRY (_ZGVeN16v_logf_skx)
         andq      $-64, %rsp
         subq      $1280, %rsp
         movq      __svml_slog_data@GOTPCREL(%rip), %rax
-        vmovups   .L_2il0floatpacket.7(%rip), %zmm6
+        vpternlogd $0xff, %zmm6, %zmm6, %zmm6
         vmovups _iBrkValue(%rax), %zmm4
         vmovups _sPoly_7(%rax), %zmm8
 
@@ -401,8 +401,3 @@  ENTRY (_ZGVeN16v_logf_skx)
         jmp       .LBL_2_7
 
 END (_ZGVeN16v_logf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.7:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.7,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
index 7f906622a5..7f0272c809 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
@@ -378,7 +378,7 @@  ENTRY (_ZGVeN16vv_powf_skx)
         vpsrlq    $32, %zmm3, %zmm2
         vpmovqd   %zmm2, %ymm11
         vcvtps2pd %ymm14, %zmm13
-        vmovups   .L_2il0floatpacket.23(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
         vmovaps   %zmm14, %zmm26
         vpandd _ABSMASK(%rax), %zmm1, %zmm8
         vpcmpd    $1, _INF(%rax), %zmm8, %k2
@@ -420,7 +420,7 @@  ENTRY (_ZGVeN16vv_powf_skx)
         vpmovqd   %zmm11, %ymm5
         vpxord    %zmm10, %zmm10, %zmm10
         vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
-        vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
+        vpternlogd $0xff, %zmm4, %zmm4, %zmm4
         vpxord    %zmm11, %zmm11, %zmm11
         vcvtdq2pd %ymm7, %zmm7
         vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
@@ -635,11 +635,3 @@  ENTRY (_ZGVeN16vv_powf_skx)
         vmovss    %xmm0, 1216(%rsp,%r15,8)
         jmp       .LBL_2_7
 END (_ZGVeN16vv_powf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.23:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.23,@object
-.L_2il0floatpacket.24:
-	.long	0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.24,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
index 54cee3a537..e1d0154441 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -310,7 +310,7 @@  ENTRY (_ZGVeN16vl4l4_sincosf_skx)
 
 /* Result sign calculations */
         vpternlogd $150, %zmm0, %zmm14, %zmm1
-        vmovups   .L_2il0floatpacket.13(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
 
 /* Add correction term 0.5 for cos() part */
         vaddps    %zmm8, %zmm5, %zmm15
@@ -740,8 +740,3 @@  END (_ZGVeN16vvv_sincosf_knl)
 ENTRY (_ZGVeN16vvv_sincosf_skx)
 WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
 END (_ZGVeN16vvv_sincosf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.13:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
index ec65ffdce5..bcb76ff756 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
@@ -273,7 +273,7 @@  ENTRY (_ZGVeN16v_sinf_skx)
         movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
 
 /* Check for large and special values */
-        vmovups   .L_2il0floatpacket.11(%rip), %zmm14
+        vpternlogd $0xff, %zmm14, %zmm14, %zmm14
         vmovups __sAbsMask(%rax), %zmm5
         vmovups __sInvPI(%rax), %zmm1
         vmovups __sRShifter(%rax), %zmm2
@@ -464,8 +464,3 @@  ENTRY (_ZGVeN16v_sinf_skx)
         vmovss    %xmm0, 1216(%rsp,%r15,8)
         jmp       .LBL_2_7
 END (_ZGVeN16v_sinf_skx)
-
-	.section .rodata, "a"
-.L_2il0floatpacket.11:
-	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
-	.type	.L_2il0floatpacket.11,@object