x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
Checks

| Context               | Check   | Description                                      |
|-----------------------|---------|--------------------------------------------------|
| dj/TryBot-apply_patch | success | Patch applied to master at the time it was sent  |
| dj/TryBot-32bit       | success | Build for i686                                   |
Commit Message
Optimize loads of all bits set into ZMM register in AVX512 SVML codes
by replacing
vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
and
vmovups .L_2il0floatpacket.13(%rip), %zmmX
with
vpternlogd $0xff, %zmmX, %zmmX, %zmmX
This fixes BZ #28252.
---
.../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------
.../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
.../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------
10 files changed, 11 insertions(+), 64 deletions(-)
Comments
On Sat, Aug 21, 2021 at 12:36 PM H.J. Lu via Libc-alpha <
libc-alpha@sourceware.org> wrote:
> Optimize loads of all bits set into ZMM register in AVX512 SVML codes
> by replacing
>
> vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
>
> and
>
> vmovups .L_2il0floatpacket.13(%rip), %zmmX
>
> with
> vpternlogd $0xff, %zmmX, %zmmX, %zmmX
>
> This fixes BZ #28252.
> ---
> .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------
> .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
> .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------
> 10 files changed, 11 insertions(+), 64 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> index c2cf007904..0fcb912557 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx)
> vmovaps %zmm0, %zmm8
>
> /* Check for large arguments path */
> - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
> + vpternlogd $0xff, %zmm2, %zmm2, %zmm2
>
Looking at the code it seems like this is used later by
vpandnq %zmm1, %zmm1, %zmm2{%k1}
AFAICT you can make the vpternlogd down there and just use
vpternlogq $0xff, %zmm1, %zmm1, %zmm2{%k1}{z}
>
> /*
> ARGUMENT RANGE REDUCTION:
> @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx)
> vmovsd %xmm0, 1216(%rsp,%r15)
> jmp .LBL_2_7
> END (_ZGVeN8v_cos_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.16:
> - .long 0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.16,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> index e9a5d00992..5596c950ce 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx)
>
> /* preserve mantissa, set input exponent to 2^(-10) */
> vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
> - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
> + vpternlogd $0xff, %zmm1, %zmm1, %zmm1
>
Earlier in the function there is a dependency breaking
kxnorw %k3, %k3, %k3
so I think you can accomplish the same thing but breaking
some unlucky dep chain with:
vpmovm2d %k3, %zmm2
vpsrlq $32, %zmm4, %zmm6
>
> /* reciprocal approximation good to at least 11 bits */
> @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx)
> vmovsd %xmm0, 1216(%rsp,%r15)
> jmp .LBL_2_7
> END (_ZGVeN8v_log_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.12:
> - .long 0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.12,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> index 508da563fe..2981f1582e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx)
> andq $-64, %rsp
> subq $1280, %rsp
> movq __svml_d_trig_data@GOTPCREL(%rip), %rax
> - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
> + vpternlogd $0xff, %zmm1, %zmm1, %zmm14
>
This one also seems to just be used by an vpandn later on:
vpandnq %zmm13, %zmm13, %zmm14{%k1}
so maybe:
vpternlogq $0xff, %zmm13, %zmm13, %zmm14{%k1}{z}
instead of the vpandn.
vmovups __dAbsMask(%rax), %zmm7
> vmovups __dInvPI(%rax), %zmm2
> vmovups __dRShifter(%rax), %zmm1
> @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx)
> vmovsd %xmm0, 1216(%rsp,%r15)
> jmp .LBL_2_7
> END (_ZGVeN8v_sin_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.14:
> - .long 0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.14,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> index 965415f2bd..4ad366373b 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx)
>
> /* SinPoly = SinR*SinPoly */
> vfmadd213pd %zmm5, %zmm5, %zmm4
> - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3
>
Also vpandn below:
vpandnq %zmm7, %zmm7, %zmm3{%k1}
> /* Update Cos result's sign */
> vxorpd %zmm2, %zmm1, %zmm1
> @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl)
> ENTRY (_ZGVeN8vvv_sincos_skx)
> WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
> END (_ZGVeN8vvv_sincos_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.15:
> - .long 0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.15,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> index cdcb16087d..b7d79efb54 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx)
> X = X - Y*PI1 - Y*PI2 - Y*PI3
> */
> vmovaps %zmm0, %zmm6
> - vmovups .L_2il0floatpacket.13(%rip), %zmm12
> + vpternlogd $0xff, %zmm12, %zmm12, %zmm12
>
Also vpandn below:
vpandnd %zmm1, %zmm1, %zmm12{%k1}
> vmovups __sRShifter(%rax), %zmm3
> vmovups __sPI1_FMA(%rax), %zmm5
> vmovups __sA9_FMA(%rax), %zmm9
> @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx)
> vmovss %xmm0, 1216(%rsp,%r15,8)
> jmp .LBL_2_7
> END (_ZGVeN16v_cosf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.13:
> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> index 1b09909344..9f03b9b780 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx)
> vmovaps %zmm0, %zmm7
>
> /* compare against threshold */
> - vmovups .L_2il0floatpacket.13(%rip), %zmm3
> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3
>
Also below:
vpandnd %zmm2, %zmm2, %zmm3{%k1}
> vmovups __sInvLn2(%rax), %zmm4
> vmovups __sShifter(%rax), %zmm1
> vmovups __sLn2hi(%rax), %zmm6
> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx)
> jmp .LBL_2_7
>
> END (_ZGVeN16v_expf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.13:
> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> index 4a7b2adbbf..2ba38b0f33 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx)
> andq $-64, %rsp
> subq $1280, %rsp
> movq __svml_slog_data@GOTPCREL(%rip), %rax
> - vmovups .L_2il0floatpacket.7(%rip), %zmm6
> + vpternlogd $0xff, %zmm6, %zmm6, %zmm6
>
Also below:
vpandnd %zmm1, %zmm1, %zmm6{%k1}
> vmovups _iBrkValue(%rax), %zmm4
> vmovups _sPoly_7(%rax), %zmm8
>
> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx)
> jmp .LBL_2_7
>
> END (_ZGVeN16v_logf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.7:
> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.7,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> index 7f906622a5..7f0272c809 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> vpsrlq $32, %zmm3, %zmm2
> vpmovqd %zmm2, %ymm11
> vcvtps2pd %ymm14, %zmm13
> - vmovups .L_2il0floatpacket.23(%rip), %zmm14
> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
earlier
kxnorw %k3, %k3, %k3
can be used to get a dependency break with this:
vpmovm2d %k3, %zmm14
> vmovaps %zmm14, %zmm26
> vpandd _ABSMASK(%rax), %zmm1, %zmm8
> vpcmpd $1, _INF(%rax), %zmm8, %k2
> @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> vpmovqd %zmm11, %ymm5
> vpxord %zmm10, %zmm10, %zmm10
> vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
> - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
> + vpternlogd $0xff, %zmm4, %zmm4, %zmm4
> vpxord %zmm11, %zmm11, %zmm11
> vcvtdq2pd %ymm7, %zmm7
> vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
> @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx)
> vmovss %xmm0, 1216(%rsp,%r15,8)
> jmp .LBL_2_7
> END (_ZGVeN16vv_powf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.23:
> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.23,@object
> -.L_2il0floatpacket.24:
> - .long 0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.24,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> index 54cee3a537..e1d0154441 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx)
>
> /* Result sign calculations */
> vpternlogd $150, %zmm0, %zmm14, %zmm1
> - vmovups .L_2il0floatpacket.13(%rip), %zmm14
> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
> /* Add correction term 0.5 for cos() part */
> vaddps %zmm8, %zmm5, %zmm15
> @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl)
> ENTRY (_ZGVeN16vvv_sincosf_skx)
> WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
> END (_ZGVeN16vvv_sincosf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.13:
> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> index ec65ffdce5..bcb76ff756 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx)
> movq __svml_s_trig_data@GOTPCREL(%rip), %rax
>
> /* Check for large and special values */
> - vmovups .L_2il0floatpacket.11(%rip), %zmm14
> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
Also below:
vpandnd %zmm2, %zmm2, %zmm14{%k1}
> vmovups __sAbsMask(%rax), %zmm5
> vmovups __sInvPI(%rax), %zmm1
> vmovups __sRShifter(%rax), %zmm2
> @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx)
> vmovss %xmm0, 1216(%rsp,%r15,8)
> jmp .LBL_2_7
> END (_ZGVeN16v_sinf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.11:
> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.11,@object
> --
> 2.31.1
>
>
On Sat, Aug 21, 2021 at 10:49 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
>
>
> On Sat, Aug 21, 2021 at 12:36 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote:
>>
>> Optimize loads of all bits set into ZMM register in AVX512 SVML codes
>> by replacing
>>
>> vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
>>
>> and
>>
>> vmovups .L_2il0floatpacket.13(%rip), %zmmX
>>
>> with
>> vpternlogd $0xff, %zmmX, %zmmX, %zmmX
>>
>> This fixes BZ #28252.
>> ---
>> .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------
>> .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------
>> .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------
>> .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------
>> .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------
>> .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------
>> .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------
>> .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
>> .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------
>> .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------
>> 10 files changed, 11 insertions(+), 64 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
>> index c2cf007904..0fcb912557 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
>> @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx)
>> vmovaps %zmm0, %zmm8
>>
>> /* Check for large arguments path */
>> - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
>> + vpternlogd $0xff, %zmm2, %zmm2, %zmm2
>
> Looking at the code it seems like this is used later by
>
> vpandnq %zmm1, %zmm1, %zmm2{%k1}
>
> AFAICT you can make the vpternlogd down there and just use
>
> vpternlogq $0xff, %zmm1, %zmm1, %zmm2{%k1}{z}
>>
>>
>> /*
>> ARGUMENT RANGE REDUCTION:
>> @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx)
>> vmovsd %xmm0, 1216(%rsp,%r15)
>> jmp .LBL_2_7
>> END (_ZGVeN8v_cos_skx)
>> -
>> - .section .rodata, "a"
>> -.L_2il0floatpacket.16:
>> - .long 0xffffffff,0xffffffff
>> - .type .L_2il0floatpacket.16,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
>> index e9a5d00992..5596c950ce 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
>> @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx)
>>
>> /* preserve mantissa, set input exponent to 2^(-10) */
>> vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
>> - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
>> + vpternlogd $0xff, %zmm1, %zmm1, %zmm1
>
>
> Earlier in the function there is a dependency breaking
>
> kxnorw %k3, %k3, %k3
>
> so I think you can accomplish the same thing but breaking
> some unlucky dep chain with:
>
> vpmovm2d %k3, %zmm2
>
>> vpsrlq $32, %zmm4, %zmm6
>>
>> /* reciprocal approximation good to at least 11 bits */
>> @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx)
>> vmovsd %xmm0, 1216(%rsp,%r15)
>> jmp .LBL_2_7
>> END (_ZGVeN8v_log_skx)
>> -
>> - .section .rodata, "a"
>> -.L_2il0floatpacket.12:
>> - .long 0xffffffff,0xffffffff
>> - .type .L_2il0floatpacket.12,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
>> index 508da563fe..2981f1582e 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
>> @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx)
>> andq $-64, %rsp
>> subq $1280, %rsp
>> movq __svml_d_trig_data@GOTPCREL(%rip), %rax
>> - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
>> + vpternlogd $0xff, %zmm1, %zmm1, %zmm14
>
> This one also seems to just be used by an vpandn later on:
>
> vpandnq %zmm13, %zmm13, %zmm14{%k1}
>
> so maybe:
> vpternlogq $0xff, %zmm13, %zmm13, %zmm14{%k1}{z}
> instead of the vpandn.
>
>> vmovups __dAbsMask(%rax), %zmm7
>> vmovups __dInvPI(%rax), %zmm2
>> vmovups __dRShifter(%rax), %zmm1
>> @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx)
>> vmovsd %xmm0, 1216(%rsp,%r15)
>> jmp .LBL_2_7
>> END (_ZGVeN8v_sin_skx)
>> -
>> - .section .rodata, "a"
>> -.L_2il0floatpacket.14:
>> - .long 0xffffffff,0xffffffff
>> - .type .L_2il0floatpacket.14,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
>> index 965415f2bd..4ad366373b 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
>> @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx)
>>
>> /* SinPoly = SinR*SinPoly */
>> vfmadd213pd %zmm5, %zmm5, %zmm4
>> - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
>> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3
>
>
> Also vpandn below:
> vpandnq %zmm7, %zmm7, %zmm3{%k1}
>
>>
>> /* Update Cos result's sign */
>> vxorpd %zmm2, %zmm1, %zmm1
>> @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl)
>> ENTRY (_ZGVeN8vvv_sincos_skx)
>> WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
>> END (_ZGVeN8vvv_sincos_skx)
>> -
>> - .section .rodata, "a"
>> -.L_2il0floatpacket.15:
>> - .long 0xffffffff,0xffffffff
>> - .type .L_2il0floatpacket.15,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
>> index cdcb16087d..b7d79efb54 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
>> @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx)
>> X = X - Y*PI1 - Y*PI2 - Y*PI3
>> */
>> vmovaps %zmm0, %zmm6
>> - vmovups .L_2il0floatpacket.13(%rip), %zmm12
>> + vpternlogd $0xff, %zmm12, %zmm12, %zmm12
>
>
> Also vpandn below:
> vpandnd %zmm1, %zmm1, %zmm12{%k1}
>
>>
>> vmovups __sRShifter(%rax), %zmm3
>> vmovups __sPI1_FMA(%rax), %zmm5
>> vmovups __sA9_FMA(%rax), %zmm9
>> @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx)
>> vmovss %xmm0, 1216(%rsp,%r15,8)
>> jmp .LBL_2_7
>> END (_ZGVeN16v_cosf_skx)
>> -
>> - .section .rodata, "a"
>> -.L_2il0floatpacket.13:
>> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> - .type .L_2il0floatpacket.13,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
>> index 1b09909344..9f03b9b780 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
>> @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx)
>> vmovaps %zmm0, %zmm7
>>
>> /* compare against threshold */
>> - vmovups .L_2il0floatpacket.13(%rip), %zmm3
>> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3
>
>
> Also below:
> vpandnd %zmm2, %zmm2, %zmm3{%k1}
>>
>> vmovups __sInvLn2(%rax), %zmm4
>> vmovups __sShifter(%rax), %zmm1
>> vmovups __sLn2hi(%rax), %zmm6
>> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx)
>> jmp .LBL_2_7
>>
>> END (_ZGVeN16v_expf_skx)
>> -
>> - .section .rodata, "a"
>> -.L_2il0floatpacket.13:
>> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> - .type .L_2il0floatpacket.13,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
>> index 4a7b2adbbf..2ba38b0f33 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
>> @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx)
>> andq $-64, %rsp
>> subq $1280, %rsp
>> movq __svml_slog_data@GOTPCREL(%rip), %rax
>> - vmovups .L_2il0floatpacket.7(%rip), %zmm6
>> + vpternlogd $0xff, %zmm6, %zmm6, %zmm6
>
> Also below:
> vpandnd %zmm1, %zmm1, %zmm6{%k1}
>
>>
>> vmovups _iBrkValue(%rax), %zmm4
>> vmovups _sPoly_7(%rax), %zmm8
>>
>> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx)
>> jmp .LBL_2_7
>>
>> END (_ZGVeN16v_logf_skx)
>> -
>> - .section .rodata, "a"
>> -.L_2il0floatpacket.7:
>> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> - .type .L_2il0floatpacket.7,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
>> index 7f906622a5..7f0272c809 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
>> @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
>> vpsrlq $32, %zmm3, %zmm2
>> vpmovqd %zmm2, %ymm11
>> vcvtps2pd %ymm14, %zmm13
>> - vmovups .L_2il0floatpacket.23(%rip), %zmm14
>> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
> earlier
> kxnorw %k3, %k3, %k3
> can be used to get a dependency break with this:
> vpmovm2d %k3, %zmm14
The SVML codes can use some improvements.  Can you
open a separate glibc bug?  I'd like to address only the
all-1s load here to avoid more complexity.
Thanks.
>> vmovaps %zmm14, %zmm26
>> vpandd _ABSMASK(%rax), %zmm1, %zmm8
>> vpcmpd $1, _INF(%rax), %zmm8, %k2
>> @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
>> vpmovqd %zmm11, %ymm5
>> vpxord %zmm10, %zmm10, %zmm10
>> vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
>> - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
>> + vpternlogd $0xff, %zmm4, %zmm4, %zmm4
>> vpxord %zmm11, %zmm11, %zmm11
>> vcvtdq2pd %ymm7, %zmm7
>> vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
>> @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx)
>> vmovss %xmm0, 1216(%rsp,%r15,8)
>> jmp .LBL_2_7
>> END (_ZGVeN16vv_powf_skx)
>> -
>> - .section .rodata, "a"
>> -.L_2il0floatpacket.23:
>> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> - .type .L_2il0floatpacket.23,@object
>> -.L_2il0floatpacket.24:
>> - .long 0xffffffff,0xffffffff
>> - .type .L_2il0floatpacket.24,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
>> index 54cee3a537..e1d0154441 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
>> @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx)
>>
>> /* Result sign calculations */
>> vpternlogd $150, %zmm0, %zmm14, %zmm1
>> - vmovups .L_2il0floatpacket.13(%rip), %zmm14
>> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>>
>> /* Add correction term 0.5 for cos() part */
>> vaddps %zmm8, %zmm5, %zmm15
>> @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl)
>> ENTRY (_ZGVeN16vvv_sincosf_skx)
>> WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
>> END (_ZGVeN16vvv_sincosf_skx)
>> -
>> - .section .rodata, "a"
>> -.L_2il0floatpacket.13:
>> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> - .type .L_2il0floatpacket.13,@object
>> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
>> index ec65ffdce5..bcb76ff756 100644
>> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
>> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
>> @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx)
>> movq __svml_s_trig_data@GOTPCREL(%rip), %rax
>>
>> /* Check for large and special values */
>> - vmovups .L_2il0floatpacket.11(%rip), %zmm14
>> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
> Also below:
> vpandnd %zmm2, %zmm2, %zmm14{%k1}
>>
>> vmovups __sAbsMask(%rax), %zmm5
>> vmovups __sInvPI(%rax), %zmm1
>> vmovups __sRShifter(%rax), %zmm2
>> @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx)
>> vmovss %xmm0, 1216(%rsp,%r15,8)
>> jmp .LBL_2_7
>> END (_ZGVeN16v_sinf_skx)
>> -
>> - .section .rodata, "a"
>> -.L_2il0floatpacket.11:
>> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
>> - .type .L_2il0floatpacket.11,@object
>> --
>> 2.31.1
>>
On Sat, Aug 21, 2021 at 2:09 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, Aug 21, 2021 at 10:49 AM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
> >
> >
> >
> > On Sat, Aug 21, 2021 at 12:36 PM H.J. Lu via Libc-alpha <
> libc-alpha@sourceware.org> wrote:
> >>
> >> Optimize loads of all bits set into ZMM register in AVX512 SVML codes
> >> by replacing
> >>
> >> vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
> >>
> >> and
> >>
> >> vmovups .L_2il0floatpacket.13(%rip), %zmmX
> >>
> >> with
> >> vpternlogd $0xff, %zmmX, %zmmX, %zmmX
> >>
> >> This fixes BZ #28252.
> >> ---
> >> .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------
> >> .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------
> >> .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------
> >> .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------
> >> .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------
> >> .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------
> >> .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------
> >> .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
> >> .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------
> >> .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------
> >> 10 files changed, 11 insertions(+), 64 deletions(-)
> >>
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> >> index c2cf007904..0fcb912557 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> >> @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx)
> >> vmovaps %zmm0, %zmm8
> >>
> >> /* Check for large arguments path */
> >> - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
> >> + vpternlogd $0xff, %zmm2, %zmm2, %zmm2
> >
> > Looking at the code it seems like this is used later by
> >
> > vpandnq %zmm1, %zmm1, %zmm2{%k1}
> >
> > AFAICT you can make the vpternlogd down there and just use
> >
> > vpternlogq $0xff, %zmm1, %zmm1, %zmm2{%k1}{z}
> >>
> >>
> >> /*
> >> ARGUMENT RANGE REDUCTION:
> >> @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx)
> >> vmovsd %xmm0, 1216(%rsp,%r15)
> >> jmp .LBL_2_7
> >> END (_ZGVeN8v_cos_skx)
> >> -
> >> - .section .rodata, "a"
> >> -.L_2il0floatpacket.16:
> >> - .long 0xffffffff,0xffffffff
> >> - .type .L_2il0floatpacket.16,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> >> index e9a5d00992..5596c950ce 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> >> @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx)
> >>
> >> /* preserve mantissa, set input exponent to 2^(-10) */
> >> vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
> >> - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
> >> + vpternlogd $0xff, %zmm1, %zmm1, %zmm1
> >
> >
> > Earlier in the function there is a dependency breaking
> >
> > kxnorw %k3, %k3, %k3
> >
> > so I think you can accomplish the same thing but breaking
> > some unlucky dep chain with:
> >
> > vpmovm2d %k3, %zmm2
> >
> >> vpsrlq $32, %zmm4, %zmm6
> >>
> >> /* reciprocal approximation good to at least 11 bits */
> >> @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx)
> >> vmovsd %xmm0, 1216(%rsp,%r15)
> >> jmp .LBL_2_7
> >> END (_ZGVeN8v_log_skx)
> >> -
> >> - .section .rodata, "a"
> >> -.L_2il0floatpacket.12:
> >> - .long 0xffffffff,0xffffffff
> >> - .type .L_2il0floatpacket.12,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> >> index 508da563fe..2981f1582e 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> >> @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx)
> >> andq $-64, %rsp
> >> subq $1280, %rsp
> >> movq __svml_d_trig_data@GOTPCREL(%rip), %rax
> >> - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
> >> + vpternlogd $0xff, %zmm1, %zmm1, %zmm14
> >
> > This one also seems to just be used by an vpandn later on:
> >
> > vpandnq %zmm13, %zmm13, %zmm14{%k1}
> >
> > so maybe:
> > vpternlogq $0xff, %zmm13, %zmm13, %zmm14{%k1}{z}
> > instead of the vpandn.
> >
> >> vmovups __dAbsMask(%rax), %zmm7
> >> vmovups __dInvPI(%rax), %zmm2
> >> vmovups __dRShifter(%rax), %zmm1
> >> @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx)
> >> vmovsd %xmm0, 1216(%rsp,%r15)
> >> jmp .LBL_2_7
> >> END (_ZGVeN8v_sin_skx)
> >> -
> >> - .section .rodata, "a"
> >> -.L_2il0floatpacket.14:
> >> - .long 0xffffffff,0xffffffff
> >> - .type .L_2il0floatpacket.14,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> >> index 965415f2bd..4ad366373b 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> >> @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx)
> >>
> >> /* SinPoly = SinR*SinPoly */
> >> vfmadd213pd %zmm5, %zmm5, %zmm4
> >> - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
> >> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3
> >
> >
> > Also vpandn below:
> > vpandnq %zmm7, %zmm7, %zmm3{%k1}
> >
> >>
> >> /* Update Cos result's sign */
> >> vxorpd %zmm2, %zmm1, %zmm1
> >> @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl)
> >> ENTRY (_ZGVeN8vvv_sincos_skx)
> >> WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
> >> END (_ZGVeN8vvv_sincos_skx)
> >> -
> >> - .section .rodata, "a"
> >> -.L_2il0floatpacket.15:
> >> - .long 0xffffffff,0xffffffff
> >> - .type .L_2il0floatpacket.15,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> >> index cdcb16087d..b7d79efb54 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> >> @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx)
> >> X = X - Y*PI1 - Y*PI2 - Y*PI3
> >> */
> >> vmovaps %zmm0, %zmm6
> >> - vmovups .L_2il0floatpacket.13(%rip), %zmm12
> >> + vpternlogd $0xff, %zmm12, %zmm12, %zmm12
> >
> >
> > Also vpandn below:
> > vpandnd %zmm1, %zmm1, %zmm12{%k1}
> >
> >>
> >> vmovups __sRShifter(%rax), %zmm3
> >> vmovups __sPI1_FMA(%rax), %zmm5
> >> vmovups __sA9_FMA(%rax), %zmm9
> >> @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx)
> >> vmovss %xmm0, 1216(%rsp,%r15,8)
> >> jmp .LBL_2_7
> >> END (_ZGVeN16v_cosf_skx)
> >> -
> >> - .section .rodata, "a"
> >> -.L_2il0floatpacket.13:
> >> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> - .type .L_2il0floatpacket.13,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> >> index 1b09909344..9f03b9b780 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> >> @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx)
> >> vmovaps %zmm0, %zmm7
> >>
> >> /* compare against threshold */
> >> - vmovups .L_2il0floatpacket.13(%rip), %zmm3
> >> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3
> >
> >
> > Also below:
> > vpandnd %zmm2, %zmm2, %zmm3{%k1}
> >>
> >> vmovups __sInvLn2(%rax), %zmm4
> >> vmovups __sShifter(%rax), %zmm1
> >> vmovups __sLn2hi(%rax), %zmm6
> >> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx)
> >> jmp .LBL_2_7
> >>
> >> END (_ZGVeN16v_expf_skx)
> >> -
> >> - .section .rodata, "a"
> >> -.L_2il0floatpacket.13:
> >> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> - .type .L_2il0floatpacket.13,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> >> index 4a7b2adbbf..2ba38b0f33 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> >> @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx)
> >> andq $-64, %rsp
> >> subq $1280, %rsp
> >> movq __svml_slog_data@GOTPCREL(%rip), %rax
> >> - vmovups .L_2il0floatpacket.7(%rip), %zmm6
> >> + vpternlogd $0xff, %zmm6, %zmm6, %zmm6
> >
> > Also below:
> > vpandnd %zmm1, %zmm1, %zmm6{%k1}
> >
> >>
> >> vmovups _iBrkValue(%rax), %zmm4
> >> vmovups _sPoly_7(%rax), %zmm8
> >>
> >> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx)
> >> jmp .LBL_2_7
> >>
> >> END (_ZGVeN16v_logf_skx)
> >> -
> >> - .section .rodata, "a"
> >> -.L_2il0floatpacket.7:
> >> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> - .type .L_2il0floatpacket.7,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> >> index 7f906622a5..7f0272c809 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> >> @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> >> vpsrlq $32, %zmm3, %zmm2
> >> vpmovqd %zmm2, %ymm11
> >> vcvtps2pd %ymm14, %zmm13
> >> - vmovups .L_2il0floatpacket.23(%rip), %zmm14
> >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> >
> > earlier
> > kxnorw %k3, %k3, %k3
> > can be used to get a dependency break with this:
> > vpmovm2d %k3, %zmm14
>
> The SVML codes can use some improvements. Can you
> open a separate glibc bug? I'd like to address only all 1s
> load here to avoid more complexity.
>
> Thanks.
>
Alright.
Okay with this patch.
>
> >> vmovaps %zmm14, %zmm26
> >> vpandd _ABSMASK(%rax), %zmm1, %zmm8
> >> vpcmpd $1, _INF(%rax), %zmm8, %k2
> >> @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> >> vpmovqd %zmm11, %ymm5
> >> vpxord %zmm10, %zmm10, %zmm10
> >> vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
> >> - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
> >> + vpternlogd $0xff, %zmm4, %zmm4, %zmm4
> >> vpxord %zmm11, %zmm11, %zmm11
> >> vcvtdq2pd %ymm7, %zmm7
> >> vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
> >> @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx)
> >> vmovss %xmm0, 1216(%rsp,%r15,8)
> >> jmp .LBL_2_7
> >> END (_ZGVeN16vv_powf_skx)
> >> -
> >> - .section .rodata, "a"
> >> -.L_2il0floatpacket.23:
> >> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> - .type .L_2il0floatpacket.23,@object
> >> -.L_2il0floatpacket.24:
> >> - .long 0xffffffff,0xffffffff
> >> - .type .L_2il0floatpacket.24,@object
> >> diff --git
> a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> >> index 54cee3a537..e1d0154441 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> >> @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx)
> >>
> >> /* Result sign calculations */
> >> vpternlogd $150, %zmm0, %zmm14, %zmm1
> >> - vmovups .L_2il0floatpacket.13(%rip), %zmm14
> >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> >>
> >> /* Add correction term 0.5 for cos() part */
> >> vaddps %zmm8, %zmm5, %zmm15
> >> @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl)
> >> ENTRY (_ZGVeN16vvv_sincosf_skx)
> >> WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
> >> END (_ZGVeN16vvv_sincosf_skx)
> >> -
> >> - .section .rodata, "a"
> >> -.L_2il0floatpacket.13:
> >> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> - .type .L_2il0floatpacket.13,@object
> >> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> >> index ec65ffdce5..bcb76ff756 100644
> >> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> >> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> >> @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx)
> >> movq __svml_s_trig_data@GOTPCREL(%rip), %rax
> >>
> >> /* Check for large and special values */
> >> - vmovups .L_2il0floatpacket.11(%rip), %zmm14
> >> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> >
> > Also below:
> > vpandnd %zmm2, %zmm2, %zmm14{%k1}
> >>
> >> vmovups __sAbsMask(%rax), %zmm5
> >> vmovups __sInvPI(%rax), %zmm1
> >> vmovups __sRShifter(%rax), %zmm2
> >> @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx)
> >> vmovss %xmm0, 1216(%rsp,%r15,8)
> >> jmp .LBL_2_7
> >> END (_ZGVeN16v_sinf_skx)
> >> -
> >> - .section .rodata, "a"
> >> -.L_2il0floatpacket.11:
> >> - .long
> 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> >> - .type .L_2il0floatpacket.11,@object
> >> --
> >> 2.31.1
> >>
>
>
> --
> H.J.
>
On Sat, Aug 21, 2021 at 9:37 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Optimize loads of all bits set into ZMM register in AVX512 SVML codes
> by replacing
>
> vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
>
> and
>
> vmovups .L_2il0floatpacket.13(%rip), %zmmX
>
> with
> vpternlogd $0xff, %zmmX, %zmmX, %zmmX
>
> This fixes BZ #28252.
> ---
> .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------
> .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
> .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------
> .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------
> 10 files changed, 11 insertions(+), 64 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> index c2cf007904..0fcb912557 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx)
> vmovaps %zmm0, %zmm8
>
> /* Check for large arguments path */
> - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
> + vpternlogd $0xff, %zmm2, %zmm2, %zmm2
>
> /*
> ARGUMENT RANGE REDUCTION:
> @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx)
> vmovsd %xmm0, 1216(%rsp,%r15)
> jmp .LBL_2_7
> END (_ZGVeN8v_cos_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.16:
> - .long 0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.16,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> index e9a5d00992..5596c950ce 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx)
>
> /* preserve mantissa, set input exponent to 2^(-10) */
> vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
> - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
> + vpternlogd $0xff, %zmm1, %zmm1, %zmm1
> vpsrlq $32, %zmm4, %zmm6
>
> /* reciprocal approximation good to at least 11 bits */
> @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx)
> vmovsd %xmm0, 1216(%rsp,%r15)
> jmp .LBL_2_7
> END (_ZGVeN8v_log_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.12:
> - .long 0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.12,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> index 508da563fe..2981f1582e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx)
> andq $-64, %rsp
> subq $1280, %rsp
> movq __svml_d_trig_data@GOTPCREL(%rip), %rax
> - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
> + vpternlogd $0xff, %zmm1, %zmm1, %zmm14
> vmovups __dAbsMask(%rax), %zmm7
> vmovups __dInvPI(%rax), %zmm2
> vmovups __dRShifter(%rax), %zmm1
> @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx)
> vmovsd %xmm0, 1216(%rsp,%r15)
> jmp .LBL_2_7
> END (_ZGVeN8v_sin_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.14:
> - .long 0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.14,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> index 965415f2bd..4ad366373b 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx)
>
> /* SinPoly = SinR*SinPoly */
> vfmadd213pd %zmm5, %zmm5, %zmm4
> - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3
>
> /* Update Cos result's sign */
> vxorpd %zmm2, %zmm1, %zmm1
> @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl)
> ENTRY (_ZGVeN8vvv_sincos_skx)
> WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
> END (_ZGVeN8vvv_sincos_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.15:
> - .long 0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.15,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> index cdcb16087d..b7d79efb54 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx)
> X = X - Y*PI1 - Y*PI2 - Y*PI3
> */
> vmovaps %zmm0, %zmm6
> - vmovups .L_2il0floatpacket.13(%rip), %zmm12
> + vpternlogd $0xff, %zmm12, %zmm12, %zmm12
> vmovups __sRShifter(%rax), %zmm3
> vmovups __sPI1_FMA(%rax), %zmm5
> vmovups __sA9_FMA(%rax), %zmm9
> @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx)
> vmovss %xmm0, 1216(%rsp,%r15,8)
> jmp .LBL_2_7
> END (_ZGVeN16v_cosf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.13:
> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> index 1b09909344..9f03b9b780 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx)
> vmovaps %zmm0, %zmm7
>
> /* compare against threshold */
> - vmovups .L_2il0floatpacket.13(%rip), %zmm3
> + vpternlogd $0xff, %zmm3, %zmm3, %zmm3
> vmovups __sInvLn2(%rax), %zmm4
> vmovups __sShifter(%rax), %zmm1
> vmovups __sLn2hi(%rax), %zmm6
> @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx)
> jmp .LBL_2_7
>
> END (_ZGVeN16v_expf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.13:
> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> index 4a7b2adbbf..2ba38b0f33 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx)
> andq $-64, %rsp
> subq $1280, %rsp
> movq __svml_slog_data@GOTPCREL(%rip), %rax
> - vmovups .L_2il0floatpacket.7(%rip), %zmm6
> + vpternlogd $0xff, %zmm6, %zmm6, %zmm6
> vmovups _iBrkValue(%rax), %zmm4
> vmovups _sPoly_7(%rax), %zmm8
>
> @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx)
> jmp .LBL_2_7
>
> END (_ZGVeN16v_logf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.7:
> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.7,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> index 7f906622a5..7f0272c809 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> vpsrlq $32, %zmm3, %zmm2
> vpmovqd %zmm2, %ymm11
> vcvtps2pd %ymm14, %zmm13
> - vmovups .L_2il0floatpacket.23(%rip), %zmm14
> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> vmovaps %zmm14, %zmm26
> vpandd _ABSMASK(%rax), %zmm1, %zmm8
> vpcmpd $1, _INF(%rax), %zmm8, %k2
> @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> vpmovqd %zmm11, %ymm5
> vpxord %zmm10, %zmm10, %zmm10
> vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
> - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
> + vpternlogd $0xff, %zmm4, %zmm4, %zmm4
> vpxord %zmm11, %zmm11, %zmm11
> vcvtdq2pd %ymm7, %zmm7
> vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
> @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx)
> vmovss %xmm0, 1216(%rsp,%r15,8)
> jmp .LBL_2_7
> END (_ZGVeN16vv_powf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.23:
> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.23,@object
> -.L_2il0floatpacket.24:
> - .long 0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.24,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> index 54cee3a537..e1d0154441 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx)
>
> /* Result sign calculations */
> vpternlogd $150, %zmm0, %zmm14, %zmm1
> - vmovups .L_2il0floatpacket.13(%rip), %zmm14
> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
>
> /* Add correction term 0.5 for cos() part */
> vaddps %zmm8, %zmm5, %zmm15
> @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl)
> ENTRY (_ZGVeN16vvv_sincosf_skx)
> WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
> END (_ZGVeN16vvv_sincosf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.13:
> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.13,@object
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> index ec65ffdce5..bcb76ff756 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx)
> movq __svml_s_trig_data@GOTPCREL(%rip), %rax
>
> /* Check for large and special values */
> - vmovups .L_2il0floatpacket.11(%rip), %zmm14
> + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> vmovups __sAbsMask(%rax), %zmm5
> vmovups __sInvPI(%rax), %zmm1
> vmovups __sRShifter(%rax), %zmm2
> @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx)
> vmovss %xmm0, 1216(%rsp,%r15,8)
> jmp .LBL_2_7
> END (_ZGVeN16v_sinf_skx)
> -
> - .section .rodata, "a"
> -.L_2il0floatpacket.11:
> - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> - .type .L_2il0floatpacket.11,@object
> --
> 2.31.1
>
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
On Fri, Apr 22, 2022 at 4:40 PM Sunil Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Sat, Aug 21, 2021 at 9:37 AM H.J. Lu via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > Optimize loads of all bits set into ZMM register in AVX512 SVML codes
> > by replacing
> >
> > vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
> >
> > and
> >
> > vmovups .L_2il0floatpacket.13(%rip), %zmmX
> >
> > with
> > vpternlogd $0xff, %zmmX, %zmmX, %zmmX
> >
> > This fixes BZ #28252.
> > ---
> > .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------
> > .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------
> > .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------
> > .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------
> > .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------
> > .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------
> > .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------
> > .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
> > .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------
> > .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------
> > 10 files changed, 11 insertions(+), 64 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> > index c2cf007904..0fcb912557 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
> > @@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx)
> > vmovaps %zmm0, %zmm8
> >
> > /* Check for large arguments path */
> > - vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
> > + vpternlogd $0xff, %zmm2, %zmm2, %zmm2
> >
> > /*
> > ARGUMENT RANGE REDUCTION:
> > @@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx)
> > vmovsd %xmm0, 1216(%rsp,%r15)
> > jmp .LBL_2_7
> > END (_ZGVeN8v_cos_skx)
> > -
> > - .section .rodata, "a"
> > -.L_2il0floatpacket.16:
> > - .long 0xffffffff,0xffffffff
> > - .type .L_2il0floatpacket.16,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> > index e9a5d00992..5596c950ce 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
> > @@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx)
> >
> > /* preserve mantissa, set input exponent to 2^(-10) */
> > vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
> > - vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
> > + vpternlogd $0xff, %zmm1, %zmm1, %zmm1
> > vpsrlq $32, %zmm4, %zmm6
> >
> > /* reciprocal approximation good to at least 11 bits */
> > @@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx)
> > vmovsd %xmm0, 1216(%rsp,%r15)
> > jmp .LBL_2_7
> > END (_ZGVeN8v_log_skx)
> > -
> > - .section .rodata, "a"
> > -.L_2il0floatpacket.12:
> > - .long 0xffffffff,0xffffffff
> > - .type .L_2il0floatpacket.12,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> > index 508da563fe..2981f1582e 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
> > @@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx)
> > andq $-64, %rsp
> > subq $1280, %rsp
> > movq __svml_d_trig_data@GOTPCREL(%rip), %rax
> > - vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
> > + vpternlogd $0xff, %zmm1, %zmm1, %zmm14
> > vmovups __dAbsMask(%rax), %zmm7
> > vmovups __dInvPI(%rax), %zmm2
> > vmovups __dRShifter(%rax), %zmm1
> > @@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx)
> > vmovsd %xmm0, 1216(%rsp,%r15)
> > jmp .LBL_2_7
> > END (_ZGVeN8v_sin_skx)
> > -
> > - .section .rodata, "a"
> > -.L_2il0floatpacket.14:
> > - .long 0xffffffff,0xffffffff
> > - .type .L_2il0floatpacket.14,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> > index 965415f2bd..4ad366373b 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
> > @@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx)
> >
> > /* SinPoly = SinR*SinPoly */
> > vfmadd213pd %zmm5, %zmm5, %zmm4
> > - vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
> > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3
> >
> > /* Update Cos result's sign */
> > vxorpd %zmm2, %zmm1, %zmm1
> > @@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl)
> > ENTRY (_ZGVeN8vvv_sincos_skx)
> > WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
> > END (_ZGVeN8vvv_sincos_skx)
> > -
> > - .section .rodata, "a"
> > -.L_2il0floatpacket.15:
> > - .long 0xffffffff,0xffffffff
> > - .type .L_2il0floatpacket.15,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> > index cdcb16087d..b7d79efb54 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
> > @@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx)
> > X = X - Y*PI1 - Y*PI2 - Y*PI3
> > */
> > vmovaps %zmm0, %zmm6
> > - vmovups .L_2il0floatpacket.13(%rip), %zmm12
> > + vpternlogd $0xff, %zmm12, %zmm12, %zmm12
> > vmovups __sRShifter(%rax), %zmm3
> > vmovups __sPI1_FMA(%rax), %zmm5
> > vmovups __sA9_FMA(%rax), %zmm9
> > @@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx)
> > vmovss %xmm0, 1216(%rsp,%r15,8)
> > jmp .LBL_2_7
> > END (_ZGVeN16v_cosf_skx)
> > -
> > - .section .rodata, "a"
> > -.L_2il0floatpacket.13:
> > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > - .type .L_2il0floatpacket.13,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> > index 1b09909344..9f03b9b780 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
> > @@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx)
> > vmovaps %zmm0, %zmm7
> >
> > /* compare against threshold */
> > - vmovups .L_2il0floatpacket.13(%rip), %zmm3
> > + vpternlogd $0xff, %zmm3, %zmm3, %zmm3
> > vmovups __sInvLn2(%rax), %zmm4
> > vmovups __sShifter(%rax), %zmm1
> > vmovups __sLn2hi(%rax), %zmm6
> > @@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx)
> > jmp .LBL_2_7
> >
> > END (_ZGVeN16v_expf_skx)
> > -
> > - .section .rodata, "a"
> > -.L_2il0floatpacket.13:
> > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > - .type .L_2il0floatpacket.13,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> > index 4a7b2adbbf..2ba38b0f33 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
> > @@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx)
> > andq $-64, %rsp
> > subq $1280, %rsp
> > movq __svml_slog_data@GOTPCREL(%rip), %rax
> > - vmovups .L_2il0floatpacket.7(%rip), %zmm6
> > + vpternlogd $0xff, %zmm6, %zmm6, %zmm6
> > vmovups _iBrkValue(%rax), %zmm4
> > vmovups _sPoly_7(%rax), %zmm8
> >
> > @@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx)
> > jmp .LBL_2_7
> >
> > END (_ZGVeN16v_logf_skx)
> > -
> > - .section .rodata, "a"
> > -.L_2il0floatpacket.7:
> > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > - .type .L_2il0floatpacket.7,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> > index 7f906622a5..7f0272c809 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
> > @@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> > vpsrlq $32, %zmm3, %zmm2
> > vpmovqd %zmm2, %ymm11
> > vcvtps2pd %ymm14, %zmm13
> > - vmovups .L_2il0floatpacket.23(%rip), %zmm14
> > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> > vmovaps %zmm14, %zmm26
> > vpandd _ABSMASK(%rax), %zmm1, %zmm8
> > vpcmpd $1, _INF(%rax), %zmm8, %k2
> > @@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
> > vpmovqd %zmm11, %ymm5
> > vpxord %zmm10, %zmm10, %zmm10
> > vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
> > - vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
> > + vpternlogd $0xff, %zmm4, %zmm4, %zmm4
> > vpxord %zmm11, %zmm11, %zmm11
> > vcvtdq2pd %ymm7, %zmm7
> > vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
> > @@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx)
> > vmovss %xmm0, 1216(%rsp,%r15,8)
> > jmp .LBL_2_7
> > END (_ZGVeN16vv_powf_skx)
> > -
> > - .section .rodata, "a"
> > -.L_2il0floatpacket.23:
> > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > - .type .L_2il0floatpacket.23,@object
> > -.L_2il0floatpacket.24:
> > - .long 0xffffffff,0xffffffff
> > - .type .L_2il0floatpacket.24,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> > index 54cee3a537..e1d0154441 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
> > @@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx)
> >
> > /* Result sign calculations */
> > vpternlogd $150, %zmm0, %zmm14, %zmm1
> > - vmovups .L_2il0floatpacket.13(%rip), %zmm14
> > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> >
> > /* Add correction term 0.5 for cos() part */
> > vaddps %zmm8, %zmm5, %zmm15
> > @@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl)
> > ENTRY (_ZGVeN16vvv_sincosf_skx)
> > WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
> > END (_ZGVeN16vvv_sincosf_skx)
> > -
> > - .section .rodata, "a"
> > -.L_2il0floatpacket.13:
> > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > - .type .L_2il0floatpacket.13,@object
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> > index ec65ffdce5..bcb76ff756 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
> > @@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx)
> > movq __svml_s_trig_data@GOTPCREL(%rip), %rax
> >
> > /* Check for large and special values */
> > - vmovups .L_2il0floatpacket.11(%rip), %zmm14
> > + vpternlogd $0xff, %zmm14, %zmm14, %zmm14
> > vmovups __sAbsMask(%rax), %zmm5
> > vmovups __sInvPI(%rax), %zmm1
> > vmovups __sRShifter(%rax), %zmm2
> > @@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx)
> > vmovss %xmm0, 1216(%rsp,%r15,8)
> > jmp .LBL_2_7
> > END (_ZGVeN16v_sinf_skx)
> > -
> > - .section .rodata, "a"
> > -.L_2il0floatpacket.11:
> > - .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
> > - .type .L_2il0floatpacket.11,@object
> > --
> > 2.31.1
> >
>
> I would like to backport this patch to release branches.
> Any comments or objections?
None by me
>
> --Sunil
@@ -258,7 +258,7 @@ ENTRY (_ZGVeN8v_cos_skx)
vmovaps %zmm0, %zmm8
/* Check for large arguments path */
- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
+ vpternlogd $0xff, %zmm2, %zmm2, %zmm2
/*
ARGUMENT RANGE REDUCTION:
@@ -448,8 +448,3 @@ ENTRY (_ZGVeN8v_cos_skx)
vmovsd %xmm0, 1216(%rsp,%r15)
jmp .LBL_2_7
END (_ZGVeN8v_cos_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.16:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.16,@object
@@ -267,7 +267,7 @@ ENTRY (_ZGVeN8v_log_skx)
/* preserve mantissa, set input exponent to 2^(-10) */
vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm1
vpsrlq $32, %zmm4, %zmm6
/* reciprocal approximation good to at least 11 bits */
@@ -453,8 +453,3 @@ ENTRY (_ZGVeN8v_log_skx)
vmovsd %xmm0, 1216(%rsp,%r15)
jmp .LBL_2_7
END (_ZGVeN8v_log_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.12:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.12,@object
@@ -254,7 +254,7 @@ ENTRY (_ZGVeN8v_sin_skx)
andq $-64, %rsp
subq $1280, %rsp
movq __svml_d_trig_data@GOTPCREL(%rip), %rax
- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm14
vmovups __dAbsMask(%rax), %zmm7
vmovups __dInvPI(%rax), %zmm2
vmovups __dRShifter(%rax), %zmm1
@@ -450,8 +450,3 @@ ENTRY (_ZGVeN8v_sin_skx)
vmovsd %xmm0, 1216(%rsp,%r15)
jmp .LBL_2_7
END (_ZGVeN8v_sin_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.14:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.14,@object
@@ -423,7 +423,7 @@ ENTRY (_ZGVeN8vl8l8_sincos_skx)
/* SinPoly = SinR*SinPoly */
vfmadd213pd %zmm5, %zmm5, %zmm4
- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
/* Update Cos result's sign */
vxorpd %zmm2, %zmm1, %zmm1
@@ -733,8 +733,3 @@ END (_ZGVeN8vvv_sincos_knl)
ENTRY (_ZGVeN8vvv_sincos_skx)
WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
END (_ZGVeN8vvv_sincos_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.15:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.15,@object
@@ -271,7 +271,7 @@ ENTRY (_ZGVeN16v_cosf_skx)
X = X - Y*PI1 - Y*PI2 - Y*PI3
*/
vmovaps %zmm0, %zmm6
- vmovups .L_2il0floatpacket.13(%rip), %zmm12
+ vpternlogd $0xff, %zmm12, %zmm12, %zmm12
vmovups __sRShifter(%rax), %zmm3
vmovups __sPI1_FMA(%rax), %zmm5
vmovups __sA9_FMA(%rax), %zmm9
@@ -445,8 +445,3 @@ ENTRY (_ZGVeN16v_cosf_skx)
vmovss %xmm0, 1216(%rsp,%r15,8)
jmp .LBL_2_7
END (_ZGVeN16v_cosf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.13:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.13,@object
@@ -257,7 +257,7 @@ ENTRY (_ZGVeN16v_expf_skx)
vmovaps %zmm0, %zmm7
/* compare against threshold */
- vmovups .L_2il0floatpacket.13(%rip), %zmm3
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
vmovups __sInvLn2(%rax), %zmm4
vmovups __sShifter(%rax), %zmm1
vmovups __sLn2hi(%rax), %zmm6
@@ -432,8 +432,3 @@ ENTRY (_ZGVeN16v_expf_skx)
jmp .LBL_2_7
END (_ZGVeN16v_expf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.13:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.13,@object
@@ -228,7 +228,7 @@ ENTRY (_ZGVeN16v_logf_skx)
andq $-64, %rsp
subq $1280, %rsp
movq __svml_slog_data@GOTPCREL(%rip), %rax
- vmovups .L_2il0floatpacket.7(%rip), %zmm6
+ vpternlogd $0xff, %zmm6, %zmm6, %zmm6
vmovups _iBrkValue(%rax), %zmm4
vmovups _sPoly_7(%rax), %zmm8
@@ -401,8 +401,3 @@ ENTRY (_ZGVeN16v_logf_skx)
jmp .LBL_2_7
END (_ZGVeN16v_logf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.7:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.7,@object
@@ -378,7 +378,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
vpsrlq $32, %zmm3, %zmm2
vpmovqd %zmm2, %ymm11
vcvtps2pd %ymm14, %zmm13
- vmovups .L_2il0floatpacket.23(%rip), %zmm14
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
vmovaps %zmm14, %zmm26
vpandd _ABSMASK(%rax), %zmm1, %zmm8
vpcmpd $1, _INF(%rax), %zmm8, %k2
@@ -420,7 +420,7 @@ ENTRY (_ZGVeN16vv_powf_skx)
vpmovqd %zmm11, %ymm5
vpxord %zmm10, %zmm10, %zmm10
vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
+ vpternlogd $0xff, %zmm4, %zmm4, %zmm4
vpxord %zmm11, %zmm11, %zmm11
vcvtdq2pd %ymm7, %zmm7
vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
@@ -635,11 +635,3 @@ ENTRY (_ZGVeN16vv_powf_skx)
vmovss %xmm0, 1216(%rsp,%r15,8)
jmp .LBL_2_7
END (_ZGVeN16vv_powf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.23:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.23,@object
-.L_2il0floatpacket.24:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.24,@object
@@ -310,7 +310,7 @@ ENTRY (_ZGVeN16vl4l4_sincosf_skx)
/* Result sign calculations */
vpternlogd $150, %zmm0, %zmm14, %zmm1
- vmovups .L_2il0floatpacket.13(%rip), %zmm14
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
/* Add correction term 0.5 for cos() part */
vaddps %zmm8, %zmm5, %zmm15
@@ -740,8 +740,3 @@ END (_ZGVeN16vvv_sincosf_knl)
ENTRY (_ZGVeN16vvv_sincosf_skx)
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
END (_ZGVeN16vvv_sincosf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.13:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.13,@object
@@ -273,7 +273,7 @@ ENTRY (_ZGVeN16v_sinf_skx)
movq __svml_s_trig_data@GOTPCREL(%rip), %rax
/* Check for large and special values */
- vmovups .L_2il0floatpacket.11(%rip), %zmm14
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
vmovups __sAbsMask(%rax), %zmm5
vmovups __sInvPI(%rax), %zmm1
vmovups __sRShifter(%rax), %zmm2
@@ -464,8 +464,3 @@ ENTRY (_ZGVeN16v_sinf_skx)
vmovss %xmm0, 1216(%rsp,%r15,8)
jmp .LBL_2_7
END (_ZGVeN16v_sinf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.11:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.11,@object