[v2,2/6] x86-64: Require BMI2 for AVX2 str*cmp and wcs(n)cmp implementations
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
Commit Message
The AVX2 str*cmp and wcs(n)cmp implementations use the 'bzhi'
instruction, which belongs to the BMI2 CPU feature.
NB: It also uses the 'tzcnt' BMI1 instruction, but it is executed as BSF
as BSF if the CPU doesn't support TZCNT, and produces the same result
for non-zero input.
Fixes: b77b06e0e296 ("x86: Optimize strcmp-avx2.S")
Partially resolves: BZ #29611
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 47 +++++++++++++++------
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 1 +
sysdeps/x86_64/multiarch/strcmp.c | 4 +-
sysdeps/x86_64/multiarch/strncmp.c | 4 +-
4 files changed, 39 insertions(+), 17 deletions(-)
Comments
On Sun, Oct 2, 2022 at 8:34 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
>
> The AVX2 str*cmp and wcs(n)cmp implementations use the 'bzhi'
> instruction, which belongs to the BMI2 CPU feature.
>
> NB: It also uses the 'tzcnt' BMI1 instruction, but it is executed as BSF
> as BSF if the CPU doesn't support TZCNT, and produces the same result
> for non-zero input.
>
> Fixes: b77b06e0e296 ("x86: Optimize strcmp-avx2.S")
> Partially resolves: BZ #29611
> ---
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 47 +++++++++++++++------
> sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 1 +
> sysdeps/x86_64/multiarch/strcmp.c | 4 +-
> sysdeps/x86_64/multiarch/strncmp.c | 4 +-
> 4 files changed, 39 insertions(+), 17 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a71444eccb..fec8790c11 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -448,13 +448,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL (i, name, strcasecmp,
> X86_IFUNC_IMPL_ADD_V4 (array, i, strcasecmp,
> (CPU_FEATURE_USABLE (AVX512VL)
> - && CPU_FEATURE_USABLE (AVX512BW)),
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> __strcasecmp_evex)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
> - CPU_FEATURE_USABLE (AVX2),
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)),
> __strcasecmp_avx2)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
> (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)
> && CPU_FEATURE_USABLE (RTM)),
> __strcasecmp_avx2_rtm)
> X86_IFUNC_IMPL_ADD_V2 (array, i, strcasecmp,
> @@ -470,13 +473,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL (i, name, strcasecmp_l,
> X86_IFUNC_IMPL_ADD_V4 (array, i, strcasecmp,
> (CPU_FEATURE_USABLE (AVX512VL)
> - && CPU_FEATURE_USABLE (AVX512BW)),
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> __strcasecmp_l_evex)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
> - CPU_FEATURE_USABLE (AVX2),
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)),
> __strcasecmp_l_avx2)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
> (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)
> && CPU_FEATURE_USABLE (RTM)),
> __strcasecmp_l_avx2_rtm)
> X86_IFUNC_IMPL_ADD_V2 (array, i, strcasecmp_l,
> @@ -585,10 +591,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (BMI2)),
> __strcmp_evex)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strcmp,
> - CPU_FEATURE_USABLE (AVX2),
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)),
> __strcmp_avx2)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strcmp,
> (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)
> && CPU_FEATURE_USABLE (RTM)),
> __strcmp_avx2_rtm)
> X86_IFUNC_IMPL_ADD_V2 (array, i, strcmp,
> @@ -638,13 +646,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL (i, name, strncasecmp,
> X86_IFUNC_IMPL_ADD_V4 (array, i, strncasecmp,
> (CPU_FEATURE_USABLE (AVX512VL)
> - && CPU_FEATURE_USABLE (AVX512BW)),
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> __strncasecmp_evex)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
> - CPU_FEATURE_USABLE (AVX2),
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)),
> __strncasecmp_avx2)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
> (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)
> && CPU_FEATURE_USABLE (RTM)),
> __strncasecmp_avx2_rtm)
> X86_IFUNC_IMPL_ADD_V2 (array, i, strncasecmp,
> @@ -660,13 +671,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL (i, name, strncasecmp_l,
> X86_IFUNC_IMPL_ADD_V4 (array, i, strncasecmp,
> (CPU_FEATURE_USABLE (AVX512VL)
> - && CPU_FEATURE_USABLE (AVX512BW)),
> + & CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> __strncasecmp_l_evex)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
> - CPU_FEATURE_USABLE (AVX2),
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)),
> __strncasecmp_l_avx2)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
> (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)
> && CPU_FEATURE_USABLE (RTM)),
> __strncasecmp_l_avx2_rtm)
> X86_IFUNC_IMPL_ADD_V2 (array, i, strncasecmp_l,
> @@ -796,10 +810,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (BMI2)),
> __wcscmp_evex)
> X86_IFUNC_IMPL_ADD_V3 (array, i, wcscmp,
> - CPU_FEATURE_USABLE (AVX2),
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)),
> __wcscmp_avx2)
> X86_IFUNC_IMPL_ADD_V3 (array, i, wcscmp,
> (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)
> && CPU_FEATURE_USABLE (RTM)),
> __wcscmp_avx2_rtm)
> /* ISA V2 wrapper for SSE2 implementation because the SSE2
> @@ -816,10 +832,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (BMI2)),
> __wcsncmp_evex)
> X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncmp,
> - CPU_FEATURE_USABLE (AVX2),
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)),
> __wcsncmp_avx2)
> X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncmp,
> (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)
> && CPU_FEATURE_USABLE (RTM)),
> __wcsncmp_avx2_rtm)
> /* ISA V2 wrapper for GENERIC implementation because the
> @@ -1162,13 +1180,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL (i, name, strncmp,
> X86_IFUNC_IMPL_ADD_V4 (array, i, strncmp,
> (CPU_FEATURE_USABLE (AVX512VL)
> - && CPU_FEATURE_USABLE (AVX512BW)),
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> __strncmp_evex)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strncmp,
> - CPU_FEATURE_USABLE (AVX2),
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)),
> __strncmp_avx2)
> X86_IFUNC_IMPL_ADD_V3 (array, i, strncmp,
> (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI2)
> && CPU_FEATURE_USABLE (RTM)),
> __strncmp_avx2_rtm)
> X86_IFUNC_IMPL_ADD_V2 (array, i, strncmp,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index 68646ef199..7622af259c 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -34,6 +34,7 @@ IFUNC_SELECTOR (void)
> const struct cpu_features *cpu_features = __get_cpu_features ();
>
> if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> AVX_Fast_Unaligned_Load, ))
> {
> diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
> index fdd5afe3af..9d6c9f66ba 100644
> --- a/sysdeps/x86_64/multiarch/strcmp.c
> +++ b/sysdeps/x86_64/multiarch/strcmp.c
> @@ -45,12 +45,12 @@ IFUNC_SELECTOR (void)
> const struct cpu_features *cpu_features = __get_cpu_features ();
>
> if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> AVX_Fast_Unaligned_Load, ))
> {
> if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> - && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> - && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> return OPTIMIZE (evex);
>
> if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
> index 4ebe4bde30..c4f8b6bbb5 100644
> --- a/sysdeps/x86_64/multiarch/strncmp.c
> +++ b/sysdeps/x86_64/multiarch/strncmp.c
> @@ -41,12 +41,12 @@ IFUNC_SELECTOR (void)
> const struct cpu_features *cpu_features = __get_cpu_features ();
>
> if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> AVX_Fast_Unaligned_Load, ))
> {
> if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> - && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> - && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> return OPTIMIZE (evex);
>
> if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> --
> 2.35.1
>
LGTM.
Please separate this patch into 4 separate patches.
Patch1: sysdeps/x86_64/multiarch/strncmp.c
Patch2: sysdeps/x86_64/multiarch/strcmp.c
Patch3: sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
Patch4: sysdeps/x86_64/multiarch/ifunc-impl-list.c
Rest of them looks OK to me.
On Sun, Oct 2, 2022 at 2:08 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Sun, Oct 2, 2022 at 8:34 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
> >
> > The AVX2 str*cmp and wcs(n)cmp implementations use the 'bzhi'
> > instruction, which belongs to the BMI2 CPU feature.
> >
> > NB: It also uses the 'tzcnt' BMI1 instruction, but it is executed as BSF
> > as BSF if the CPU doesn't support TZCNT, and produces the same result
> > for non-zero input.
> >
> > Fixes: b77b06e0e296 ("x86: Optimize strcmp-avx2.S")
> > Partially resolves: BZ #29611
> > ---
> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 47 +++++++++++++++------
> > sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 1 +
> > sysdeps/x86_64/multiarch/strcmp.c | 4 +-
> > sysdeps/x86_64/multiarch/strncmp.c | 4 +-
> > 4 files changed, 39 insertions(+), 17 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a71444eccb..fec8790c11 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -448,13 +448,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > IFUNC_IMPL (i, name, strcasecmp,
> > X86_IFUNC_IMPL_ADD_V4 (array, i, strcasecmp,
> > (CPU_FEATURE_USABLE (AVX512VL)
> > - && CPU_FEATURE_USABLE (AVX512BW)),
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __strcasecmp_evex)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
> > - CPU_FEATURE_USABLE (AVX2),
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __strcasecmp_avx2)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
> > (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __strcasecmp_avx2_rtm)
> > X86_IFUNC_IMPL_ADD_V2 (array, i, strcasecmp,
> > @@ -470,13 +473,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > IFUNC_IMPL (i, name, strcasecmp_l,
> > X86_IFUNC_IMPL_ADD_V4 (array, i, strcasecmp,
> > (CPU_FEATURE_USABLE (AVX512VL)
> > - && CPU_FEATURE_USABLE (AVX512BW)),
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __strcasecmp_l_evex)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
> > - CPU_FEATURE_USABLE (AVX2),
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __strcasecmp_l_avx2)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
> > (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __strcasecmp_l_avx2_rtm)
> > X86_IFUNC_IMPL_ADD_V2 (array, i, strcasecmp_l,
> > @@ -585,10 +591,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (BMI2)),
> > __strcmp_evex)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strcmp,
> > - CPU_FEATURE_USABLE (AVX2),
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __strcmp_avx2)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strcmp,
> > (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __strcmp_avx2_rtm)
> > X86_IFUNC_IMPL_ADD_V2 (array, i, strcmp,
> > @@ -638,13 +646,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > IFUNC_IMPL (i, name, strncasecmp,
> > X86_IFUNC_IMPL_ADD_V4 (array, i, strncasecmp,
> > (CPU_FEATURE_USABLE (AVX512VL)
> > - && CPU_FEATURE_USABLE (AVX512BW)),
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __strncasecmp_evex)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
> > - CPU_FEATURE_USABLE (AVX2),
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __strncasecmp_avx2)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
> > (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __strncasecmp_avx2_rtm)
> > X86_IFUNC_IMPL_ADD_V2 (array, i, strncasecmp,
> > @@ -660,13 +671,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > IFUNC_IMPL (i, name, strncasecmp_l,
> > X86_IFUNC_IMPL_ADD_V4 (array, i, strncasecmp,
> > (CPU_FEATURE_USABLE (AVX512VL)
> > - && CPU_FEATURE_USABLE (AVX512BW)),
> > + & CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __strncasecmp_l_evex)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
> > - CPU_FEATURE_USABLE (AVX2),
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __strncasecmp_l_avx2)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
> > (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __strncasecmp_l_avx2_rtm)
> > X86_IFUNC_IMPL_ADD_V2 (array, i, strncasecmp_l,
> > @@ -796,10 +810,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (BMI2)),
> > __wcscmp_evex)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, wcscmp,
> > - CPU_FEATURE_USABLE (AVX2),
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __wcscmp_avx2)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, wcscmp,
> > (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __wcscmp_avx2_rtm)
> > /* ISA V2 wrapper for SSE2 implementation because the SSE2
> > @@ -816,10 +832,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (BMI2)),
> > __wcsncmp_evex)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncmp,
> > - CPU_FEATURE_USABLE (AVX2),
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __wcsncmp_avx2)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncmp,
> > (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __wcsncmp_avx2_rtm)
> > /* ISA V2 wrapper for GENERIC implementation because the
> > @@ -1162,13 +1180,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > IFUNC_IMPL (i, name, strncmp,
> > X86_IFUNC_IMPL_ADD_V4 (array, i, strncmp,
> > (CPU_FEATURE_USABLE (AVX512VL)
> > - && CPU_FEATURE_USABLE (AVX512BW)),
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __strncmp_evex)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strncmp,
> > - CPU_FEATURE_USABLE (AVX2),
> > + (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > __strncmp_avx2)
> > X86_IFUNC_IMPL_ADD_V3 (array, i, strncmp,
> > (CPU_FEATURE_USABLE (AVX2)
> > + && CPU_FEATURE_USABLE (BMI2)
> > && CPU_FEATURE_USABLE (RTM)),
> > __strncmp_avx2_rtm)
> > X86_IFUNC_IMPL_ADD_V2 (array, i, strncmp,
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > index 68646ef199..7622af259c 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > @@ -34,6 +34,7 @@ IFUNC_SELECTOR (void)
> > const struct cpu_features *cpu_features = __get_cpu_features ();
> >
> > if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> > AVX_Fast_Unaligned_Load, ))
> > {
> > diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
> > index fdd5afe3af..9d6c9f66ba 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp.c
> > +++ b/sysdeps/x86_64/multiarch/strcmp.c
> > @@ -45,12 +45,12 @@ IFUNC_SELECTOR (void)
> > const struct cpu_features *cpu_features = __get_cpu_features ();
> >
> > if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> > AVX_Fast_Unaligned_Load, ))
> > {
> > if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > - && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > - && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > return OPTIMIZE (evex);
> >
> > if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
> > index 4ebe4bde30..c4f8b6bbb5 100644
> > --- a/sysdeps/x86_64/multiarch/strncmp.c
> > +++ b/sysdeps/x86_64/multiarch/strncmp.c
> > @@ -41,12 +41,12 @@ IFUNC_SELECTOR (void)
> > const struct cpu_features *cpu_features = __get_cpu_features ();
> >
> > if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
> > && X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> > AVX_Fast_Unaligned_Load, ))
> > {
> > if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > - && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > - && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > return OPTIMIZE (evex);
> >
> > if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > --
> > 2.35.1
> >
>
> LGTM.
On 2022-10-03 09:19, Sunil Pandey via Libc-alpha wrote:
> Please separate this patch into 4 separate patches.
>
> Patch1: sysdeps/x86_64/multiarch/strncmp.c
> Patch2: sysdeps/x86_64/multiarch/strcmp.c
> Patch3: sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> Patch4: sysdeps/x86_64/multiarch/ifunc-impl-list.c
>
> Rest of them looks OK to me.
I don't fully see the point of doing that, but i'll do it in the next
version.
On Mon, Oct 3, 2022 at 10:35 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
>
> On 2022-10-03 09:19, Sunil Pandey via Libc-alpha wrote:
> > Please separate this patch into 4 separate patches.
> >
> > Patch1: sysdeps/x86_64/multiarch/strncmp.c
> > Patch2: sysdeps/x86_64/multiarch/strcmp.c
> > Patch3: sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > Patch4: sysdeps/x86_64/multiarch/ifunc-impl-list.c
> >
> > Rest of them looks OK to me.
>
> I don't fully see the point of doing that, but i'll do it in the next
> version.
I think to make backporting easier.
>
> --
> Aurelien Jarno GPG: 4096R/1DDD8C9B
> aurelien@aurel32.net http://www.aurel32.net
On Mon, Oct 3, 2022 at 10:50 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Oct 3, 2022 at 10:35 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
> >
> > On 2022-10-03 09:19, Sunil Pandey via Libc-alpha wrote:
> > > Please separate this patch into 4 separate patches.
> > >
> > > Patch1: sysdeps/x86_64/multiarch/strncmp.c
> > > Patch2: sysdeps/x86_64/multiarch/strcmp.c
> > > Patch3: sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > Patch4: sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > >
> > > Rest of them looks OK to me.
> >
> > I don't fully see the point of doing that, but i'll do it in the next
> > version.
>
> I think to make backporting easier.
Exactly.
If you look closely, this patch combines 4 independent ifunc
functionality in one single patch.
As per latest backporting guideline, backport patches must apply
cleanly without any change. Having small independent patches
makes backporting a little easier.
> >
> > --
> > Aurelien Jarno GPG: 4096R/1DDD8C9B
> > aurelien@aurel32.net http://www.aurel32.net
On 2022-10-03 11:43, Sunil Pandey wrote:
> On Mon, Oct 3, 2022 at 10:50 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Oct 3, 2022 at 10:35 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
> > >
> > > On 2022-10-03 09:19, Sunil Pandey via Libc-alpha wrote:
> > > > Please separate this patch into 4 separate patches.
> > > >
> > > > Patch1: sysdeps/x86_64/multiarch/strncmp.c
> > > > Patch2: sysdeps/x86_64/multiarch/strcmp.c
> > > > Patch3: sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > > Patch4: sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > >
> > > > Rest of them looks OK to me.
> > >
> > > I don't fully see the point of doing that, but i'll do it in the next
> > > version.
> >
> > I think to make backporting easier.
>
> Exactly.
>
> If you look closely, this patch combines 4 independent ifunc
> functionality in one single patch.
It's partially true. I actually count 3 different functionalities. The
changes in ifunc-impl-list are all related to the other 3.
> As per latest backporting guideline, backport patches must apply
> cleanly without any change. Having small independent patches
> makes backporting a little easier.
>
I agree with that when we are talking about backporting improvements. In
that case we have to backport the changes to fix the bug, even if they
don't apply cleanly.
On 2022-10-03 21:21, Aurelien Jarno wrote:
> On 2022-10-03 11:43, Sunil Pandey wrote:
> > On Mon, Oct 3, 2022 at 10:50 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Mon, Oct 3, 2022 at 10:35 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
> > > >
> > > > On 2022-10-03 09:19, Sunil Pandey via Libc-alpha wrote:
> > > > > Please separate this patch into 4 separate patches.
> > > > >
> > > > > Patch1: sysdeps/x86_64/multiarch/strncmp.c
> > > > > Patch2: sysdeps/x86_64/multiarch/strcmp.c
> > > > > Patch3: sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > > > Patch4: sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > >
> > > > > Rest of them looks OK to me.
> > > >
> > > > I don't fully see the point of doing that, but i'll do it in the next
> > > > version.
> > >
> > > I think to make backporting easier.
> >
> > Exactly.
> >
> > If you look closely, this patch combines 4 independent ifunc
> > functionality in one single patch.
>
> It's partially true. I actually count 3 different functionalities. The
> changes in ifunc-impl-list are all related to the other 3.
I have just sent a v3 that split that patch in smaller pieces:
- str(n)casecmp
- strcmp
- strncmp
- wcs(n)cmp
I do not see the point in separating the changes from ifunc-impl-list.c,
as we should actually ensure that they are in sync with the ifunc
selector so that the testing is done properly.
On Mon, Oct 3, 2022 at 12:59 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
>
> On 2022-10-03 21:21, Aurelien Jarno wrote:
> > On 2022-10-03 11:43, Sunil Pandey wrote:
> > > On Mon, Oct 3, 2022 at 10:50 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Mon, Oct 3, 2022 at 10:35 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
> > > > >
> > > > > On 2022-10-03 09:19, Sunil Pandey via Libc-alpha wrote:
> > > > > > Please separate this patch into 4 separate patches.
> > > > > >
> > > > > > Patch1: sysdeps/x86_64/multiarch/strncmp.c
> > > > > > Patch2: sysdeps/x86_64/multiarch/strcmp.c
> > > > > > Patch3: sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> > > > > > Patch4: sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > >
> > > > > > Rest of them looks OK to me.
> > > > >
> > > > > I don't fully see the point of doing that, but i'll do it in the next
> > > > > version.
> > > >
> > > > I think to make backporting easier.
> > >
> > > Exactly.
> > >
> > > If you look closely, this patch combines 4 independent ifunc
> > > functionality in one single patch.
> >
> > It's partially true. I actually count 3 different functionalities. The
> > changes in ifunc-impl-list are all related to the other 3.
>
> I have just sent a v3 that split that patch in smaller pieces:
> - str(n)casecmp
> - strcmp
> - strncmp
> - wcs(n)cmp
>
> I do not see the point in separating the changes from ifunc-impl-list.c,
> as we should actually ensure that they are in sync with the ifunc
> selector so that the testing is done properly.
My feedback was for the specific patch for your previous arrangement.
After you rearrange in v3, it looks ok.
>
> --
> Aurelien Jarno GPG: 4096R/1DDD8C9B
> aurelien@aurel32.net http://www.aurel32.net
@@ -448,13 +448,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strcasecmp,
X86_IFUNC_IMPL_ADD_V4 (array, i, strcasecmp,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strcasecmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strcasecmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strcasecmp_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strcasecmp,
@@ -470,13 +473,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strcasecmp_l,
X86_IFUNC_IMPL_ADD_V4 (array, i, strcasecmp,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strcasecmp_l_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strcasecmp_l_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcasecmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strcasecmp_l_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strcasecmp_l,
@@ -585,10 +591,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)),
__strcmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcmp,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strcmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strcmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strcmp_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strcmp,
@@ -638,13 +646,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strncasecmp,
X86_IFUNC_IMPL_ADD_V4 (array, i, strncasecmp,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strncasecmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strncasecmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strncasecmp_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strncasecmp,
@@ -660,13 +671,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strncasecmp_l,
X86_IFUNC_IMPL_ADD_V4 (array, i, strncasecmp,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ & CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strncasecmp_l_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strncasecmp_l_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncasecmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strncasecmp_l_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strncasecmp_l,
@@ -796,10 +810,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)),
__wcscmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcscmp,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__wcscmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcscmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcscmp_avx2_rtm)
/* ISA V2 wrapper for SSE2 implementation because the SSE2
@@ -816,10 +832,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)),
__wcsncmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncmp,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__wcsncmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcsncmp_avx2_rtm)
/* ISA V2 wrapper for GENERIC implementation because the
@@ -1162,13 +1180,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, strncmp,
X86_IFUNC_IMPL_ADD_V4 (array, i, strncmp,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strncmp_evex)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncmp,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strncmp_avx2)
X86_IFUNC_IMPL_ADD_V3 (array, i, strncmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strncmp_avx2_rtm)
X86_IFUNC_IMPL_ADD_V2 (array, i, strncmp,
@@ -34,6 +34,7 @@ IFUNC_SELECTOR (void)
const struct cpu_features *cpu_features = __get_cpu_features ();
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
AVX_Fast_Unaligned_Load, ))
{
@@ -45,12 +45,12 @@ IFUNC_SELECTOR (void)
const struct cpu_features *cpu_features = __get_cpu_features ();
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
AVX_Fast_Unaligned_Load, ))
{
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
@@ -41,12 +41,12 @@ IFUNC_SELECTOR (void)
const struct cpu_features *cpu_features = __get_cpu_features ();
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
AVX_Fast_Unaligned_Load, ))
{
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))