[v1,21/23] x86: Add AVX2 optimized str{n}casecmp
Checks
Context: dj/TryBot-apply_patch
Check: success
Description: Patch applied to master at the time it was sent
Commit Message
geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, AVX2 Time / SSE42 Time
1, 1, 1, 127, 1.032
2, 2, 2, 127, 1.006
3, 3, 3, 127, 1.009
4, 4, 4, 127, 0.964
5, 5, 5, 127, 0.929
6, 6, 6, 127, 0.94
7, 7, 7, 127, 0.958
8, 0, 0, 127, 0.988
9, 1, 1, 127, 0.99
10, 2, 2, 127, 0.995
11, 3, 3, 127, 0.991
12, 4, 4, 127, 0.975
13, 5, 5, 127, 0.943
14, 6, 6, 127, 0.955
15, 7, 7, 127, 0.988
4, 0, 0, 127, 0.983
4, 0, 0, 254, 0.978
8, 0, 0, 254, 0.989
16, 0, 0, 127, 0.792
16, 0, 0, 254, 0.774
32, 0, 0, 127, 0.568
32, 0, 0, 254, 0.555
64, 0, 0, 127, 0.561
64, 0, 0, 254, 0.561
128, 0, 0, 127, 0.574
128, 0, 0, 254, 0.577
256, 0, 0, 127, 0.561
256, 0, 0, 254, 0.552
512, 0, 0, 127, 0.59
512, 0, 0, 254, 0.594
1024, 0, 0, 127, 0.528
1024, 0, 0, 254, 0.517
16, 1, 2, 127, 0.758
16, 2, 1, 254, 0.748
32, 2, 4, 127, 0.419
32, 4, 2, 254, 0.428
64, 3, 6, 127, 0.472
64, 6, 3, 254, 0.464
128, 4, 0, 127, 0.534
128, 0, 4, 254, 0.53
256, 5, 2, 127, 0.679
256, 2, 5, 254, 0.676
512, 6, 4, 127, 0.525
512, 4, 6, 254, 0.523
1024, 7, 6, 127, 0.518
1024, 6, 7, 254, 0.505
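
For reference, the headline 0.702 figure is the geometric mean of the per-benchmark AVX2/SSE42 time ratios above; a minimal standalone sketch of that reduction (illustrative only, not glibc benchtest code, and the ratios array is just a hand-picked subset of the table):

#include <math.h>
#include <stdio.h>

/* Geometric mean of AVX2/SSE42 time ratios, computed in log space so
   many sub-1.0 factors do not underflow.  */
static double
geometric_mean (const double *ratios, size_t n)
{
  double log_sum = 0.0;
  for (size_t i = 0; i < n; i++)
    log_sum += log (ratios[i]);
  return exp (log_sum / n);
}

int
main (void)
{
  /* Hand-picked subset of the ratio column above, for illustration.  */
  static const double ratios[] = { 1.032, 0.964, 0.792, 0.568, 0.528 };
  printf ("geomean: %.3f\n",
          geometric_mean (ratios, sizeof ratios / sizeof ratios[0]));
  return 0;
}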
sysdeps/x86_64/multiarch/Makefile | 4 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 +
.../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++
sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++
sysdeps/x86_64/multiarch/strcmp-avx2.S | 230 +++++++++++++++---
.../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++
sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++
8 files changed, 324 insertions(+), 31 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
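
The case conversion itself is done by the new TOLOWER macro in strcmp-avx2.S (quoted in full below): both inputs are lowercased branchlessly using the L(lcase_min)/L(lcase_max)/L(case_add) constants (0x3f, 0x99 and 0x20 replicated across each byte). A minimal scalar C sketch of the same per-byte trick (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the per-byte TOLOWER step in strcmp-avx2.S: adding
   0x3f moves 'A'..'Z' (0x41..0x5a) to 0x80..0x99, the only byte values
   that compare <= (int8_t) 0x99 as signed bytes, so one signed compare
   isolates uppercase ASCII and 0x20 is added only to those bytes.  */
static uint8_t
tolower_ascii (uint8_t c)
{
  int8_t shifted = (int8_t) (uint8_t) (c + 0x3f);   /* vpaddb   lcase_min */
  uint8_t not_upper = shifted > (int8_t) 0x99       /* vpcmpgtb lcase_max */
                      ? 0xff : 0x00;
  uint8_t to_add = (uint8_t) (~not_upper & 0x20);   /* vpandn   case_add  */
  return (uint8_t) (c + to_add);                    /* vpaddb             */
}

int
main (void)
{
  printf ("%c %c %c\n", tolower_ascii ('A'), tolower_ascii ('z'),
          tolower_ascii ('0'));   /* prints: a z 0 */
  return 0;
}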
Comments
On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, AVX2 Time / SSE42 Time
> 1, 1, 1, 127, 1.032
> 2, 2, 2, 127, 1.006
> 3, 3, 3, 127, 1.009
> 4, 4, 4, 127, 0.964
> 5, 5, 5, 127, 0.929
> 6, 6, 6, 127, 0.94
> 7, 7, 7, 127, 0.958
> 8, 0, 0, 127, 0.988
> 9, 1, 1, 127, 0.99
> 10, 2, 2, 127, 0.995
> 11, 3, 3, 127, 0.991
> 12, 4, 4, 127, 0.975
> 13, 5, 5, 127, 0.943
> 14, 6, 6, 127, 0.955
> 15, 7, 7, 127, 0.988
> 4, 0, 0, 127, 0.983
> 4, 0, 0, 254, 0.978
> 8, 0, 0, 254, 0.989
> 16, 0, 0, 127, 0.792
> 16, 0, 0, 254, 0.774
> 32, 0, 0, 127, 0.568
> 32, 0, 0, 254, 0.555
> 64, 0, 0, 127, 0.561
> 64, 0, 0, 254, 0.561
> 128, 0, 0, 127, 0.574
> 128, 0, 0, 254, 0.577
> 256, 0, 0, 127, 0.561
> 256, 0, 0, 254, 0.552
> 512, 0, 0, 127, 0.59
> 512, 0, 0, 254, 0.594
> 1024, 0, 0, 127, 0.528
> 1024, 0, 0, 254, 0.517
> 16, 1, 2, 127, 0.758
> 16, 2, 1, 254, 0.748
> 32, 2, 4, 127, 0.419
> 32, 4, 2, 254, 0.428
> 64, 3, 6, 127, 0.472
> 64, 6, 3, 254, 0.464
> 128, 4, 0, 127, 0.534
> 128, 0, 4, 254, 0.53
> 256, 5, 2, 127, 0.679
> 256, 2, 5, 254, 0.676
> 512, 6, 4, 127, 0.525
> 512, 4, 6, 254, 0.523
> 1024, 7, 6, 127, 0.518
> 1024, 6, 7, 254, 0.505
>
> sysdeps/x86_64/multiarch/Makefile | 4 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++
> sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 +
> .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++
> sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++
> sysdeps/x86_64/multiarch/strcmp-avx2.S | 230 +++++++++++++++---
> .../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++
> sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++
> 8 files changed, 324 insertions(+), 31 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e7b413edad..06e1848823 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -55,6 +55,8 @@ sysdep_routines += \
> stpncpy-sse2-unaligned \
> stpncpy-ssse3 \
> strcasecmp_l-avx \
> + strcasecmp_l-avx2 \
> + strcasecmp_l-avx2-rtm \
> strcasecmp_l-sse2 \
> strcasecmp_l-sse4_2 \
> strcasecmp_l-ssse3 \
> @@ -93,6 +95,8 @@ sysdep_routines += \
> strlen-evex \
> strlen-sse2 \
> strncase_l-avx \
> + strncase_l-avx2 \
> + strncase_l-avx2-rtm \
> strncase_l-sse2 \
> strncase_l-sse4_2 \
> strncase_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a594f4176e..3c556d07ac 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strcasecmp_avx2)
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strcasecmp_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (AVX),
> __strcasecmp_avx)
> @@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strcasecmp_l_avx2)
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strcasecmp_l_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> CPU_FEATURE_USABLE (AVX),
> __strcasecmp_l_avx)
> @@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strncasecmp_avx2)
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strncasecmp_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (AVX),
> __strncasecmp_avx)
> @@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + CPU_FEATURE_USABLE (AVX2),
> + __strncasecmp_l_avx2)
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __strncasecmp_l_avx2_rtm)
> IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> CPU_FEATURE_USABLE (AVX),
> __strncasecmp_l_avx)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index 9e3cc61ac0..c4de111fd0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
>
> static inline void *
> IFUNC_SELECTOR (void)
> {
> const struct cpu_features* cpu_features = __get_cpu_features ();
>
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> + {
> + if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> + return OPTIMIZE (avx2_rtm);
> +
> + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> + return OPTIMIZE (avx2);
> + }
> +
> if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
> return OPTIMIZE (avx);
>
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> new file mode 100644
> index 0000000000..09957fc3c5
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
> @@ -0,0 +1,15 @@
> +#ifndef STRCMP
> +# define STRCMP __strcasecmp_l_avx2_rtm
> +#endif
> +
> +#define _GLABEL(x) x ## _rtm
> +#define GLABEL(x) _GLABEL(x)
> +
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> +
> +#define SECTION(p) p##.avx.rtm
> +
> +#include "strcasecmp_l-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> new file mode 100644
> index 0000000000..e2762f2a22
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
> @@ -0,0 +1,23 @@
> +/* strcasecmp_l optimized with AVX2.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strcasecmp_l_avx2
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#include "strcmp-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> index 86a86b68e3..eeb90a0da6 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> @@ -20,6 +20,10 @@
>
> # include <sysdep.h>
>
> +# if defined USE_AS_STRCASECMP_L
> +# include "locale-defines.h"
> +# endif
> +
> # ifndef STRCMP
> # define STRCMP __strcmp_avx2
> # endif
> @@ -74,13 +78,88 @@
> # define VEC_OFFSET (-VEC_SIZE)
> # endif
>
> +# ifdef USE_AS_STRCASECMP_L
> +# define BYTE_LOOP_REG OFFSET_REG
> +# else
> +# define BYTE_LOOP_REG ecx
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# ifdef USE_AS_STRNCMP
> +# define STRCASECMP __strncasecmp_avx2
> +# define LOCALE_REG rcx
> +# define LOCALE_REG_LP RCX_LP
> +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +# else
> +# define STRCASECMP __strcasecmp_avx2
> +# define LOCALE_REG rdx
> +# define LOCALE_REG_LP RDX_LP
> +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +# endif
> +# endif
> +
> # define xmmZERO xmm15
> # define ymmZERO ymm15
>
> +# define LCASE_MIN_ymm %ymm10
> +# define LCASE_MAX_ymm %ymm11
> +# define CASE_ADD_ymm %ymm12
> +
> +# define LCASE_MIN_xmm %xmm10
> +# define LCASE_MAX_xmm %xmm11
> +# define CASE_ADD_xmm %xmm12
> +
> + /* r11 is never used elsewhere so this is safe to maintain. */
> +# define TOLOWER_BASE %r11
> +
> # ifndef SECTION
> # define SECTION(p) p##.avx
> # endif
>
> +# ifdef USE_AS_STRCASECMP_L
> +# define REG(x, y) x ## y
> +# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
> + vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
> + vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
> + vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
> + vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
> + vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
> + vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
> + vpaddb REG(%ext, 8), reg1_in, reg1_out; \
> + vpaddb REG(%ext, 9), reg2_in, reg2_out
> +
> +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> +# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
> +# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
> +
> +# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
> + TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
> + VPCMPEQ scratch_reg, s2_reg, reg_out
> +
> +# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
> + VMOVU s2_mem, reg_out; \
> + CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
> +
> +# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
> +# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
> +
> +# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
> +# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
> +
> +# else
> +# define TOLOWER_gpr(...)
> +# define TOLOWER_ymm(...)
> +# define TOLOWER_xmm(...)
> +
> +# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
> + VPCMPEQ s2_reg, s1_reg, reg_out
> +
> +# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> +
> +# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
> +# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
> +# endif
> +
> /* Warning!
> wcscmp/wcsncmp have to use SIGNED comparison for elements.
> strcmp/strncmp have to use UNSIGNED comparison for elements.
> @@ -102,7 +181,45 @@
> returned. */
>
> .section SECTION(.text), "ax", @progbits
> -ENTRY(STRCMP)
> + .align 16
> + .type STRCMP, @function
> + .globl STRCMP
> + .hidden STRCMP
> +
> +# ifndef GLABEL
> +# define GLABEL(...) __VA_ARGS__
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +ENTRY (GLABEL(STRCASECMP))
> + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> + mov %fs:(%rax), %LOCALE_REG_LP
> +
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> +END (GLABEL(STRCASECMP))
> + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
> +# endif
> +
> + .p2align 4
> +STRCMP:
> + cfi_startproc
> + _CET_ENDBR
> + CALL_MCOUNT
> +
> +# if defined USE_AS_STRCASECMP_L
> + /* We have to fall back on the C implementation for locales with
> + encodings not matching ASCII for single bytes. */
> +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> +# else
> + mov (%LOCALE_REG), %RAX_LP
> +# endif
> + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> + jne STRCASECMP_NONASCII
> + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> +# endif
> +
> # ifdef USE_AS_STRNCMP
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> @@ -128,6 +245,30 @@ ENTRY(STRCMP)
> # endif
> # endif
> vpxor %xmmZERO, %xmmZERO, %xmmZERO
> +# if defined USE_AS_STRCASECMP_L
> + .section .rodata.cst32, "aM", @progbits, 32
> + .align 32
> +L(lcase_min):
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> +L(lcase_max):
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> +L(case_add):
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .previous
> +
> + vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
> + vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
> + vmovdqa L(case_add)(%rip), CASE_ADD_ymm
> +# endif
> movl %edi, %eax
> orl %esi, %eax
> sall $20, %eax
> @@ -138,8 +279,10 @@ ENTRY(STRCMP)
> L(no_page_cross):
> /* Safe to compare 4x vectors. */
> VMOVU (%rdi), %ymm0
> - /* 1s where s1 and s2 equal. */
> - VPCMPEQ (%rsi), %ymm0, %ymm1
> + /* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
> + Otherwise converts ymm0 and the load from rsi to lower. ymm2 is
> + scratch and ymm1 is the return. */
> + CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
> /* 1s at null CHAR. */
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> /* 1s where s1 and s2 equal AND not null CHAR. */
> @@ -172,6 +315,8 @@ L(return_vec_0):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret0):
> @@ -207,6 +352,8 @@ L(one_or_less):
> # else
> movzbl (%rdi), %eax
> movzbl (%rsi), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret1):
> @@ -234,6 +381,8 @@ L(return_vec_1):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret2):
> @@ -265,6 +414,8 @@ L(return_vec_2):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret3):
> @@ -285,6 +436,8 @@ L(return_vec_3):
> # else
> movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret4):
> @@ -295,7 +448,7 @@ L(ret4):
> L(more_3x_vec):
> /* Safe to compare 4x vectors. */
> VMOVU VEC_SIZE(%rdi), %ymm0
> - VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -308,7 +461,7 @@ L(more_3x_vec):
> # endif
>
> VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
> - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -316,7 +469,7 @@ L(more_3x_vec):
> jnz L(return_vec_2)
>
> VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
> - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -391,12 +544,10 @@ L(loop_skip_page_cross_check):
> VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
>
> /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
> - VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
> -
> - VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
> - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> -
> + CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
> + CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
> + CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> + CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
>
> /* If any mismatches or null CHAR then 0 CHAR, otherwise non-
> zero. */
> @@ -465,6 +616,8 @@ L(return_vec_2_3_end):
> # else
> movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
> movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -508,6 +661,8 @@ L(return_vec_0_end):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -530,6 +685,8 @@ L(return_vec_1_end):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -556,6 +713,8 @@ L(return_vec_2_end):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -583,7 +742,7 @@ L(page_cross_during_loop):
> jle L(less_1x_vec_till_page_cross)
>
> VMOVA (%rdi), %ymm0
> - VPCMPEQ (%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -605,7 +764,7 @@ L(less_1x_vec_till_page_cross):
> here, it means the previous page (rdi - VEC_SIZE) has already
> been loaded earlier so must be valid. */
> VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
> - VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -647,6 +806,8 @@ L(return_page_cross_cmp_mem):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -673,7 +834,7 @@ L(more_2x_vec_till_page_cross):
> iteration here. */
>
> VMOVU VEC_SIZE(%rdi), %ymm0
> - VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -689,7 +850,7 @@ L(more_2x_vec_till_page_cross):
>
> /* Safe to include comparisons from lower bytes. */
> VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
> - VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -697,7 +858,7 @@ L(more_2x_vec_till_page_cross):
> jnz L(return_vec_page_cross_0)
>
> VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
> - VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -715,8 +876,8 @@ L(more_2x_vec_till_page_cross):
> VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
> VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
>
> - VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
> - VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
> + CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
> + CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
> vpand %ymm4, %ymm5, %ymm5
> vpand %ymm6, %ymm7, %ymm7
> VPMINU %ymm5, %ymm7, %ymm7
> @@ -767,6 +928,8 @@ L(return_vec_page_cross_1):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -822,7 +985,7 @@ L(page_cross):
> L(page_cross_loop):
>
> VMOVU (%rdi, %OFFSET_REG64), %ymm0
> - VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -840,11 +1003,11 @@ L(page_cross_loop):
> subl %eax, %OFFSET_REG
> /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
> to not cross page so is safe to load. Since we have already
> - loaded at least 1 VEC from rsi it is also guranteed to be safe.
> - */
> + loaded at least 1 VEC from rsi it is also guranteed to be
> + safe. */
>
> VMOVU (%rdi, %OFFSET_REG64), %ymm0
> - VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
> + CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
> VPCMPEQ %ymm0, %ymmZERO, %ymm2
> vpandn %ymm1, %ymm2, %ymm1
> vpmovmskb %ymm1, %ecx
> @@ -877,6 +1040,8 @@ L(ret_vec_page_cross_cont):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -930,7 +1095,7 @@ L(less_1x_vec_till_page):
> ja L(less_16_till_page)
>
> VMOVU (%rdi), %xmm0
> - VPCMPEQ (%rsi), %xmm0, %xmm1
> + CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> @@ -948,7 +1113,7 @@ L(less_1x_vec_till_page):
> # endif
>
> VMOVU (%rdi, %OFFSET_REG64), %xmm0
> - VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
> + CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> @@ -986,7 +1151,7 @@ L(less_16_till_page):
> vmovq (%rdi), %xmm0
> vmovq (%rsi), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> incb %cl
> @@ -1006,7 +1171,7 @@ L(less_16_till_page):
> vmovq (%rdi, %OFFSET_REG64), %xmm0
> vmovq (%rsi, %OFFSET_REG64), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> incb %cl
> @@ -1062,7 +1227,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi), %xmm0
> vmovd (%rsi), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> subl $0xf, %ecx
> @@ -1081,7 +1246,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi, %OFFSET_REG64), %xmm0
> vmovd (%rsi, %OFFSET_REG64), %xmm1
> VPCMPEQ %xmm0, %xmmZERO, %xmm2
> - VPCMPEQ %xmm1, %xmm0, %xmm1
> + CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
> vpandn %xmm1, %xmm2, %xmm1
> vpmovmskb %ymm1, %ecx
> subl $0xf, %ecx
> @@ -1115,7 +1280,9 @@ L(less_4_till_page):
> L(less_4_loop):
> movzbl (%rdi), %eax
> movzbl (%rsi, %rdi), %ecx
> - subl %ecx, %eax
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> + subl %BYTE_LOOP_REG, %eax
> jnz L(ret_less_4_loop)
> testl %ecx, %ecx
> jz L(ret_zero_4_loop)
> @@ -1142,5 +1309,6 @@ L(ret_less_4_loop):
> subl %r8d, %eax
> ret
> # endif
> -END(STRCMP)
> + cfi_endproc
> + .size STRCMP, .-STRCMP
> #endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> new file mode 100644
> index 0000000000..e194936c36
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
> @@ -0,0 +1,16 @@
> +#ifndef STRCMP
> +# define STRCMP __strncasecmp_l_avx2_rtm
> +#endif
> +
> +#define _GLABEL(x) x ## _rtm
> +#define GLABEL(x) _GLABEL(x)
> +
> +#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> +
> +#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> +
> +#define SECTION(p) p##.avx.rtm
> +#define OVERFLOW_STRCMP __strcasecmp_avx2_rtm
> +
> +#include "strncase_l-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> new file mode 100644
> index 0000000000..29afccbcc5
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
> @@ -0,0 +1,27 @@
> +/* strncasecmp_l optimized with AVX2.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strncasecmp_l_avx2
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#define USE_AS_STRNCMP
> +#ifndef OVERFLOW_STRCMP
> +# define OVERFLOW_STRCMP __strcasecmp_avx2
> +#endif
> +#include "strcmp-avx2.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.