[v1,22/23] x86: Add EVEX optimized str{n}casecmp
Checks
Context | Check | Description
dj/TryBot-apply_patch | success | Patch applied to master at the time it was sent
Commit Message
geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, EVEX Time / SSE42 Time
1, 1, 1, 127, 0.871
2, 2, 2, 127, 0.833
3, 3, 3, 127, 0.851
4, 4, 4, 127, 0.824
5, 5, 5, 127, 0.791
6, 6, 6, 127, 0.789
7, 7, 7, 127, 0.804
8, 0, 0, 127, 0.838
9, 1, 1, 127, 0.837
10, 2, 2, 127, 0.834
11, 3, 3, 127, 0.839
12, 4, 4, 127, 0.844
13, 5, 5, 127, 0.796
14, 6, 6, 127, 0.811
15, 7, 7, 127, 0.838
4, 0, 0, 127, 0.84
4, 0, 0, 254, 0.823
8, 0, 0, 254, 0.838
16, 0, 0, 127, 0.669
16, 0, 0, 254, 0.656
32, 0, 0, 127, 0.488
32, 0, 0, 254, 0.484
64, 0, 0, 127, 0.492
64, 0, 0, 254, 0.502
128, 0, 0, 127, 0.508
128, 0, 0, 254, 0.497
256, 0, 0, 127, 0.574
256, 0, 0, 254, 0.581
512, 0, 0, 127, 0.573
512, 0, 0, 254, 0.577
1024, 0, 0, 127, 0.489
1024, 0, 0, 254, 0.485
16, 1, 2, 127, 0.655
16, 2, 1, 254, 0.646
32, 2, 4, 127, 0.368
32, 4, 2, 254, 0.376
64, 3, 6, 127, 0.428
64, 6, 3, 254, 0.426
128, 4, 0, 127, 0.478
128, 0, 4, 254, 0.473
256, 5, 2, 127, 0.65
256, 2, 5, 254, 0.654
512, 6, 4, 127, 0.492
512, 4, 6, 254, 0.489
1024, 7, 6, 127, 0.463
1024, 6, 7, 254, 0.457
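
For clarity, the headline figure in the commit message is the geometric mean of the per-benchmark time ratios above (EVEX time divided by SSE4.2 time), so values below 1.0 mean the EVEX version is faster. A minimal C sketch of that summary computation, shown here only for illustration and not part of the patch:

#include <math.h>
#include <stddef.h>

/* Geometric mean of n time ratios, computed in log space for
   numerical stability: exp (mean (log (ratio[i]))).  */
static double
geometric_mean (const double *ratios, size_t n)
{
  double log_sum = 0.0;
  for (size_t i = 0; i < n; i++)
    log_sum += log (ratios[i]);
  return exp (log_sum / (double) n);
}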
sysdeps/x86_64/multiarch/Makefile | 2 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 ++
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 +
sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++
sysdeps/x86_64/multiarch/strcmp-evex.S | 280 ++++++++++++++++---
sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++
6 files changed, 314 insertions(+), 37 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
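
The core of the change is the new TOLOWER/TOLOWER_YMM machinery in strcmp-evex.S: each vector of characters is case-folded branchlessly by subtracting 'A' (0x41), comparing unsigned against 26 (0x1a) to build a mask of the uppercase bytes, and adding 0x20 under that mask (vpsubb/vpcmpub/vpaddb with the LCASE_MIN/LCASE_MAX/CASE_ADD constants). A scalar C sketch of the same idea, for illustration only:

/* Illustrative scalar equivalent of the vector TOLOWER used in the
   patch; not part of the patch itself.  */
static inline unsigned char
ascii_tolower (unsigned char c)
{
  /* (c - 'A') < 26 holds, with unsigned wraparound, exactly for
     'A'..'Z'.  */
  if ((unsigned char) (c - 0x41) < 0x1a)
    c += 0x20;	/* Fold into 'a'..'z'.  */
  return c;
}

Non-ASCII locales never reach this fast path: the new entry sequence tests _NL_CTYPE_NONASCII_CASE and jumps to the __str{n}casecmp_l_nonascii fallback, and the scalar tail comparisons go through the _nl_C_LC_CTYPE_tolower table via TOLOWER_BASE.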
Comments
On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, EVEX Time / SSE42 Time
> 1, 1, 1, 127, 0.871
> 2, 2, 2, 127, 0.833
> 3, 3, 3, 127, 0.851
> 4, 4, 4, 127, 0.824
> 5, 5, 5, 127, 0.791
> 6, 6, 6, 127, 0.789
> 7, 7, 7, 127, 0.804
> 8, 0, 0, 127, 0.838
> 9, 1, 1, 127, 0.837
> 10, 2, 2, 127, 0.834
> 11, 3, 3, 127, 0.839
> 12, 4, 4, 127, 0.844
> 13, 5, 5, 127, 0.796
> 14, 6, 6, 127, 0.811
> 15, 7, 7, 127, 0.838
> 4, 0, 0, 127, 0.84
> 4, 0, 0, 254, 0.823
> 8, 0, 0, 254, 0.838
> 16, 0, 0, 127, 0.669
> 16, 0, 0, 254, 0.656
> 32, 0, 0, 127, 0.488
> 32, 0, 0, 254, 0.484
> 64, 0, 0, 127, 0.492
> 64, 0, 0, 254, 0.502
> 128, 0, 0, 127, 0.508
> 128, 0, 0, 254, 0.497
> 256, 0, 0, 127, 0.574
> 256, 0, 0, 254, 0.581
> 512, 0, 0, 127, 0.573
> 512, 0, 0, 254, 0.577
> 1024, 0, 0, 127, 0.489
> 1024, 0, 0, 254, 0.485
> 16, 1, 2, 127, 0.655
> 16, 2, 1, 254, 0.646
> 32, 2, 4, 127, 0.368
> 32, 4, 2, 254, 0.376
> 64, 3, 6, 127, 0.428
> 64, 6, 3, 254, 0.426
> 128, 4, 0, 127, 0.478
> 128, 0, 4, 254, 0.473
> 256, 5, 2, 127, 0.65
> 256, 2, 5, 254, 0.654
> 512, 6, 4, 127, 0.492
> 512, 4, 6, 254, 0.489
> 1024, 7, 6, 127, 0.463
> 1024, 6, 7, 254, 0.457
>
> sysdeps/x86_64/multiarch/Makefile | 2 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 ++
> sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 +
> sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++
> sysdeps/x86_64/multiarch/strcmp-evex.S | 280 ++++++++++++++++---
> sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++
> 6 files changed, 314 insertions(+), 37 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 06e1848823..35d80dc2ff 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -57,6 +57,7 @@ sysdep_routines += \
> strcasecmp_l-avx \
> strcasecmp_l-avx2 \
> strcasecmp_l-avx2-rtm \
> + strcasecmp_l-evex \
> strcasecmp_l-sse2 \
> strcasecmp_l-sse4_2 \
> strcasecmp_l-ssse3 \
> @@ -97,6 +98,7 @@ sysdep_routines += \
> strncase_l-avx \
> strncase_l-avx2 \
> strncase_l-avx2-rtm \
> + strncase_l-evex \
> strncase_l-sse2 \
> strncase_l-sse4_2 \
> strncase_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 3c556d07ac..f1a4d3dac2 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strcasecmp_evex)
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strcasecmp_avx2)
> @@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> IFUNC_IMPL (i, name, strcasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strcasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strcasecmp_l_evex)
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strcasecmp_l_avx2)
> @@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strncasecmp_evex)
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strncasecmp_avx2)
> @@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strncase_l.c. */
> IFUNC_IMPL (i, name, strncasecmp_l,
> + IFUNC_IMPL_ADD (array, i, strncasecmp,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)),
> + __strncasecmp_l_evex)
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (AVX2),
> __strncasecmp_l_avx2)
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index c4de111fd0..bf0d146e7f 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
>
> static inline void *
> IFUNC_SELECTOR (void)
> @@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
> if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
> {
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> + return OPTIMIZE (evex);
> +
> if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> return OPTIMIZE (avx2_rtm);
>
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> new file mode 100644
> index 0000000000..58642db748
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
> @@ -0,0 +1,23 @@
> +/* strcasecmp_l optimized with EVEX.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strcasecmp_l_evex
> +#endif
> +#define USE_AS_STRCASECMP_L
> +#include "strcmp-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> index 56d8c118e4..85afd6535f 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> @@ -19,6 +19,9 @@
> #if IS_IN (libc)
>
> # include <sysdep.h>
> +# if defined USE_AS_STRCASECMP_L
> +# include "locale-defines.h"
> +# endif
>
> # ifndef STRCMP
> # define STRCMP __strcmp_evex
> @@ -34,19 +37,29 @@
> # define VMOVA vmovdqa64
>
> # ifdef USE_AS_WCSCMP
> -# define TESTEQ subl $0xff,
> +# ifndef OVERFLOW_STRCMP
> +# define OVERFLOW_STRCMP __wcscmp_evex
> +# endif
> +
> +# define TESTEQ subl $0xff,
> /* Compare packed dwords. */
> # define VPCMP vpcmpd
> # define VPMINU vpminud
> # define VPTESTM vptestmd
> +# define VPTESTNM vptestnmd
> /* 1 dword char == 4 bytes. */
> # define SIZE_OF_CHAR 4
> # else
> +# ifndef OVERFLOW_STRCMP
> +# define OVERFLOW_STRCMP __strcmp_evex
> +# endif
> +
> # define TESTEQ incl
> /* Compare packed bytes. */
> # define VPCMP vpcmpb
> # define VPMINU vpminub
> # define VPTESTM vptestmb
> +# define VPTESTNM vptestnmb
> /* 1 byte char == 1 byte. */
> # define SIZE_OF_CHAR 1
> # endif
> @@ -73,11 +86,16 @@
> # define VEC_OFFSET (-VEC_SIZE)
> # endif
>
> -# define XMMZERO xmm16
> # define XMM0 xmm17
> # define XMM1 xmm18
>
> -# define YMMZERO ymm16
> +# define XMM10 xmm27
> +# define XMM11 xmm28
> +# define XMM12 xmm29
> +# define XMM13 xmm30
> +# define XMM14 xmm31
> +
> +
> # define YMM0 ymm17
> # define YMM1 ymm18
> # define YMM2 ymm19
> @@ -89,6 +107,87 @@
> # define YMM8 ymm25
> # define YMM9 ymm26
> # define YMM10 ymm27
> +# define YMM11 ymm28
> +# define YMM12 ymm29
> +# define YMM13 ymm30
> +# define YMM14 ymm31
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# define BYTE_LOOP_REG OFFSET_REG
> +# else
> +# define BYTE_LOOP_REG ecx
> +# endif
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# ifdef USE_AS_STRNCMP
> +# define STRCASECMP __strncasecmp_evex
> +# define LOCALE_REG rcx
> +# define LOCALE_REG_LP RCX_LP
> +# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +# else
> +# define STRCASECMP __strcasecmp_evex
> +# define LOCALE_REG rdx
> +# define LOCALE_REG_LP RDX_LP
> +# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +# endif
> +# endif
> +
> +# define LCASE_MIN_YMM %YMM12
> +# define LCASE_MAX_YMM %YMM13
> +# define CASE_ADD_YMM %YMM14
> +
> +# define LCASE_MIN_XMM %XMM12
> +# define LCASE_MAX_XMM %XMM13
> +# define CASE_ADD_XMM %XMM14
> +
> + /* NB: wcsncmp uses r11 but strcasecmp is never used in
> + conjunction with wcscmp. */
> +# define TOLOWER_BASE %r11
> +
> +# ifdef USE_AS_STRCASECMP_L
> +# define _REG(x, y) x ## y
> +# define REG(x, y) _REG(x, y)
> +# define TOLOWER(reg1, reg2, ext) \
> + vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
> + vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
> + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
> + vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
> + vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
> + vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
> +
> +# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> +# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
> +# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)
> +
> +# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \
> + TOLOWER (s1_reg, s2_reg, ext); \
> + VPCMP $0, s1_reg, s2_reg, reg_out
> +
> +# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \
> + VMOVU s2_mem, s2_reg; \
> + CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
> +
> +# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
> +# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
> +
> +# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
> +# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
> +
> +# else
> +# define TOLOWER_gpr(...)
> +# define TOLOWER_YMM(...)
> +# define TOLOWER_XMM(...)
> +
> +# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \
> + VPCMP $0, s2_reg, s1_reg, reg_out
> +
> +# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
> +
> +# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \
> + VPCMP $0, s2_mem, s1_reg, reg_out
> +
> +# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
> +# endif
>
> /* Warning!
> wcscmp/wcsncmp have to use SIGNED comparison for elements.
> @@ -112,7 +211,41 @@
> returned. */
>
> .section .text.evex, "ax", @progbits
> -ENTRY(STRCMP)
> + .align 16
> + .type STRCMP, @function
> + .globl STRCMP
> + .hidden STRCMP
> +
> +# ifdef USE_AS_STRCASECMP_L
> +ENTRY (STRCASECMP)
> + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
> + mov %fs:(%rax), %LOCALE_REG_LP
> +
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> +END (STRCASECMP)
> + /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
> +# endif
> +
> + .p2align 4
> +STRCMP:
> + cfi_startproc
> + _CET_ENDBR
> + CALL_MCOUNT
> +
> +# if defined USE_AS_STRCASECMP_L
> + /* We have to fall back on the C implementation for locales with
> + encodings not matching ASCII for single bytes. */
> +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> + mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
> +# else
> + mov (%LOCALE_REG), %RAX_LP
> +# endif
> + testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> + jne STRCASECMP_NONASCII
> + leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> +# endif
> +
> # ifdef USE_AS_STRNCMP
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> @@ -125,6 +258,32 @@ ENTRY(STRCMP)
> actually bound the buffer. */
> jle L(one_or_less)
> # endif
> +
> +# if defined USE_AS_STRCASECMP_L
> + .section .rodata.cst32, "aM", @progbits, 32
> + .align 32
> +L(lcase_min):
> + .quad 0x4141414141414141
> + .quad 0x4141414141414141
> + .quad 0x4141414141414141
> + .quad 0x4141414141414141
> +L(lcase_max):
> + .quad 0x1a1a1a1a1a1a1a1a
> + .quad 0x1a1a1a1a1a1a1a1a
> + .quad 0x1a1a1a1a1a1a1a1a
> + .quad 0x1a1a1a1a1a1a1a1a
> +L(case_add):
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .previous
> +
> + vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
> + vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
> + vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
> +# endif
> +
> movl %edi, %eax
> orl %esi, %eax
> /* Shift out the bits irrelivant to page boundary ([63:12]). */
> @@ -139,7 +298,7 @@ L(no_page_cross):
> VPTESTM %YMM0, %YMM0, %k2
> /* Each bit cleared in K1 represents a mismatch or a null CHAR
> in YMM0 and 32 bytes at (%rsi). */
> - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_STRNCMP
> cmpq $CHAR_PER_VEC, %rdx
> @@ -169,6 +328,8 @@ L(return_vec_0):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret0):
> @@ -192,7 +353,7 @@ L(one_or_less):
> # ifdef USE_AS_WCSCMP
> /* 'nbe' covers the case where length is negative (large
> unsigned). */
> - jnbe __wcscmp_evex
> + jnbe OVERFLOW_STRCMP
> movl (%rdi), %edx
> xorl %eax, %eax
> cmpl (%rsi), %edx
> @@ -203,9 +364,11 @@ L(one_or_less):
> # else
> /* 'nbe' covers the case where length is negative (large
> unsigned). */
> - jnbe __strcmp_evex
> + jnbe OVERFLOW_STRCMP
> movzbl (%rdi), %eax
> movzbl (%rsi), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret1):
> @@ -233,6 +396,8 @@ L(return_vec_1):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret2):
> @@ -270,6 +435,8 @@ L(return_vec_2):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret3):
> @@ -290,6 +457,8 @@ L(return_vec_3):
> # else
> movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
> movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> # endif
> L(ret4):
> @@ -303,7 +472,7 @@ L(more_3x_vec):
> /* Safe to compare 4x vectors. */
> VMOVU (VEC_SIZE)(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_1)
> @@ -315,14 +484,14 @@ L(more_3x_vec):
>
> VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_2)
>
> VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_3)
> @@ -381,7 +550,6 @@ L(prepare_loop_aligned):
> subl %esi, %eax
> andl $(PAGE_SIZE - 1), %eax
>
> - vpxorq %YMMZERO, %YMMZERO, %YMMZERO
>
> /* Loop 4x comparisons at a time. */
> .p2align 4
> @@ -413,22 +581,35 @@ L(loop_skip_page_cross_check):
> /* A zero CHAR in YMM9 means that there is a null CHAR. */
> VPMINU %YMM8, %YMM9, %YMM9
>
> - /* Each bit set in K1 represents a non-null CHAR in YMM8. */
> + /* Each bit set in K1 represents a non-null CHAR in YMM9. */
> VPTESTM %YMM9, %YMM9, %k1
> -
> +# ifndef USE_AS_STRCASECMP_L
> vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
> vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
> vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
> oring with YMM1. Result is stored in YMM6. */
> vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
> -
> +# else
> + VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
> + TOLOWER_YMM (%YMM0, %YMM1)
> + VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
> + TOLOWER_YMM (%YMM2, %YMM3)
> + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> + TOLOWER_YMM (%YMM4, %YMM5)
> + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> + TOLOWER_YMM (%YMM6, %YMM7)
> + vpxorq %YMM0, %YMM1, %YMM1
> + vpxorq %YMM2, %YMM3, %YMM3
> + vpxorq %YMM4, %YMM5, %YMM5
> + vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> +# endif
> /* Or together YMM3, YMM5, and YMM6. */
> vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
>
>
> /* A non-zero CHAR in YMM6 represents a mismatch. */
> - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> kmovd %k0, %LOOP_REG
>
> TESTEQ %LOOP_REG
> @@ -437,13 +618,13 @@ L(loop_skip_page_cross_check):
>
> /* Find which VEC has the mismatch of end of string. */
> VPTESTM %YMM0, %YMM0, %k1
> - VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
> + VPTESTNM %YMM1, %YMM1, %k0{%k1}
> kmovd %k0, %ecx
> TESTEQ %ecx
> jnz L(return_vec_0_end)
>
> VPTESTM %YMM2, %YMM2, %k1
> - VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
> + VPTESTNM %YMM3, %YMM3, %k0{%k1}
> kmovd %k0, %ecx
> TESTEQ %ecx
> jnz L(return_vec_1_end)
> @@ -457,7 +638,7 @@ L(return_vec_2_3_end):
> # endif
>
> VPTESTM %YMM4, %YMM4, %k1
> - VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
> + VPTESTNM %YMM5, %YMM5, %k0{%k1}
> kmovd %k0, %ecx
> TESTEQ %ecx
> # if CHAR_PER_VEC <= 16
> @@ -493,6 +674,8 @@ L(return_vec_3_end):
> # else
> movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
> movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -545,6 +728,8 @@ L(return_vec_0_end):
> # else
> movzbl (%rdi, %rcx), %eax
> movzbl (%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> /* Flip `eax` if `rdi` and `rsi` where swapped in page cross
> logic. Subtract `r8d` after xor for zero case. */
> @@ -569,6 +754,8 @@ L(return_vec_1_end):
> # else
> movzbl VEC_SIZE(%rdi, %rcx), %eax
> movzbl VEC_SIZE(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -598,7 +785,7 @@ L(page_cross_during_loop):
>
> VMOVA (%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_0_end)
> @@ -619,8 +806,7 @@ L(less_1x_vec_till_page_cross):
> been loaded earlier so must be valid. */
> VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
> -
> + CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
> /* Mask of potentially valid bits. The lower bits can be out of
> range comparisons (but safe regarding page crosses). */
>
> @@ -642,6 +828,8 @@ L(less_1x_vec_till_page_cross):
>
> # ifdef USE_AS_STRNCMP
> # ifdef USE_AS_WCSCMP
> + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> + safe. */
> movl %eax, %r11d
> shrl $2, %r11d
> cmpq %r11, %rdx
> @@ -679,6 +867,8 @@ L(return_page_cross_cmp_mem):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -709,7 +899,7 @@ L(more_2x_vec_till_page_cross):
>
> VMOVA VEC_SIZE(%rdi), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_1_end)
> @@ -724,14 +914,14 @@ L(more_2x_vec_till_page_cross):
> /* Safe to include comparisons from lower bytes. */
> VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_page_cross_0)
>
> VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(return_vec_page_cross_1)
> @@ -740,6 +930,8 @@ L(more_2x_vec_till_page_cross):
> /* Must check length here as length might proclude reading next
> page. */
> # ifdef USE_AS_WCSCMP
> + /* NB: strcasecmp not used with WCSCMP so this access to r11 is
> + safe. */
> movl %eax, %r11d
> shrl $2, %r11d
> cmpq %r11, %rdx
> @@ -754,12 +946,19 @@ L(more_2x_vec_till_page_cross):
> VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
> VPMINU %YMM4, %YMM6, %YMM9
> VPTESTM %YMM9, %YMM9, %k1
> -
> +# ifndef USE_AS_STRCASECMP_L
> vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
> vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> -
> - VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
> +# else
> + VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
> + TOLOWER_YMM (%YMM4, %YMM5)
> + VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
> + TOLOWER_YMM (%YMM6, %YMM7)
> + vpxorq %YMM4, %YMM5, %YMM5
> + vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> +# endif
> + VPTESTNM %YMM6, %YMM6, %k0{%k1}
> kmovd %k0, %LOOP_REG
> TESTEQ %LOOP_REG
> jnz L(return_vec_2_3_end)
> @@ -815,6 +1014,8 @@ L(return_vec_page_cross_1):
> # else
> movzbl VEC_OFFSET(%rdi, %rcx), %eax
> movzbl VEC_OFFSET(%rsi, %rcx), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -871,7 +1072,7 @@ L(page_cross):
> L(page_cross_loop):
> VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> kmovd %k1, %ecx
> TESTEQ %ecx
> jnz L(check_ret_vec_page_cross)
> @@ -895,7 +1096,7 @@ L(page_cross_loop):
> */
> VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> VPTESTM %YMM0, %YMM0, %k2
> - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
> + CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
>
> kmovd %k1, %ecx
> # ifdef USE_AS_STRNCMP
> @@ -930,6 +1131,8 @@ L(ret_vec_page_cross_cont):
> # else
> movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
> movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %ecx)
> subl %ecx, %eax
> xorl %r8d, %eax
> subl %r8d, %eax
> @@ -989,7 +1192,7 @@ L(less_1x_vec_till_page):
> /* Use 16 byte comparison. */
> vmovdqu (%rdi), %xmm0
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, (%rsi), %xmm0, %k1{%k2}
> + CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0xf, %ecx
> @@ -1009,7 +1212,7 @@ L(less_1x_vec_till_page):
> # endif
> vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
> + CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0xf, %ecx
> @@ -1048,7 +1251,7 @@ L(less_16_till_page):
> vmovq (%rdi), %xmm0
> vmovq (%rsi), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0x3, %ecx
> @@ -1068,7 +1271,7 @@ L(less_16_till_page):
> vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> # ifdef USE_AS_WCSCMP
> subl $0x3, %ecx
> @@ -1128,7 +1331,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi), %xmm0
> vmovd (%rsi), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> subl $0xf, %ecx
> jnz L(check_ret_vec_page_cross)
> @@ -1143,7 +1346,7 @@ L(ret_less_8_wcs):
> vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
> VPTESTM %xmm0, %xmm0, %k2
> - VPCMP $0, %xmm1, %xmm0, %k1{%k2}
> + CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
> kmovd %k1, %ecx
> subl $0xf, %ecx
> jnz L(check_ret_vec_page_cross)
> @@ -1176,7 +1379,9 @@ L(less_4_till_page):
> L(less_4_loop):
> movzbl (%rdi), %eax
> movzbl (%rsi, %rdi), %ecx
> - subl %ecx, %eax
> + TOLOWER_gpr (%rax, %eax)
> + TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
> + subl %BYTE_LOOP_REG, %eax
> jnz L(ret_less_4_loop)
> testl %ecx, %ecx
> jz L(ret_zero_4_loop)
> @@ -1203,5 +1408,6 @@ L(ret_less_4_loop):
> subl %r8d, %eax
> ret
> # endif
> -END(STRCMP)
> + cfi_endproc
> + .size STRCMP, .-STRCMP
> #endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> new file mode 100644
> index 0000000000..b0808c1b21
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
> @@ -0,0 +1,25 @@
> +/* strncasecmp_l optimized with EVEX.
> + Copyright (C) 2017-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef STRCMP
> +# define STRCMP __strncasecmp_l_evex
> +#endif
> +#define OVERFLOW_STRCMP __strcasecmp_evex
> +#define USE_AS_STRCASECMP_L
> +#define USE_AS_STRNCMP
> +#include "strcmp-evex.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.