[v1,22/23] x86: Add EVEX optimized str{n}casecmp

Message ID 20220323215734.3927131-22-goldstein.w.n@gmail.com
State Superseded, archived
Series [v1,01/23] benchtests: Use json-lib in bench-strchr.c

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Noah Goldstein March 23, 2022, 9:57 p.m. UTC
  geometric_mean(N=40) of all benchmarks EVEX / SSE42: 0.621

All string/memory tests pass.
---
Geometric Mean of N=40 runs; all functions page aligned
length, align1, align2, max_char, EVEX Time / SSE42 Time
     1,      1,      1,      127,                  0.871
     2,      2,      2,      127,                  0.833
     3,      3,      3,      127,                  0.851
     4,      4,      4,      127,                  0.824
     5,      5,      5,      127,                  0.791
     6,      6,      6,      127,                  0.789
     7,      7,      7,      127,                  0.804
     8,      0,      0,      127,                  0.838
     9,      1,      1,      127,                  0.837
    10,      2,      2,      127,                  0.834
    11,      3,      3,      127,                  0.839
    12,      4,      4,      127,                  0.844
    13,      5,      5,      127,                  0.796
    14,      6,      6,      127,                  0.811
    15,      7,      7,      127,                  0.838
     4,      0,      0,      127,                   0.84
     4,      0,      0,      254,                  0.823
     8,      0,      0,      254,                  0.838
    16,      0,      0,      127,                  0.669
    16,      0,      0,      254,                  0.656
    32,      0,      0,      127,                  0.488
    32,      0,      0,      254,                  0.484
    64,      0,      0,      127,                  0.492
    64,      0,      0,      254,                  0.502
   128,      0,      0,      127,                  0.508
   128,      0,      0,      254,                  0.497
   256,      0,      0,      127,                  0.574
   256,      0,      0,      254,                  0.581
   512,      0,      0,      127,                  0.573
   512,      0,      0,      254,                  0.577
  1024,      0,      0,      127,                  0.489
  1024,      0,      0,      254,                  0.485
    16,      1,      2,      127,                  0.655
    16,      2,      1,      254,                  0.646
    32,      2,      4,      127,                  0.368
    32,      4,      2,      254,                  0.376
    64,      3,      6,      127,                  0.428
    64,      6,      3,      254,                  0.426
   128,      4,      0,      127,                  0.478
   128,      0,      4,      254,                  0.473
   256,      5,      2,      127,                   0.65
   256,      2,      5,      254,                  0.654
   512,      6,      4,      127,                  0.492
   512,      4,      6,      254,                  0.489
  1024,      7,      6,      127,                  0.463
  1024,      6,      7,      254,                  0.457
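
The core of the case-insensitive path is a branchless range-shift
tolower: subtract 'A' (0x41), unsigned-compare the result against 26
(0x1a), and add 0x20 only to bytes that were in ['A', 'Z'] -- the
constants in L(lcase_min), L(lcase_max) and L(case_add) below.  A
scalar C sketch of what the vectorized TOLOWER macro does per byte
(illustrative only, not part of the patch):

    #include <stdio.h>

    /* Branchless ASCII tolower.  The EVEX TOLOWER macro performs the
       same three steps on 32 bytes at a time with vpsubb/vpcmpub/
       vpaddb, keeping the in-range predicate in a mask register.  */
    static unsigned char
    ascii_tolower (unsigned char c)
    {
      unsigned char shifted = c - 0x41;   /* vpsubb, L(lcase_min)      */
      int is_upper = shifted < 0x1a;      /* vpcmpub $1, L(lcase_max)  */
      return c + (is_upper ? 0x20 : 0);   /* masked vpaddb, L(case_add) */
    }

    int
    main (void)
    {
      /* Prints: a z 0  */
      printf ("%c %c %c\n", ascii_tolower ('A'), ascii_tolower ('z'),
              ascii_tolower ('0'));
      return 0;
    }

The scalar fallback (TOLOWER_gpr) instead indexes the locale's
_nl_C_LC_CTYPE_tolower table, which yields the same result for ASCII.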

 sysdeps/x86_64/multiarch/Makefile            |   2 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  16 ++
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h  |   5 +
 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S |  23 ++
 sysdeps/x86_64/multiarch/strcmp-evex.S       | 280 ++++++++++++++++---
 sysdeps/x86_64/multiarch/strncase_l-evex.S   |  25 ++
 6 files changed, 314 insertions(+), 37 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S
  

Comments

H.J. Lu March 24, 2022, 7:04 p.m. UTC | #1
On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> geometric_mean(N=40) of all benchmarks EVEX / SSE42: 0.621
>
> All string/memory tests pass.
>
> [... benchmark table and full patch trimmed; identical to the Patch section below ...]

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 06e1848823..35d80dc2ff 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -57,6 +57,7 @@  sysdep_routines += \
   strcasecmp_l-avx \
   strcasecmp_l-avx2 \
   strcasecmp_l-avx2-rtm \
+  strcasecmp_l-evex \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
   strcasecmp_l-ssse3 \
@@ -97,6 +98,7 @@  sysdep_routines += \
   strncase_l-avx \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
+  strncase_l-evex \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
   strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 3c556d07ac..f1a4d3dac2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,10 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strcasecmp_avx2)
@@ -456,6 +460,10 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_l_evex)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strcasecmp_l_avx2)
@@ -590,6 +598,10 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strncasecmp_avx2)
@@ -611,6 +623,10 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_l_evex)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strncasecmp_l_avx2)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index c4de111fd0..bf0d146e7f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -25,6 +25,7 @@  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
@@ -34,6 +35,10 @@  IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+        return OPTIMIZE (evex);
+
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
         return OPTIMIZE (avx2_rtm);
 
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
new file mode 100644
index 0000000000..58642db748
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
@@ -0,0 +1,23 @@ 
+/* strcasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_evex
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 56d8c118e4..85afd6535f 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -19,6 +19,9 @@ 
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
 
 # ifndef STRCMP
 #  define STRCMP	__strcmp_evex
@@ -34,19 +37,29 @@ 
 # define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSCMP
-#  define TESTEQ	subl	$0xff,
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__wcscmp_evex
+#  endif
+
+#  define TESTEQ	subl $0xff,
 	/* Compare packed dwords.  */
 #  define VPCMP	vpcmpd
 #  define VPMINU	vpminud
 #  define VPTESTM	vptestmd
+#  define VPTESTNM	vptestnmd
 	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
 # else
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__strcmp_evex
+#  endif
+
 #  define TESTEQ	incl
 	/* Compare packed bytes.  */
 #  define VPCMP	vpcmpb
 #  define VPMINU	vpminub
 #  define VPTESTM	vptestmb
+#  define VPTESTNM	vptestnmb
 	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
 # endif
@@ -73,11 +86,16 @@ 
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
-# define XMMZERO	xmm16
 # define XMM0	xmm17
 # define XMM1	xmm18
 
-# define YMMZERO	ymm16
+# define XMM10	xmm27
+# define XMM11	xmm28
+# define XMM12	xmm29
+# define XMM13	xmm30
+# define XMM14	xmm31
+
+
 # define YMM0	ymm17
 # define YMM1	ymm18
 # define YMM2	ymm19
@@ -89,6 +107,87 @@ 
 # define YMM8	ymm25
 # define YMM9	ymm26
 # define YMM10	ymm27
+# define YMM11	ymm28
+# define YMM12	ymm29
+# define YMM13	ymm30
+# define YMM14	ymm31
+
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_evex
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_evex
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
+# define LCASE_MIN_YMM	%YMM12
+# define LCASE_MAX_YMM	%YMM13
+# define CASE_ADD_YMM	%YMM14
+
+# define LCASE_MIN_XMM	%XMM12
+# define LCASE_MAX_XMM	%XMM13
+# define CASE_ADD_XMM	%XMM14
+
+	/* NB: wcsncmp uses r11 but strcasecmp is never used in
+	   conjunction with wcscmp.  */
+# define TOLOWER_BASE	%r11
+
+# ifdef USE_AS_STRCASECMP_L
+#  define _REG(x, y) x ## y
+#  define REG(x, y) _REG(x, y)
+#  define TOLOWER(reg1, reg2, ext)										\
+	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
+	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
+	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
+	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
+
+#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
+#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
+	TOLOWER	(s1_reg, s2_reg, ext);										\
+	VPCMP	$0, s1_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
+	VMOVU	s2_mem, s2_reg;												\
+	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
+
+#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
+
+#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_YMM(...)
+#  define TOLOWER_XMM(...)
+
+#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
+	VPCMP	$0, s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+
+#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
+	VPCMP	$0, s2_mem, s1_reg, reg_out
+
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+# endif
 
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -112,7 +211,41 @@ 
    returned.  */
 
 	.section .text.evex, "ax", @progbits
-ENTRY(STRCMP)
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
+END (STRCASECMP)
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
@@ -125,6 +258,32 @@  ENTRY(STRCMP)
 	   actually bound the buffer.  */
 	jle	L(one_or_less)
 # endif
+
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+L(lcase_max):
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
+	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
+	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+# endif
+
 	movl	%edi, %eax
 	orl	%esi, %eax
 	/* Shift out the bits irrelevant to page boundary ([63:12]).  */
@@ -139,7 +298,7 @@  L(no_page_cross):
 	VPTESTM	%YMM0, %YMM0, %k2
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 	   in YMM0 and 32 bytes at (%rsi).  */
-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
 	cmpq	$CHAR_PER_VEC, %rdx
@@ -169,6 +328,8 @@  L(return_vec_0):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret0):
@@ -192,7 +353,7 @@  L(one_or_less):
 #  ifdef USE_AS_WCSCMP
 	/* 'nbe' covers the case where length is negative (large
 	   unsigned).  */
-	jnbe	__wcscmp_evex
+	jnbe	OVERFLOW_STRCMP
 	movl	(%rdi), %edx
 	xorl	%eax, %eax
 	cmpl	(%rsi), %edx
@@ -203,9 +364,11 @@  L(one_or_less):
 #  else
 	/* 'nbe' covers the case where length is negative (large
 	   unsigned).  */
-	jnbe	__strcmp_evex
+	jnbe	OVERFLOW_STRCMP
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret1):
@@ -233,6 +396,8 @@  L(return_vec_1):
 # else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret2):
@@ -270,6 +435,8 @@  L(return_vec_2):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret3):
@@ -290,6 +457,8 @@  L(return_vec_3):
 #  else
 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret4):
@@ -303,7 +472,7 @@  L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(VEC_SIZE)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1)
@@ -315,14 +484,14 @@  L(more_3x_vec):
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_3)
@@ -381,7 +550,6 @@  L(prepare_loop_aligned):
 	subl	%esi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 
-	vpxorq	%YMMZERO, %YMMZERO, %YMMZERO
 
 	/* Loop 4x comparisons at a time.  */
 	.p2align 4
@@ -413,22 +581,35 @@  L(loop_skip_page_cross_check):
 	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
 	VPMINU	%YMM8, %YMM9, %YMM9
 
-	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
+	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
 	VPTESTM	%YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
 	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
 	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
 	   oring with YMM1. Result is stored in YMM6.  */
 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
-
+# else
+	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
+	TOLOWER_YMM (%YMM0, %YMM1)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
+	TOLOWER_YMM (%YMM2, %YMM3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM0, %YMM1, %YMM1
+	vpxorq	%YMM2, %YMM3, %YMM3
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+# endif
 	/* Or together YMM3, YMM5, and YMM6.  */
 	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
 
 
 	/* A non-zero CHAR in YMM6 represents a mismatch.  */
-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 
 	TESTEQ	%LOOP_REG
@@ -437,13 +618,13 @@  L(loop_skip_page_cross_check):
 
 	/* Find which VEC has the mismatch of end of string.  */
 	VPTESTM	%YMM0, %YMM0, %k1
-	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
+	VPTESTNM %YMM1, %YMM1, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
 
 	VPTESTM	%YMM2, %YMM2, %k1
-	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
+	VPTESTNM %YMM3, %YMM3, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -457,7 +638,7 @@  L(return_vec_2_3_end):
 # endif
 
 	VPTESTM	%YMM4, %YMM4, %k1
-	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
+	VPTESTNM %YMM5, %YMM5, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 # if CHAR_PER_VEC <= 16
@@ -493,6 +674,8 @@  L(return_vec_3_end):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -545,6 +728,8 @@  L(return_vec_0_end):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
 	   logic. Subtract `r8d` after xor for zero case.  */
@@ -569,6 +754,8 @@  L(return_vec_1_end):
 #  else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -598,7 +785,7 @@  L(page_cross_during_loop):
 
 	VMOVA	(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
@@ -619,8 +806,7 @@  L(less_1x_vec_till_page_cross):
 	   been loaded earlier so must be valid.  */
 	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
-
+	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
 	/* Mask of potentially valid bits. The lower bits can be out of
 	   range comparisons (but safe regarding page crosses).  */
 
@@ -642,6 +828,8 @@  L(less_1x_vec_till_page_cross):
 
 # ifdef USE_AS_STRNCMP
 #  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
 	movl	%eax, %r11d
 	shrl	$2, %r11d
 	cmpq	%r11, %rdx
@@ -679,6 +867,8 @@  L(return_page_cross_cmp_mem):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -709,7 +899,7 @@  L(more_2x_vec_till_page_cross):
 
 	VMOVA	VEC_SIZE(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -724,14 +914,14 @@  L(more_2x_vec_till_page_cross):
 	/* Safe to include comparisons from lower bytes.  */
 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_0)
 
 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_1)
@@ -740,6 +930,8 @@  L(more_2x_vec_till_page_cross):
 	/* Must check length here as length might preclude reading next
 	   page.  */
 #  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
 	movl	%eax, %r11d
 	shrl	$2, %r11d
 	cmpq	%r11, %rdx
@@ -754,12 +946,19 @@  L(more_2x_vec_till_page_cross):
 	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
 	VPMINU	%YMM4, %YMM6, %YMM9
 	VPTESTM	%YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
 	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
-
-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
+# else
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
+# endif
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 	TESTEQ	%LOOP_REG
 	jnz	L(return_vec_2_3_end)
@@ -815,6 +1014,8 @@  L(return_vec_page_cross_1):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -871,7 +1072,7 @@  L(page_cross):
 L(page_cross_loop):
 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -895,7 +1096,7 @@  L(page_cross_loop):
 	 */
 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
 
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
@@ -930,6 +1131,8 @@  L(ret_vec_page_cross_cont):
 # else
 	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
 	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -989,7 +1192,7 @@  L(less_1x_vec_till_page):
 	/* Use 16 byte comparison.  */
 	vmovdqu	(%rdi), %xmm0
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, (%rsi), %xmm0, %k1{%k2}
+	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0xf, %ecx
@@ -1009,7 +1212,7 @@  L(less_1x_vec_till_page):
 # endif
 	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
+	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0xf, %ecx
@@ -1048,7 +1251,7 @@  L(less_16_till_page):
 	vmovq	(%rdi), %xmm0
 	vmovq	(%rsi), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0x3, %ecx
@@ -1068,7 +1271,7 @@  L(less_16_till_page):
 	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0x3, %ecx
@@ -1128,7 +1331,7 @@  L(ret_less_8_wcs):
 	vmovd	(%rdi), %xmm0
 	vmovd	(%rsi), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 	subl	$0xf, %ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -1143,7 +1346,7 @@  L(ret_less_8_wcs):
 	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 	subl	$0xf, %ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -1176,7 +1379,9 @@  L(less_4_till_page):
 L(less_4_loop):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi, %rdi), %ecx
-	subl	%ecx, %eax
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
 	jnz	L(ret_less_4_loop)
 	testl	%ecx, %ecx
 	jz	L(ret_zero_4_loop)
@@ -1203,5 +1408,6 @@  L(ret_less_4_loop):
 	subl	%r8d, %eax
 	ret
 # endif
-END(STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
new file mode 100644
index 0000000000..b0808c1b21
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
@@ -0,0 +1,25 @@ 
+/* strncasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_evex
+#endif
+#define OVERFLOW_STRCMP	__strcasecmp_evex
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#include "strcmp-evex.S"
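
For reference, a minimal caller of the interface these entry points
implement.  Per the _NL_CTYPE_NONASCII_CASE test in the prologue, the
EVEX path only folds case itself when the locale's single-byte
encoding is ASCII; other locales are dispatched to
__str{n}casecmp_l_nonascii.  A small usage sketch (illustrative, not
part of the patch):

    #define _GNU_SOURCE
    #include <locale.h>
    #include <stdio.h>
    #include <strings.h>

    int
    main (void)
    {
      locale_t loc = newlocale (LC_ALL_MASK, "C", (locale_t) 0);
      if (loc == (locale_t) 0)
        return 1;
      /* ASCII-only case folding: with the C locale these calls are
         eligible for the vectorized path on AVX512VL+AVX512BW CPUs.  */
      printf ("%d\n", strcasecmp_l ("EVEX", "evex", loc));          /* 0 */
      printf ("%d\n", strncasecmp_l ("Hello!", "hellO?", 5, loc));  /* 0 */
      freelocale (loc);
      return 0;
    }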