[v1,18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S

Message ID 20220323215734.3927131-18-goldstein.w.n@gmail.com
State Accepted, archived
Headers
Series [v1,01/23] benchtests: Use json-lib in bench-strchr.c |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Noah Goldstein March 23, 2022, 9:57 p.m. UTC
  Slightly faster method of doing TOLOWER that saves an
instruction.

Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with
CET enabled, the hard-coded NOP misaligned the entry to strcasecmp.

geometric_mean(N=40) of all benchmarks New / Original: .920

All string/memory tests pass.
---
Geometric Mean of N=40 runs; all functions page aligned
length, align1, align2, max_char, New Time / Old Time
     1,      1,      1,      127,               0.914
     2,      2,      2,      127,               0.952
     3,      3,      3,      127,               0.924
     4,      4,      4,      127,               0.995
     5,      5,      5,      127,               0.985
     6,      6,      6,      127,               1.017
     7,      7,      7,      127,               1.031
     8,      0,      0,      127,               0.967
     9,      1,      1,      127,               0.969
    10,      2,      2,      127,               0.951
    11,      3,      3,      127,               0.938
    12,      4,      4,      127,               0.937
    13,      5,      5,      127,               0.967
    14,      6,      6,      127,               0.941
    15,      7,      7,      127,               0.951
     4,      0,      0,      127,               0.959
     4,      0,      0,      254,                0.98
     8,      0,      0,      254,               0.959
    16,      0,      0,      127,               0.895
    16,      0,      0,      254,               0.901
    32,      0,      0,      127,                0.85
    32,      0,      0,      254,               0.851
    64,      0,      0,      127,               0.897
    64,      0,      0,      254,               0.895
   128,      0,      0,      127,               0.944
   128,      0,      0,      254,               0.935
   256,      0,      0,      127,               0.922
   256,      0,      0,      254,               0.913
   512,      0,      0,      127,               0.921
   512,      0,      0,      254,               0.914
  1024,      0,      0,      127,               0.845
  1024,      0,      0,      254,                0.84
    16,      1,      2,      127,               0.923
    16,      2,      1,      254,               0.955
    32,      2,      4,      127,               0.979
    32,      4,      2,      254,               0.957
    64,      3,      6,      127,               0.866
    64,      6,      3,      254,               0.849
   128,      4,      0,      127,               0.882
   128,      0,      4,      254,               0.876
   256,      5,      2,      127,               0.877
   256,      2,      5,      254,               0.882
   512,      6,      4,      127,               0.822
   512,      4,      6,      254,               0.862
  1024,      7,      6,      127,               0.903
  1024,      6,      7,      254,               0.908

 sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
 1 file changed, 35 insertions(+), 48 deletions(-)
  

Comments

H.J. Lu March 24, 2022, 7:02 p.m. UTC | #1
On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Slightly faster method of doing TOLOWER that saves an
> instruction.
>
> Also replace the hard coded 5-byte no with .p2align 4. On builds with
> CET enabled this misaligned entry to strcasecmp.
>
> geometric_mean(N=40) of all benchmarks New / Original: .920
>
> All string/memory tests pass.
> ---
> Geomtric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, New Time / Old Time
>      1,      1,      1,      127,               0.914
>      2,      2,      2,      127,               0.952
>      3,      3,      3,      127,               0.924
>      4,      4,      4,      127,               0.995
>      5,      5,      5,      127,               0.985
>      6,      6,      6,      127,               1.017
>      7,      7,      7,      127,               1.031
>      8,      0,      0,      127,               0.967
>      9,      1,      1,      127,               0.969
>     10,      2,      2,      127,               0.951
>     11,      3,      3,      127,               0.938
>     12,      4,      4,      127,               0.937
>     13,      5,      5,      127,               0.967
>     14,      6,      6,      127,               0.941
>     15,      7,      7,      127,               0.951
>      4,      0,      0,      127,               0.959
>      4,      0,      0,      254,                0.98
>      8,      0,      0,      254,               0.959
>     16,      0,      0,      127,               0.895
>     16,      0,      0,      254,               0.901
>     32,      0,      0,      127,                0.85
>     32,      0,      0,      254,               0.851
>     64,      0,      0,      127,               0.897
>     64,      0,      0,      254,               0.895
>    128,      0,      0,      127,               0.944
>    128,      0,      0,      254,               0.935
>    256,      0,      0,      127,               0.922
>    256,      0,      0,      254,               0.913
>    512,      0,      0,      127,               0.921
>    512,      0,      0,      254,               0.914
>   1024,      0,      0,      127,               0.845
>   1024,      0,      0,      254,                0.84
>     16,      1,      2,      127,               0.923
>     16,      2,      1,      254,               0.955
>     32,      2,      4,      127,               0.979
>     32,      4,      2,      254,               0.957
>     64,      3,      6,      127,               0.866
>     64,      6,      3,      254,               0.849
>    128,      4,      0,      127,               0.882
>    128,      0,      4,      254,               0.876
>    256,      5,      2,      127,               0.877
>    256,      2,      5,      254,               0.882
>    512,      6,      4,      127,               0.822
>    512,      4,      6,      254,               0.862
>   1024,      7,      6,      127,               0.903
>   1024,      6,      7,      254,               0.908
>
>  sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
>  1 file changed, 35 insertions(+), 48 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> index 580feb90e9..7805ae9d41 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> @@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp))
>         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
>         mov     %fs:(%rax),%RDX_LP
>
> -       // XXX 5 byte should be before the function
> -       /* 5-byte NOP.  */
> -       .byte   0x0f,0x1f,0x44,0x00,0x00
> +       /* Either 1 or 5 bytes (dependeing if CET is enabled).  */
> +       .p2align 4
>  END (GLABEL(__strcasecmp))
>         /* FALLTHROUGH to strcasecmp_l.  */
>  #endif
> @@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp))
>         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
>         mov     %fs:(%rax),%RCX_LP
>
> -       // XXX 5 byte should be before the function
> -       /* 5-byte NOP.  */
> -       .byte   0x0f,0x1f,0x44,0x00,0x00
> +       /* Either 1 or 5 bytes (dependeing if CET is enabled).  */
> +       .p2align 4
>  END (GLABEL(__strncasecmp))
>         /* FALLTHROUGH to strncasecmp_l.  */
>  #endif
> @@ -169,27 +167,22 @@ STRCMP_SSE42:
>  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
>         .section .rodata.cst16,"aM",@progbits,16
>         .align 16
> -LABEL(belowupper):
> -       .quad   0x4040404040404040
> -       .quad   0x4040404040404040
> -LABEL(topupper):
> -# ifdef USE_AVX
> -       .quad   0x5a5a5a5a5a5a5a5a
> -       .quad   0x5a5a5a5a5a5a5a5a
> -# else
> -       .quad   0x5b5b5b5b5b5b5b5b
> -       .quad   0x5b5b5b5b5b5b5b5b
> -# endif
> -LABEL(touppermask):
> +LABEL(lcase_min):
> +       .quad   0x3f3f3f3f3f3f3f3f
> +       .quad   0x3f3f3f3f3f3f3f3f
> +LABEL(lcase_max):
> +       .quad   0x9999999999999999
> +       .quad   0x9999999999999999
> +LABEL(case_add):
>         .quad   0x2020202020202020
>         .quad   0x2020202020202020
>         .previous
> -       movdqa  LABEL(belowupper)(%rip), %xmm4
> -# define UCLOW_reg %xmm4
> -       movdqa  LABEL(topupper)(%rip), %xmm5
> -# define UCHIGH_reg %xmm5
> -       movdqa  LABEL(touppermask)(%rip), %xmm6
> -# define LCQWORD_reg %xmm6
> +       movdqa  LABEL(lcase_min)(%rip), %xmm4
> +# define LCASE_MIN_reg %xmm4
> +       movdqa  LABEL(lcase_max)(%rip), %xmm5
> +# define LCASE_MAX_reg %xmm5
> +       movdqa  LABEL(case_add)(%rip), %xmm6
> +# define CASE_ADD_reg %xmm6
>  #endif
>         cmp     $0x30, %ecx
>         ja      LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
> @@ -200,32 +193,26 @@ LABEL(touppermask):
>  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
>  # ifdef USE_AVX
>  #  define TOLOWER(reg1, reg2) \
> -       vpcmpgtb UCLOW_reg, reg1, %xmm7;                        \
> -       vpcmpgtb UCHIGH_reg, reg1, %xmm8;                       \
> -       vpcmpgtb UCLOW_reg, reg2, %xmm9;                        \
> -       vpcmpgtb UCHIGH_reg, reg2, %xmm10;                      \
> -       vpandn  %xmm7, %xmm8, %xmm8;                                    \
> -       vpandn  %xmm9, %xmm10, %xmm10;                                  \
> -       vpand   LCQWORD_reg, %xmm8, %xmm8;                              \
> -       vpand   LCQWORD_reg, %xmm10, %xmm10;                            \
> -       vpor    reg1, %xmm8, reg1;                                      \
> -       vpor    reg2, %xmm10, reg2
> +       vpaddb  LCASE_MIN_reg, reg1, %xmm7;                                     \
> +       vpaddb  LCASE_MIN_reg, reg2, %xmm8;                                     \
> +       vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;                                   \
> +       vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;                                   \
> +       vpandn  CASE_ADD_reg, %xmm7, %xmm7;                                     \
> +       vpandn  CASE_ADD_reg, %xmm8, %xmm8;                                     \
> +       vpaddb  %xmm7, reg1, reg1;                                      \
> +       vpaddb  %xmm8, reg2, reg2
>  # else
>  #  define TOLOWER(reg1, reg2) \
> -       movdqa  reg1, %xmm7;                                    \
> -       movdqa  UCHIGH_reg, %xmm8;                              \
> -       movdqa  reg2, %xmm9;                                    \
> -       movdqa  UCHIGH_reg, %xmm10;                             \
> -       pcmpgtb UCLOW_reg, %xmm7;                               \
> -       pcmpgtb reg1, %xmm8;                                    \
> -       pcmpgtb UCLOW_reg, %xmm9;                               \
> -       pcmpgtb reg2, %xmm10;                                   \
> -       pand    %xmm8, %xmm7;                                   \
> -       pand    %xmm10, %xmm9;                                  \
> -       pand    LCQWORD_reg, %xmm7;                             \
> -       pand    LCQWORD_reg, %xmm9;                             \
> -       por     %xmm7, reg1;                                    \
> -       por     %xmm9, reg2
> +       movdqa  LCASE_MIN_reg, %xmm7;                                   \
> +       movdqa  LCASE_MIN_reg, %xmm8;                                   \
> +       paddb   reg1, %xmm7;                                    \
> +       paddb   reg2, %xmm8;                                    \
> +       pcmpgtb LCASE_MAX_reg, %xmm7;                           \
> +       pcmpgtb LCASE_MAX_reg, %xmm8;                           \
> +       pandn   CASE_ADD_reg, %xmm7;                                    \
> +       pandn   CASE_ADD_reg, %xmm8;                                    \
> +       paddb   %xmm7, reg1;                                    \
> +       paddb   %xmm8, reg2
>  # endif
>         TOLOWER (%xmm1, %xmm2)
>  #else
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  
Sunil Pandey May 12, 2022, 7:45 p.m. UTC | #2
On Thu, Mar 24, 2022 at 12:05 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Slightly faster method of doing TOLOWER that saves an
> > instruction.
> >
> > Also replace the hard coded 5-byte no with .p2align 4. On builds with
> > CET enabled this misaligned entry to strcasecmp.
> >
> > geometric_mean(N=40) of all benchmarks New / Original: .920
> >
> > All string/memory tests pass.
> > ---
> > Geomtric Mean N=40 runs; All functions page aligned
> > length, align1, align2, max_char, New Time / Old Time
> >      1,      1,      1,      127,               0.914
> >      2,      2,      2,      127,               0.952
> >      3,      3,      3,      127,               0.924
> >      4,      4,      4,      127,               0.995
> >      5,      5,      5,      127,               0.985
> >      6,      6,      6,      127,               1.017
> >      7,      7,      7,      127,               1.031
> >      8,      0,      0,      127,               0.967
> >      9,      1,      1,      127,               0.969
> >     10,      2,      2,      127,               0.951
> >     11,      3,      3,      127,               0.938
> >     12,      4,      4,      127,               0.937
> >     13,      5,      5,      127,               0.967
> >     14,      6,      6,      127,               0.941
> >     15,      7,      7,      127,               0.951
> >      4,      0,      0,      127,               0.959
> >      4,      0,      0,      254,                0.98
> >      8,      0,      0,      254,               0.959
> >     16,      0,      0,      127,               0.895
> >     16,      0,      0,      254,               0.901
> >     32,      0,      0,      127,                0.85
> >     32,      0,      0,      254,               0.851
> >     64,      0,      0,      127,               0.897
> >     64,      0,      0,      254,               0.895
> >    128,      0,      0,      127,               0.944
> >    128,      0,      0,      254,               0.935
> >    256,      0,      0,      127,               0.922
> >    256,      0,      0,      254,               0.913
> >    512,      0,      0,      127,               0.921
> >    512,      0,      0,      254,               0.914
> >   1024,      0,      0,      127,               0.845
> >   1024,      0,      0,      254,                0.84
> >     16,      1,      2,      127,               0.923
> >     16,      2,      1,      254,               0.955
> >     32,      2,      4,      127,               0.979
> >     32,      4,      2,      254,               0.957
> >     64,      3,      6,      127,               0.866
> >     64,      6,      3,      254,               0.849
> >    128,      4,      0,      127,               0.882
> >    128,      0,      4,      254,               0.876
> >    256,      5,      2,      127,               0.877
> >    256,      2,      5,      254,               0.882
> >    512,      6,      4,      127,               0.822
> >    512,      4,      6,      254,               0.862
> >   1024,      7,      6,      127,               0.903
> >   1024,      6,      7,      254,               0.908
> >
> >  sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
> >  1 file changed, 35 insertions(+), 48 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > index 580feb90e9..7805ae9d41 100644
> > --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> > @@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp))
> >         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
> >         mov     %fs:(%rax),%RDX_LP
> >
> > -       // XXX 5 byte should be before the function
> > -       /* 5-byte NOP.  */
> > -       .byte   0x0f,0x1f,0x44,0x00,0x00
> > +       /* Either 1 or 5 bytes (dependeing if CET is enabled).  */
> > +       .p2align 4
> >  END (GLABEL(__strcasecmp))
> >         /* FALLTHROUGH to strcasecmp_l.  */
> >  #endif
> > @@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp))
> >         movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
> >         mov     %fs:(%rax),%RCX_LP
> >
> > -       // XXX 5 byte should be before the function
> > -       /* 5-byte NOP.  */
> > -       .byte   0x0f,0x1f,0x44,0x00,0x00
> > +       /* Either 1 or 5 bytes (dependeing if CET is enabled).  */
> > +       .p2align 4
> >  END (GLABEL(__strncasecmp))
> >         /* FALLTHROUGH to strncasecmp_l.  */
> >  #endif
> > @@ -169,27 +167,22 @@ STRCMP_SSE42:
> >  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> >         .section .rodata.cst16,"aM",@progbits,16
> >         .align 16
> > -LABEL(belowupper):
> > -       .quad   0x4040404040404040
> > -       .quad   0x4040404040404040
> > -LABEL(topupper):
> > -# ifdef USE_AVX
> > -       .quad   0x5a5a5a5a5a5a5a5a
> > -       .quad   0x5a5a5a5a5a5a5a5a
> > -# else
> > -       .quad   0x5b5b5b5b5b5b5b5b
> > -       .quad   0x5b5b5b5b5b5b5b5b
> > -# endif
> > -LABEL(touppermask):
> > +LABEL(lcase_min):
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +       .quad   0x3f3f3f3f3f3f3f3f
> > +LABEL(lcase_max):
> > +       .quad   0x9999999999999999
> > +       .quad   0x9999999999999999
> > +LABEL(case_add):
> >         .quad   0x2020202020202020
> >         .quad   0x2020202020202020
> >         .previous
> > -       movdqa  LABEL(belowupper)(%rip), %xmm4
> > -# define UCLOW_reg %xmm4
> > -       movdqa  LABEL(topupper)(%rip), %xmm5
> > -# define UCHIGH_reg %xmm5
> > -       movdqa  LABEL(touppermask)(%rip), %xmm6
> > -# define LCQWORD_reg %xmm6
> > +       movdqa  LABEL(lcase_min)(%rip), %xmm4
> > +# define LCASE_MIN_reg %xmm4
> > +       movdqa  LABEL(lcase_max)(%rip), %xmm5
> > +# define LCASE_MAX_reg %xmm5
> > +       movdqa  LABEL(case_add)(%rip), %xmm6
> > +# define CASE_ADD_reg %xmm6
> >  #endif
> >         cmp     $0x30, %ecx
> >         ja      LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
> > @@ -200,32 +193,26 @@ LABEL(touppermask):
> >  #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> >  # ifdef USE_AVX
> >  #  define TOLOWER(reg1, reg2) \
> > -       vpcmpgtb UCLOW_reg, reg1, %xmm7;                        \
> > -       vpcmpgtb UCHIGH_reg, reg1, %xmm8;                       \
> > -       vpcmpgtb UCLOW_reg, reg2, %xmm9;                        \
> > -       vpcmpgtb UCHIGH_reg, reg2, %xmm10;                      \
> > -       vpandn  %xmm7, %xmm8, %xmm8;                                    \
> > -       vpandn  %xmm9, %xmm10, %xmm10;                                  \
> > -       vpand   LCQWORD_reg, %xmm8, %xmm8;                              \
> > -       vpand   LCQWORD_reg, %xmm10, %xmm10;                            \
> > -       vpor    reg1, %xmm8, reg1;                                      \
> > -       vpor    reg2, %xmm10, reg2
> > +       vpaddb  LCASE_MIN_reg, reg1, %xmm7;                                     \
> > +       vpaddb  LCASE_MIN_reg, reg2, %xmm8;                                     \
> > +       vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;                                   \
> > +       vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;                                   \
> > +       vpandn  CASE_ADD_reg, %xmm7, %xmm7;                                     \
> > +       vpandn  CASE_ADD_reg, %xmm8, %xmm8;                                     \
> > +       vpaddb  %xmm7, reg1, reg1;                                      \
> > +       vpaddb  %xmm8, reg2, reg2
> >  # else
> >  #  define TOLOWER(reg1, reg2) \
> > -       movdqa  reg1, %xmm7;                                    \
> > -       movdqa  UCHIGH_reg, %xmm8;                              \
> > -       movdqa  reg2, %xmm9;                                    \
> > -       movdqa  UCHIGH_reg, %xmm10;                             \
> > -       pcmpgtb UCLOW_reg, %xmm7;                               \
> > -       pcmpgtb reg1, %xmm8;                                    \
> > -       pcmpgtb UCLOW_reg, %xmm9;                               \
> > -       pcmpgtb reg2, %xmm10;                                   \
> > -       pand    %xmm8, %xmm7;                                   \
> > -       pand    %xmm10, %xmm9;                                  \
> > -       pand    LCQWORD_reg, %xmm7;                             \
> > -       pand    LCQWORD_reg, %xmm9;                             \
> > -       por     %xmm7, reg1;                                    \
> > -       por     %xmm9, reg2
> > +       movdqa  LCASE_MIN_reg, %xmm7;                                   \
> > +       movdqa  LCASE_MIN_reg, %xmm8;                                   \
> > +       paddb   reg1, %xmm7;                                    \
> > +       paddb   reg2, %xmm8;                                    \
> > +       pcmpgtb LCASE_MAX_reg, %xmm7;                           \
> > +       pcmpgtb LCASE_MAX_reg, %xmm8;                           \
> > +       pandn   CASE_ADD_reg, %xmm7;                                    \
> > +       pandn   CASE_ADD_reg, %xmm8;                                    \
> > +       paddb   %xmm7, reg1;                                    \
> > +       paddb   %xmm8, reg2
> >  # endif
> >         TOLOWER (%xmm1, %xmm2)
> >  #else
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
  

Patch

diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index 580feb90e9..7805ae9d41 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -88,9 +88,8 @@  ENTRY (GLABEL(__strcasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RDX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strcasecmp))
 	/* FALLTHROUGH to strcasecmp_l.  */
 #endif
@@ -99,9 +98,8 @@  ENTRY (GLABEL(__strncasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RCX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strncasecmp))
 	/* FALLTHROUGH to strncasecmp_l.  */
 #endif
@@ -169,27 +167,22 @@  STRCMP_SSE42:
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
-LABEL(belowupper):
-	.quad	0x4040404040404040
-	.quad	0x4040404040404040
-LABEL(topupper):
-# ifdef USE_AVX
-	.quad	0x5a5a5a5a5a5a5a5a
-	.quad	0x5a5a5a5a5a5a5a5a
-# else
-	.quad	0x5b5b5b5b5b5b5b5b
-	.quad	0x5b5b5b5b5b5b5b5b
-# endif
-LABEL(touppermask):
+LABEL(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+LABEL(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+LABEL(case_add):
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.previous
-	movdqa	LABEL(belowupper)(%rip), %xmm4
-# define UCLOW_reg %xmm4
-	movdqa	LABEL(topupper)(%rip), %xmm5
-# define UCHIGH_reg %xmm5
-	movdqa	LABEL(touppermask)(%rip), %xmm6
-# define LCQWORD_reg %xmm6
+	movdqa	LABEL(lcase_min)(%rip), %xmm4
+# define LCASE_MIN_reg %xmm4
+	movdqa	LABEL(lcase_max)(%rip), %xmm5
+# define LCASE_MAX_reg %xmm5
+	movdqa	LABEL(case_add)(%rip), %xmm6
+# define CASE_ADD_reg %xmm6
 #endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
@@ -200,32 +193,26 @@  LABEL(touppermask):
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 # ifdef USE_AVX
 #  define TOLOWER(reg1, reg2) \
-	vpcmpgtb UCLOW_reg, reg1, %xmm7;			\
-	vpcmpgtb UCHIGH_reg, reg1, %xmm8;			\
-	vpcmpgtb UCLOW_reg, reg2, %xmm9;			\
-	vpcmpgtb UCHIGH_reg, reg2, %xmm10;			\
-	vpandn	%xmm7, %xmm8, %xmm8;					\
-	vpandn	%xmm9, %xmm10, %xmm10;					\
-	vpand	LCQWORD_reg, %xmm8, %xmm8;				\
-	vpand	LCQWORD_reg, %xmm10, %xmm10;				\
-	vpor	reg1, %xmm8, reg1;					\
-	vpor	reg2, %xmm10, reg2
+	vpaddb	LCASE_MIN_reg, reg1, %xmm7;					\
+	vpaddb	LCASE_MIN_reg, reg2, %xmm8;					\
+	vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;					\
+	vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;					\
+	vpandn	CASE_ADD_reg, %xmm7, %xmm7;					\
+	vpandn	CASE_ADD_reg, %xmm8, %xmm8;					\
+	vpaddb	%xmm7, reg1, reg1;					\
+	vpaddb	%xmm8, reg2, reg2
 # else
 #  define TOLOWER(reg1, reg2) \
-	movdqa	reg1, %xmm7;					\
-	movdqa	UCHIGH_reg, %xmm8;				\
-	movdqa	reg2, %xmm9;					\
-	movdqa	UCHIGH_reg, %xmm10;				\
-	pcmpgtb	UCLOW_reg, %xmm7;				\
-	pcmpgtb	reg1, %xmm8;					\
-	pcmpgtb	UCLOW_reg, %xmm9;				\
-	pcmpgtb	reg2, %xmm10;					\
-	pand	%xmm8, %xmm7;					\
-	pand	%xmm10, %xmm9;					\
-	pand	LCQWORD_reg, %xmm7;				\
-	pand	LCQWORD_reg, %xmm9;				\
-	por	%xmm7, reg1;					\
-	por	%xmm9, reg2
+	movdqa	LCASE_MIN_reg, %xmm7;					\
+	movdqa	LCASE_MIN_reg, %xmm8;					\
+	paddb	reg1, %xmm7;					\
+	paddb	reg2, %xmm8;					\
+	pcmpgtb	LCASE_MAX_reg, %xmm7;				\
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
+	pandn	CASE_ADD_reg, %xmm7;					\
+	pandn	CASE_ADD_reg, %xmm8;					\
+	paddb	%xmm7, reg1;					\
+	paddb	%xmm8, reg2
 # endif
 	TOLOWER (%xmm1, %xmm2)
 #else