[v1,18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
Checks
Context | Check | Description
dj/TryBot-apply_patch | success | Patch applied to master at the time it was sent
Commit Message
Slightly faster method of doing TOLOWER that saves an
instruction.
Also replace the hard coded 5-byte nop with .p2align 4. On builds with
CET enabled this misaligned the entry to strcasecmp.
geometric_mean(N=40) of all benchmarks New / Original: .920
All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, New Time / Old Time
1, 1, 1, 127, 0.914
2, 2, 2, 127, 0.952
3, 3, 3, 127, 0.924
4, 4, 4, 127, 0.995
5, 5, 5, 127, 0.985
6, 6, 6, 127, 1.017
7, 7, 7, 127, 1.031
8, 0, 0, 127, 0.967
9, 1, 1, 127, 0.969
10, 2, 2, 127, 0.951
11, 3, 3, 127, 0.938
12, 4, 4, 127, 0.937
13, 5, 5, 127, 0.967
14, 6, 6, 127, 0.941
15, 7, 7, 127, 0.951
4, 0, 0, 127, 0.959
4, 0, 0, 254, 0.98
8, 0, 0, 254, 0.959
16, 0, 0, 127, 0.895
16, 0, 0, 254, 0.901
32, 0, 0, 127, 0.85
32, 0, 0, 254, 0.851
64, 0, 0, 127, 0.897
64, 0, 0, 254, 0.895
128, 0, 0, 127, 0.944
128, 0, 0, 254, 0.935
256, 0, 0, 127, 0.922
256, 0, 0, 254, 0.913
512, 0, 0, 127, 0.921
512, 0, 0, 254, 0.914
1024, 0, 0, 127, 0.845
1024, 0, 0, 254, 0.84
16, 1, 2, 127, 0.923
16, 2, 1, 254, 0.955
32, 2, 4, 127, 0.979
32, 4, 2, 254, 0.957
64, 3, 6, 127, 0.866
64, 6, 3, 254, 0.849
128, 4, 0, 127, 0.882
128, 0, 4, 254, 0.876
256, 5, 2, 127, 0.877
256, 2, 5, 254, 0.882
512, 6, 4, 127, 0.822
512, 4, 6, 254, 0.862
1024, 7, 6, 127, 0.903
1024, 6, 7, 254, 0.908
sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
1 file changed, 35 insertions(+), 48 deletions(-)
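For context on the TOLOWER change in the diff at the bottom of this page: the old sequence built an uppercase mask from two range compares and then OR-ed in the case bit, while the new sequence biases each byte, does a single signed compare against the top of the shifted uppercase range, and adds 0x20 only in the lanes that compare selected. Below is a minimal C/SSE2-intrinsics sketch of that per-vector logic, written for illustration only; the function name and the small harness are invented here, and the actual patch keeps the three constants resident in %xmm4-%xmm6 and lowercases two input registers per TOLOWER invocation.

/* Illustration only: a self-contained C/SSE2 sketch of the new TOLOWER
   step.  The names below are invented for this example; the patch keeps
   lcase_min/lcase_max/case_add in %xmm4-%xmm6 and processes two string
   vectors per use of the macro.  */
#include <emmintrin.h>
#include <stdio.h>
#include <string.h>

static __m128i
tolower_16 (__m128i v)
{
  const __m128i lcase_min = _mm_set1_epi8 (0x3f);
  const __m128i lcase_max = _mm_set1_epi8 ((char) 0x99);
  const __m128i case_add  = _mm_set1_epi8 (0x20);

  /* paddb: 'A'..'Z' (0x41..0x5a) become 0x80..0x99, the bottom of the
     signed byte range; every other byte value lands outside it.  */
  __m128i t = _mm_add_epi8 (v, lcase_min);
  /* pcmpgtb (signed): 0xff wherever the biased byte exceeds 0x99 (-103),
     i.e. for every non-uppercase byte; 0x00 for 'A'..'Z'.  */
  t = _mm_cmpgt_epi8 (t, lcase_max);
  /* pandn: (~mask) & 0x20 keeps the case bit only in uppercase lanes.  */
  t = _mm_andnot_si128 (t, case_add);
  /* paddb: add 0x20 to the uppercase bytes, leave the rest untouched.  */
  return _mm_add_epi8 (v, t);
}

int
main (void)
{
  char in[16], out[17] = { 0 };
  memcpy (in, "Hello, WORLD! @[", 16);
  _mm_storeu_si128 ((__m128i *) out,
                    tolower_16 (_mm_loadu_si128 ((const __m128i *) in)));
  puts (out);   /* hello, world! @[  ('@' and '[' stay untouched) */
  return 0;
}

The saving comes from collapsing the old lower-bound compare and final OR into the bias add and the closing add: per input vector the diff goes from five AVX operations to four, and from seven SSE operations (including a register copy) to five.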
Comments
On Wed, Mar 23, 2022 at 3:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Slightly faster method of doing TOLOWER that saves an
> instruction.
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
On Thu, Mar 24, 2022 at 12:05 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index 580feb90e9..7805ae9d41 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp))
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RDX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+ /* Either 1 or 5 bytes (dependeing if CET is enabled). */
+ .p2align 4
END (GLABEL(__strcasecmp))
/* FALLTHROUGH to strcasecmp_l. */
#endif
@@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp))
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RCX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+ /* Either 1 or 5 bytes (dependeing if CET is enabled). */
+ .p2align 4
END (GLABEL(__strncasecmp))
/* FALLTHROUGH to strncasecmp_l. */
#endif
@@ -169,27 +167,22 @@ STRCMP_SSE42:
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
.section .rodata.cst16,"aM",@progbits,16
.align 16
-LABEL(belowupper):
- .quad 0x4040404040404040
- .quad 0x4040404040404040
-LABEL(topupper):
-# ifdef USE_AVX
- .quad 0x5a5a5a5a5a5a5a5a
- .quad 0x5a5a5a5a5a5a5a5a
-# else
- .quad 0x5b5b5b5b5b5b5b5b
- .quad 0x5b5b5b5b5b5b5b5b
-# endif
-LABEL(touppermask):
+LABEL(lcase_min):
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+LABEL(lcase_max):
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+LABEL(case_add):
.quad 0x2020202020202020
.quad 0x2020202020202020
.previous
- movdqa LABEL(belowupper)(%rip), %xmm4
-# define UCLOW_reg %xmm4
- movdqa LABEL(topupper)(%rip), %xmm5
-# define UCHIGH_reg %xmm5
- movdqa LABEL(touppermask)(%rip), %xmm6
-# define LCQWORD_reg %xmm6
+ movdqa LABEL(lcase_min)(%rip), %xmm4
+# define LCASE_MIN_reg %xmm4
+ movdqa LABEL(lcase_max)(%rip), %xmm5
+# define LCASE_MAX_reg %xmm5
+ movdqa LABEL(case_add)(%rip), %xmm6
+# define CASE_ADD_reg %xmm6
#endif
cmp $0x30, %ecx
ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
@@ -200,32 +193,26 @@ LABEL(touppermask):
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
# ifdef USE_AVX
# define TOLOWER(reg1, reg2) \
- vpcmpgtb UCLOW_reg, reg1, %xmm7; \
- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
- vpcmpgtb UCLOW_reg, reg2, %xmm9; \
- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
- vpandn %xmm7, %xmm8, %xmm8; \
- vpandn %xmm9, %xmm10, %xmm10; \
- vpand LCQWORD_reg, %xmm8, %xmm8; \
- vpand LCQWORD_reg, %xmm10, %xmm10; \
- vpor reg1, %xmm8, reg1; \
- vpor reg2, %xmm10, reg2
+ vpaddb LCASE_MIN_reg, reg1, %xmm7; \
+ vpaddb LCASE_MIN_reg, reg2, %xmm8; \
+ vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \
+ vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \
+ vpandn CASE_ADD_reg, %xmm7, %xmm7; \
+ vpandn CASE_ADD_reg, %xmm8, %xmm8; \
+ vpaddb %xmm7, reg1, reg1; \
+ vpaddb %xmm8, reg2, reg2
# else
# define TOLOWER(reg1, reg2) \
- movdqa reg1, %xmm7; \
- movdqa UCHIGH_reg, %xmm8; \
- movdqa reg2, %xmm9; \
- movdqa UCHIGH_reg, %xmm10; \
- pcmpgtb UCLOW_reg, %xmm7; \
- pcmpgtb reg1, %xmm8; \
- pcmpgtb UCLOW_reg, %xmm9; \
- pcmpgtb reg2, %xmm10; \
- pand %xmm8, %xmm7; \
- pand %xmm10, %xmm9; \
- pand LCQWORD_reg, %xmm7; \
- pand LCQWORD_reg, %xmm9; \
- por %xmm7, reg1; \
- por %xmm9, reg2
+ movdqa LCASE_MIN_reg, %xmm7; \
+ movdqa LCASE_MIN_reg, %xmm8; \
+ paddb reg1, %xmm7; \
+ paddb reg2, %xmm8; \
+ pcmpgtb LCASE_MAX_reg, %xmm7; \
+ pcmpgtb LCASE_MAX_reg, %xmm8; \
+ pandn CASE_ADD_reg, %xmm7; \
+ pandn CASE_ADD_reg, %xmm8; \
+ paddb %xmm7, reg1; \
+ paddb %xmm8, reg2
# endif
TOLOWER (%xmm1, %xmm2)
#else
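As a sanity check on the three constants introduced above (0x3f..., 0x99..., 0x20...), the per-byte rule can be modelled in plain C and compared against ASCII tolower over every byte value. This harness is invented for illustration and is not part of the patch or of glibc; the signed casts mirror the two's-complement wrap of paddb and the signed compare of pcmpgtb.

/* Illustration only: scalar model of the per-byte rule behind the
   lcase_min/lcase_max/case_add constants; not part of the patch.  */
#include <assert.h>
#include <stdint.h>

static uint8_t
model_tolower (uint8_t c)
{
  /* paddb with 0x3f: 'A'..'Z' map to 0x80..0x99 (wraps like paddb).  */
  int8_t biased = (int8_t) (uint8_t) (c + 0x3f);
  /* pcmpgtb with 0x99 (-103 signed): true for every non-uppercase byte.  */
  int not_upper = biased > (int8_t) 0x99;
  /* pandn with 0x20: keep the case bit only for uppercase bytes.  */
  uint8_t add = not_upper ? 0 : 0x20;
  /* Final paddb.  */
  return (uint8_t) (c + add);
}

int
main (void)
{
  for (unsigned int c = 0; c < 256; c++)
    {
      uint8_t expected = (c >= 'A' && c <= 'Z') ? (uint8_t) (c + 0x20)
                                                : (uint8_t) c;
      assert (model_tolower ((uint8_t) c) == expected);
    }
  return 0;
}

Built with, say, gcc -O2 and run, the loop passes for all 256 byte values: only 'A'..'Z' pick up the 0x20 bias, which is what the branchless vector sequence relies on.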