[v1,17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
Commit Message
Slightly faster method of doing TOLOWER that saves an
instruction.
Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with
CET enabled, the fixed-size NOP misaligned the entry to strcasecmp.
geometric_mean(N=40) of all benchmarks New / Original: .894
All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, New Time / Old Time
1, 1, 1, 127, 0.903
2, 2, 2, 127, 0.905
3, 3, 3, 127, 0.877
4, 4, 4, 127, 0.888
5, 5, 5, 127, 0.901
6, 6, 6, 127, 0.954
7, 7, 7, 127, 0.932
8, 0, 0, 127, 0.918
9, 1, 1, 127, 0.914
10, 2, 2, 127, 0.877
11, 3, 3, 127, 0.909
12, 4, 4, 127, 0.876
13, 5, 5, 127, 0.886
14, 6, 6, 127, 0.914
15, 7, 7, 127, 0.939
4, 0, 0, 127, 0.963
4, 0, 0, 254, 0.943
8, 0, 0, 254, 0.927
16, 0, 0, 127, 0.876
16, 0, 0, 254, 0.865
32, 0, 0, 127, 0.865
32, 0, 0, 254, 0.862
64, 0, 0, 127, 0.863
64, 0, 0, 254, 0.896
128, 0, 0, 127, 0.885
128, 0, 0, 254, 0.882
256, 0, 0, 127, 0.87
256, 0, 0, 254, 0.869
512, 0, 0, 127, 0.832
512, 0, 0, 254, 0.848
1024, 0, 0, 127, 0.835
1024, 0, 0, 254, 0.843
16, 1, 2, 127, 0.914
16, 2, 1, 254, 0.949
32, 2, 4, 127, 0.955
32, 4, 2, 254, 1.004
64, 3, 6, 127, 0.844
64, 6, 3, 254, 0.905
128, 4, 0, 127, 0.889
128, 0, 4, 254, 0.845
256, 5, 2, 127, 0.929
256, 2, 5, 254, 0.907
512, 6, 4, 127, 0.837
512, 4, 6, 254, 0.862
1024, 7, 6, 127, 0.895
1024, 6, 7, 254, 0.89
sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
1 file changed, 29 insertions(+), 35 deletions(-)
Comments
On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Slightly faster method of doing TOLOWER that saves an
> instruction.
>
> Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with
> CET enabled, the fixed-size NOP misaligned the entry to strcasecmp.
>
> geometric_mean(N=40) of all benchmarks New / Original: .894
>
> All string/memory tests pass.
> ---
> Geometric Mean N=40 runs; All functions page aligned
> length, align1, align2, max_char, New Time / Old Time
> 1, 1, 1, 127, 0.903
> 2, 2, 2, 127, 0.905
> 3, 3, 3, 127, 0.877
> 4, 4, 4, 127, 0.888
> 5, 5, 5, 127, 0.901
> 6, 6, 6, 127, 0.954
> 7, 7, 7, 127, 0.932
> 8, 0, 0, 127, 0.918
> 9, 1, 1, 127, 0.914
> 10, 2, 2, 127, 0.877
> 11, 3, 3, 127, 0.909
> 12, 4, 4, 127, 0.876
> 13, 5, 5, 127, 0.886
> 14, 6, 6, 127, 0.914
> 15, 7, 7, 127, 0.939
> 4, 0, 0, 127, 0.963
> 4, 0, 0, 254, 0.943
> 8, 0, 0, 254, 0.927
> 16, 0, 0, 127, 0.876
> 16, 0, 0, 254, 0.865
> 32, 0, 0, 127, 0.865
> 32, 0, 0, 254, 0.862
> 64, 0, 0, 127, 0.863
> 64, 0, 0, 254, 0.896
> 128, 0, 0, 127, 0.885
> 128, 0, 0, 254, 0.882
> 256, 0, 0, 127, 0.87
> 256, 0, 0, 254, 0.869
> 512, 0, 0, 127, 0.832
> 512, 0, 0, 254, 0.848
> 1024, 0, 0, 127, 0.835
> 1024, 0, 0, 254, 0.843
> 16, 1, 2, 127, 0.914
> 16, 2, 1, 254, 0.949
> 32, 2, 4, 127, 0.955
> 32, 4, 2, 254, 1.004
> 64, 3, 6, 127, 0.844
> 64, 6, 3, 254, 0.905
> 128, 4, 0, 127, 0.889
> 128, 0, 4, 254, 0.845
> 256, 5, 2, 127, 0.929
> 256, 2, 5, 254, 0.907
> 512, 6, 4, 127, 0.837
> 512, 4, 6, 254, 0.862
> 1024, 7, 6, 127, 0.895
> 1024, 6, 7, 254, 0.89
>
> sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
> 1 file changed, 29 insertions(+), 35 deletions(-)
>
> diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> index e2ab59c555..99d8b36f1d 100644
> --- a/sysdeps/x86_64/strcmp.S
> +++ b/sysdeps/x86_64/strcmp.S
> @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
> movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> mov %fs:(%rax),%RDX_LP
>
> - // XXX 5 byte should be before the function
> - /* 5-byte NOP. */
> - .byte 0x0f,0x1f,0x44,0x00,0x00
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> END2 (__strcasecmp)
> # ifndef NO_NOLOCALE_ALIAS
> weak_alias (__strcasecmp, strcasecmp)
> @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
> movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> mov %fs:(%rax),%RCX_LP
>
> - // XXX 5 byte should be before the function
> - /* 5-byte NOP. */
> - .byte 0x0f,0x1f,0x44,0x00,0x00
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> END2 (__strncasecmp)
> # ifndef NO_NOLOCALE_ALIAS
> weak_alias (__strncasecmp, strncasecmp)
> @@ -146,22 +144,22 @@ ENTRY (STRCMP)
> #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> .section .rodata.cst16,"aM",@progbits,16
> .align 16
> -.Lbelowupper:
> - .quad 0x4040404040404040
> - .quad 0x4040404040404040
> -.Ltopupper:
> - .quad 0x5b5b5b5b5b5b5b5b
> - .quad 0x5b5b5b5b5b5b5b5b
> -.Ltouppermask:
> +.Llcase_min:
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> +.Llcase_max:
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> +.Lcase_add:
> .quad 0x2020202020202020
> .quad 0x2020202020202020
> .previous
> - movdqa .Lbelowupper(%rip), %xmm5
> -# define UCLOW_reg %xmm5
> - movdqa .Ltopupper(%rip), %xmm6
> -# define UCHIGH_reg %xmm6
> - movdqa .Ltouppermask(%rip), %xmm7
> -# define LCQWORD_reg %xmm7
> + movdqa .Llcase_min(%rip), %xmm5
> +# define LCASE_MIN_reg %xmm5
> + movdqa .Llcase_max(%rip), %xmm6
> +# define LCASE_MAX_reg %xmm6
> + movdqa .Lcase_add(%rip), %xmm7
> +# define CASE_ADD_reg %xmm7
> #endif
> cmp $0x30, %ecx
> ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
> @@ -172,22 +170,18 @@ ENTRY (STRCMP)
> movhpd 8(%rdi), %xmm1
> movhpd 8(%rsi), %xmm2
> #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# define TOLOWER(reg1, reg2) \
> - movdqa reg1, %xmm8; \
> - movdqa UCHIGH_reg, %xmm9; \
> - movdqa reg2, %xmm10; \
> - movdqa UCHIGH_reg, %xmm11; \
> - pcmpgtb UCLOW_reg, %xmm8; \
> - pcmpgtb reg1, %xmm9; \
> - pcmpgtb UCLOW_reg, %xmm10; \
> - pcmpgtb reg2, %xmm11; \
> - pand %xmm9, %xmm8; \
> - pand %xmm11, %xmm10; \
> - pand LCQWORD_reg, %xmm8; \
> - pand LCQWORD_reg, %xmm10; \
> - por %xmm8, reg1; \
> - por %xmm10, reg2
> - TOLOWER (%xmm1, %xmm2)
> +# define TOLOWER(reg1, reg2) \
> + movdqa LCASE_MIN_reg, %xmm8; \
> + movdqa LCASE_MIN_reg, %xmm9; \
> + paddb reg1, %xmm8; \
> + paddb reg2, %xmm9; \
> + pcmpgtb LCASE_MAX_reg, %xmm8; \
> + pcmpgtb LCASE_MAX_reg, %xmm9; \
> + pandn CASE_ADD_reg, %xmm8; \
> + pandn CASE_ADD_reg, %xmm9; \
> + paddb %xmm8, reg1; \
> + paddb %xmm9, reg2
> + TOLOWER (%xmm1, %xmm2)
> #else
> # define TOLOWER(reg1, reg2)
> #endif
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
On Thu, Mar 24, 2022 at 12:05 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Slightly faster method of doing TOLOWER that saves an
> > instruction.
> >
> > Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with
> > CET enabled, the fixed-size NOP misaligned the entry to strcasecmp.
> >
> > geometric_mean(N=40) of all benchmarks New / Original: .894
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=40 runs; All functions page aligned
> > length, align1, align2, max_char, New Time / Old Time
> > 1, 1, 1, 127, 0.903
> > 2, 2, 2, 127, 0.905
> > 3, 3, 3, 127, 0.877
> > 4, 4, 4, 127, 0.888
> > 5, 5, 5, 127, 0.901
> > 6, 6, 6, 127, 0.954
> > 7, 7, 7, 127, 0.932
> > 8, 0, 0, 127, 0.918
> > 9, 1, 1, 127, 0.914
> > 10, 2, 2, 127, 0.877
> > 11, 3, 3, 127, 0.909
> > 12, 4, 4, 127, 0.876
> > 13, 5, 5, 127, 0.886
> > 14, 6, 6, 127, 0.914
> > 15, 7, 7, 127, 0.939
> > 4, 0, 0, 127, 0.963
> > 4, 0, 0, 254, 0.943
> > 8, 0, 0, 254, 0.927
> > 16, 0, 0, 127, 0.876
> > 16, 0, 0, 254, 0.865
> > 32, 0, 0, 127, 0.865
> > 32, 0, 0, 254, 0.862
> > 64, 0, 0, 127, 0.863
> > 64, 0, 0, 254, 0.896
> > 128, 0, 0, 127, 0.885
> > 128, 0, 0, 254, 0.882
> > 256, 0, 0, 127, 0.87
> > 256, 0, 0, 254, 0.869
> > 512, 0, 0, 127, 0.832
> > 512, 0, 0, 254, 0.848
> > 1024, 0, 0, 127, 0.835
> > 1024, 0, 0, 254, 0.843
> > 16, 1, 2, 127, 0.914
> > 16, 2, 1, 254, 0.949
> > 32, 2, 4, 127, 0.955
> > 32, 4, 2, 254, 1.004
> > 64, 3, 6, 127, 0.844
> > 64, 6, 3, 254, 0.905
> > 128, 4, 0, 127, 0.889
> > 128, 0, 4, 254, 0.845
> > 256, 5, 2, 127, 0.929
> > 256, 2, 5, 254, 0.907
> > 512, 6, 4, 127, 0.837
> > 512, 4, 6, 254, 0.862
> > 1024, 7, 6, 127, 0.895
> > 1024, 6, 7, 254, 0.89
> >
> > sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++----------------------
> > 1 file changed, 29 insertions(+), 35 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> > index e2ab59c555..99d8b36f1d 100644
> > --- a/sysdeps/x86_64/strcmp.S
> > +++ b/sysdeps/x86_64/strcmp.S
> > @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
> > movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> > mov %fs:(%rax),%RDX_LP
> >
> > - // XXX 5 byte should be before the function
> > - /* 5-byte NOP. */
> > - .byte 0x0f,0x1f,0x44,0x00,0x00
> > + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> > + .p2align 4
> > END2 (__strcasecmp)
> > # ifndef NO_NOLOCALE_ALIAS
> > weak_alias (__strcasecmp, strcasecmp)
> > @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
> > movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> > mov %fs:(%rax),%RCX_LP
> >
> > - // XXX 5 byte should be before the function
> > - /* 5-byte NOP. */
> > - .byte 0x0f,0x1f,0x44,0x00,0x00
> > + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> > + .p2align 4
> > END2 (__strncasecmp)
> > # ifndef NO_NOLOCALE_ALIAS
> > weak_alias (__strncasecmp, strncasecmp)
> > @@ -146,22 +144,22 @@ ENTRY (STRCMP)
> > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> > .section .rodata.cst16,"aM",@progbits,16
> > .align 16
> > -.Lbelowupper:
> > - .quad 0x4040404040404040
> > - .quad 0x4040404040404040
> > -.Ltopupper:
> > - .quad 0x5b5b5b5b5b5b5b5b
> > - .quad 0x5b5b5b5b5b5b5b5b
> > -.Ltouppermask:
> > +.Llcase_min:
> > + .quad 0x3f3f3f3f3f3f3f3f
> > + .quad 0x3f3f3f3f3f3f3f3f
> > +.Llcase_max:
> > + .quad 0x9999999999999999
> > + .quad 0x9999999999999999
> > +.Lcase_add:
> > .quad 0x2020202020202020
> > .quad 0x2020202020202020
> > .previous
> > - movdqa .Lbelowupper(%rip), %xmm5
> > -# define UCLOW_reg %xmm5
> > - movdqa .Ltopupper(%rip), %xmm6
> > -# define UCHIGH_reg %xmm6
> > - movdqa .Ltouppermask(%rip), %xmm7
> > -# define LCQWORD_reg %xmm7
> > + movdqa .Llcase_min(%rip), %xmm5
> > +# define LCASE_MIN_reg %xmm5
> > + movdqa .Llcase_max(%rip), %xmm6
> > +# define LCASE_MAX_reg %xmm6
> > + movdqa .Lcase_add(%rip), %xmm7
> > +# define CASE_ADD_reg %xmm7
> > #endif
> > cmp $0x30, %ecx
> > ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
> > @@ -172,22 +170,18 @@ ENTRY (STRCMP)
> > movhpd 8(%rdi), %xmm1
> > movhpd 8(%rsi), %xmm2
> > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> > -# define TOLOWER(reg1, reg2) \
> > - movdqa reg1, %xmm8; \
> > - movdqa UCHIGH_reg, %xmm9; \
> > - movdqa reg2, %xmm10; \
> > - movdqa UCHIGH_reg, %xmm11; \
> > - pcmpgtb UCLOW_reg, %xmm8; \
> > - pcmpgtb reg1, %xmm9; \
> > - pcmpgtb UCLOW_reg, %xmm10; \
> > - pcmpgtb reg2, %xmm11; \
> > - pand %xmm9, %xmm8; \
> > - pand %xmm11, %xmm10; \
> > - pand LCQWORD_reg, %xmm8; \
> > - pand LCQWORD_reg, %xmm10; \
> > - por %xmm8, reg1; \
> > - por %xmm10, reg2
> > - TOLOWER (%xmm1, %xmm2)
> > +# define TOLOWER(reg1, reg2) \
> > + movdqa LCASE_MIN_reg, %xmm8; \
> > + movdqa LCASE_MIN_reg, %xmm9; \
> > + paddb reg1, %xmm8; \
> > + paddb reg2, %xmm9; \
> > + pcmpgtb LCASE_MAX_reg, %xmm8; \
> > + pcmpgtb LCASE_MAX_reg, %xmm9; \
> > + pandn CASE_ADD_reg, %xmm8; \
> > + pandn CASE_ADD_reg, %xmm9; \
> > + paddb %xmm8, reg1; \
> > + paddb %xmm9, reg2
> > + TOLOWER (%xmm1, %xmm2)
> > #else
> > # define TOLOWER(reg1, reg2)
> > #endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
@@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp)
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RDX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
END2 (__strcasecmp)
# ifndef NO_NOLOCALE_ALIAS
weak_alias (__strcasecmp, strcasecmp)
@@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp)
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
mov %fs:(%rax),%RCX_LP
- // XXX 5 byte should be before the function
- /* 5-byte NOP. */
- .byte 0x0f,0x1f,0x44,0x00,0x00
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
END2 (__strncasecmp)
# ifndef NO_NOLOCALE_ALIAS
weak_alias (__strncasecmp, strncasecmp)
@@ -146,22 +144,22 @@ ENTRY (STRCMP)
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
.section .rodata.cst16,"aM",@progbits,16
.align 16
-.Lbelowupper:
- .quad 0x4040404040404040
- .quad 0x4040404040404040
-.Ltopupper:
- .quad 0x5b5b5b5b5b5b5b5b
- .quad 0x5b5b5b5b5b5b5b5b
-.Ltouppermask:
+.Llcase_min:
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+.Llcase_max:
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+.Lcase_add:
.quad 0x2020202020202020
.quad 0x2020202020202020
.previous
- movdqa .Lbelowupper(%rip), %xmm5
-# define UCLOW_reg %xmm5
- movdqa .Ltopupper(%rip), %xmm6
-# define UCHIGH_reg %xmm6
- movdqa .Ltouppermask(%rip), %xmm7
-# define LCQWORD_reg %xmm7
+ movdqa .Llcase_min(%rip), %xmm5
+# define LCASE_MIN_reg %xmm5
+ movdqa .Llcase_max(%rip), %xmm6
+# define LCASE_MAX_reg %xmm6
+ movdqa .Lcase_add(%rip), %xmm7
+# define CASE_ADD_reg %xmm7
#endif
cmp $0x30, %ecx
ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
@@ -172,22 +170,18 @@ ENTRY (STRCMP)
movhpd 8(%rdi), %xmm1
movhpd 8(%rsi), %xmm2
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define TOLOWER(reg1, reg2) \
- movdqa reg1, %xmm8; \
- movdqa UCHIGH_reg, %xmm9; \
- movdqa reg2, %xmm10; \
- movdqa UCHIGH_reg, %xmm11; \
- pcmpgtb UCLOW_reg, %xmm8; \
- pcmpgtb reg1, %xmm9; \
- pcmpgtb UCLOW_reg, %xmm10; \
- pcmpgtb reg2, %xmm11; \
- pand %xmm9, %xmm8; \
- pand %xmm11, %xmm10; \
- pand LCQWORD_reg, %xmm8; \
- pand LCQWORD_reg, %xmm10; \
- por %xmm8, reg1; \
- por %xmm10, reg2
- TOLOWER (%xmm1, %xmm2)
+# define TOLOWER(reg1, reg2) \
+ movdqa LCASE_MIN_reg, %xmm8; \
+ movdqa LCASE_MIN_reg, %xmm9; \
+ paddb reg1, %xmm8; \
+ paddb reg2, %xmm9; \
+ pcmpgtb LCASE_MAX_reg, %xmm8; \
+ pcmpgtb LCASE_MAX_reg, %xmm9; \
+ pandn CASE_ADD_reg, %xmm8; \
+ pandn CASE_ADD_reg, %xmm9; \
+ paddb %xmm8, reg1; \
+ paddb %xmm9, reg2
+ TOLOWER (%xmm1, %xmm2)
#else
# define TOLOWER(reg1, reg2)
#endif