[v1,4/6] x86_64: Add sse2 optimized __memcmpeq in memcmp-sse2.S

Message ID 20211027024323.1199441-4-goldstein.w.n@gmail.com
State Committed
Commit fa7f63d8d6a081d59dadcb9986efaafb8093735d
Series [v1,1/6] String: Add __memcmpeq as build target

Checks

Context                 Check     Description
dj/TryBot-apply_patch   success   Patch applied to master at the time it was sent

Commit Message

Noah Goldstein Oct. 27, 2021, 2:43 a.m. UTC
  No bug. This commit does not modify any of the existing memcmp
implementation. It just adds __memcmpeq ifdefs to skip obvious cases
where computing the exact 1/-1 return value required by memcmp is not
needed.
---
 sysdeps/x86_64/memcmp.S | 55 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)
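
The shortcut the ifdefs rely on is that __memcmpeq only has to report
whether the buffers are equal; unlike memcmp it never has to order them,
so locating the first differing byte and producing a signed result can
be skipped. A minimal C sketch of the two contracts (illustrative only,
hypothetical helper names, not glibc code):

#include <stddef.h>

/* memcmp must say which buffer compares "bigger": on a mismatch it has
   to find the first differing byte and return its signed difference.  */
int
memcmp_sketch (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  for (size_t i = 0; i < n; i++)
    if (s1[i] != s2[i])
      return s1[i] < s2[i] ? -1 : 1;
  return 0;
}

/* __memcmpeq only has to return zero when the buffers are equal and any
   nonzero value otherwise, so accumulating the XOR of the bytes is
   enough -- no byte position or sign is ever computed.  */
int
memcmpeq_sketch (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  int diff = 0;
  for (size_t i = 0; i < n; i++)
    diff |= s1[i] ^ s2[i];
  return diff;
}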
  

Comments

H.J. Lu Oct. 27, 2021, 12:48 p.m. UTC | #1
On Tue, Oct 26, 2021 at 7:43 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug. This commit does not modify any of the existing memcmp
> implementation. It just adds __memcmpeq ifdefs to skip obvious cases
> where computing the exact 1/-1 return value required by memcmp is not
> needed.
> ---
>  sysdeps/x86_64/memcmp.S | 55 ++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 51 insertions(+), 4 deletions(-)
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  

Patch

diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
index b53f2c0866..c245383963 100644
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -49,34 +49,63 @@  L(s2b):
 	movzwl	(%rdi),	%eax
 	movzwl	(%rdi, %rsi), %edx
 	subq    $2, %r10
+#ifdef USE_AS_MEMCMPEQ
+	je	L(finz1)
+#else
 	je	L(fin2_7)
+#endif
 	addq	$2, %rdi
 	cmpl	%edx, %eax
+#ifdef USE_AS_MEMCMPEQ
+	jnz	L(neq_early)
+#else
 	jnz	L(fin2_7)
+#endif
 L(s4b):
 	testq	$4, %r10
 	jz	L(s8b)
 	movl	(%rdi),	%eax
 	movl	(%rdi, %rsi), %edx
 	subq    $4, %r10
+#ifdef USE_AS_MEMCMPEQ
+	je	L(finz1)
+#else
 	je	L(fin2_7)
+#endif
 	addq	$4, %rdi
 	cmpl	%edx, %eax
+#ifdef USE_AS_MEMCMPEQ
+	jnz	L(neq_early)
+#else
 	jnz	L(fin2_7)
+#endif
 L(s8b):
 	testq	$8, %r10
 	jz	L(s16b)
 	movq	(%rdi),	%rax
 	movq	(%rdi, %rsi), %rdx
 	subq    $8, %r10
+#ifdef USE_AS_MEMCMPEQ
+	je	L(sub_return8)
+#else
 	je	L(fin2_7)
+#endif
 	addq	$8, %rdi
 	cmpq	%rdx, %rax
+#ifdef USE_AS_MEMCMPEQ
+	jnz	L(neq_early)
+#else
 	jnz	L(fin2_7)
+#endif
 L(s16b):
 	movdqu    (%rdi), %xmm1
 	movdqu    (%rdi, %rsi), %xmm0
 	pcmpeqb   %xmm0, %xmm1
+#ifdef USE_AS_MEMCMPEQ
+	pmovmskb  %xmm1, %eax
+	subl      $0xffff, %eax
+	ret
+#else
 	pmovmskb  %xmm1, %edx
 	xorl	  %eax, %eax
 	subl      $0xffff, %edx
@@ -86,7 +115,7 @@  L(s16b):
 	movzbl	 (%rcx), %eax
 	movzbl	 (%rsi, %rcx), %edx
 	jmp	 L(finz1)
-
+#endif
 	.p2align 4,, 4
 L(finr1b):
 	movzbl	(%rdi), %eax
@@ -95,7 +124,15 @@  L(finz1):
 	subl	%edx, %eax
 L(exit):
 	ret
-
+#ifdef USE_AS_MEMCMPEQ
+	.p2align 4,, 4
+L(sub_return8):
+	subq	%rdx, %rax
+	movl	%eax, %edx
+	shrq	$32, %rax
+	orl	%edx, %eax
+	ret
+#else
 	.p2align 4,, 4
 L(fin2_7):
 	cmpq	%rdx, %rax
@@ -111,12 +148,17 @@  L(fin2_7):
 	movzbl  %dl, %edx
 	subl	%edx, %eax
 	ret
-
+#endif
 	.p2align 4,, 4
 L(finz):
 	xorl	%eax, %eax
 	ret
-
+#ifdef USE_AS_MEMCMPEQ
+	.p2align 4,, 4
+L(neq_early):
+	movl	$1, %eax
+	ret
+#endif
 	/* For blocks bigger than 32 bytes
 	   1. Advance one of the addr pointer to be 16B aligned.
 	   2. Treat the case of both addr pointers aligned to 16B
@@ -246,11 +288,16 @@  L(mt16):
 
 	.p2align 4,, 4
 L(neq):
+#ifdef USE_AS_MEMCMPEQ
+	movl	$1, %eax
+	ret
+#else
 	bsfl      %edx, %ecx
 	movzbl	 (%rdi, %rcx), %eax
 	addq	 %rdi, %rsi
 	movzbl	 (%rsi,%rcx), %edx
 	jmp	 L(finz1)
+#endif
 
 	.p2align 4,, 4
 L(ATR):
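
The two __memcmpeq-only return paths added above boil down to simple
zero/nonzero reductions: L(s16b) subtracts 0xffff from the
pcmpeqb/pmovmskb byte mask, which is zero exactly when all 16 bytes
match, and L(sub_return8) folds the 64-bit difference of two qwords
into 32 bits with a shift and an OR. Roughly, as a hedged C sketch with
hypothetical helper names (not glibc code):

#include <emmintrin.h>   /* SSE2: _mm_cmpeq_epi8, _mm_movemask_epi8 */
#include <stdint.h>

/* L(s16b) path: compare 16 bytes, return zero iff all of them match.  */
static int
memcmpeq16_sketch (const void *s1, const void *s2)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
  /* pcmpeqb + pmovmskb: 16-bit mask with a 1 for every matching byte.  */
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b));
  /* mask == 0xffff iff the 16 bytes are equal, so this is zero on
     equality and nonzero otherwise.  */
  return mask - 0xffff;
}

/* L(sub_return8) path: reduce the 64-bit difference of two qwords to a
   32-bit result that is zero iff they are equal.  */
static int
memcmpeq8_sketch (uint64_t a, uint64_t b)
{
  uint64_t diff = a - b;                      /* zero iff a == b */
  /* OR the high half into the low half so no set bit is lost.  */
  return (int) (uint32_t) (diff | (diff >> 32));
}

Either value is acceptable because __memcmpeq only specifies zero
versus nonzero, which is exactly why the exact 1/-1 computation can be
skipped on these paths.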