[RFC] Optimize memset with conditional moves.

Message ID 20150827162836.GA28702@domone
State New, archived

Commit Message

Ondrej Bilka Aug. 27, 2015, 4:28 p.m. UTC
Hi,

I had an idea, which I mentioned on the aarch64 thread, to improve memset
and memcpy performance by using conditional moves to avoid branch
misprediction penalties. Previously setting a single byte was very slow:
we needed a decision tree to select an implementation for each size, some
range had to take the slow path, and we picked this one as the unlikely
case.

The trick that I use could be written in C as follows to handle n less
than 16 (vc is the fill byte replicated into a 64-bit word):

char stack[64];   /* dummy target for the stores that are not needed */
uint64_t *p1 = (uint64_t *) s;
uint32_t *p2 = (uint32_t *) (s + (n & 8));
uint16_t *p3 = (uint16_t *) (s + (n & 12));
char *p4 = s + n - 1;
if ((n & 8) == 0) p1 = (uint64_t *) stack;
if ((n & 4) == 0) p2 = (uint32_t *) stack;
if ((n & 2) == 0) p3 = (uint16_t *) stack;
if ((n & 1) == 0) p4 = stack;
*p1 = vc;
*p2 = vc;
*p3 = vc;
*p4 = vc;

For the 32-128 byte range the assembly does the same thing with 16-byte
moves; here is the equivalent for the 16-64 byte range written with
8-byte moves:

int add = n > 32 ? 16 : 0;
int sub = n > 32 ? -16 : 0;
/* Four 8-byte stores from the front...  */
*((uint64_t *)(s)) = vc;
*((uint64_t *)(s + 8)) = vc;
*((uint64_t *)(s + add)) = vc;
*((uint64_t *)(s + 8 + add)) = vc;
/* ...and four mirrored stores from the back; the two halves overlap in
   the middle and cover any n in the 16-64 range.  */
*((uint64_t *)(s + n - 16)) = vc;
*((uint64_t *)(s + 8 + n - 16)) = vc;
*((uint64_t *)(s + n - 16 + sub)) = vc;
*((uint64_t *)(s + 8 + n - 16 + sub)) = vc;

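To make this concrete, here is a self-contained sketch that combines both
fragments into one function. The name small_memset, the byte broadcast by
multiplication and the use of memcpy for the individual stores are my own
choices for illustration; they are not part of the patch.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustration only: branchless memset for n <= 64.  Stores that are not
   needed for a given n are redirected to a dummy buffer, which a compiler
   can do with conditional moves instead of branches.  */
static void *
small_memset (void *dst, int c, size_t n)
{
  char sink[64];                        /* absorbs the unneeded stores */
  char *s = dst;
  uint64_t vc = 0x0101010101010101ULL * (unsigned char) c;

  if (n < 16)
    {
      char *p1 = s;                     /* 8-byte store, used when n & 8 */
      char *p2 = s + (n & 8);           /* 4-byte store, used when n & 4 */
      char *p3 = s + (n & 12);          /* 2-byte store, used when n & 2 */
      char *p4 = s + (n ? n - 1 : 0);   /* 1-byte store, used when n & 1 */
      if ((n & 8) == 0) p1 = sink;
      if ((n & 4) == 0) p2 = sink;
      if ((n & 2) == 0) p3 = sink;
      if ((n & 1) == 0) p4 = sink;
      memcpy (p1, &vc, 8);              /* memcpy avoids aliasing issues */
      memcpy (p2, &vc, 4);
      memcpy (p3, &vc, 2);
      *p4 = (char) vc;
    }
  else                                  /* 16 <= n <= 64 */
    {
      size_t add = n > 32 ? 16 : 0;
      memcpy (s, &vc, 8);                 memcpy (s + 8, &vc, 8);
      memcpy (s + add, &vc, 8);           memcpy (s + 8 + add, &vc, 8);
      memcpy (s + n - 16, &vc, 8);        memcpy (s + n - 8, &vc, 8);
      memcpy (s + n - 16 - add, &vc, 8);  memcpy (s + n - 8 - add, &vc, 8);
    }
  return dst;
}
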
This could be used generically. I believe that on arm it could give a
considerable speedup, but I would need access to a machine for testing.

This gives on average a 5% improvement over the current implementation;
see the profile results at the links below. I will resend an alternate
rep stosq implementation later.

http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_cmov.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_cmov270815.tar.bz2

As I recall, that is where I left memset last time: I didn't know at which
sizes to select the rep stosq variant, since the threshold has to be
determined empirically and differs for each architecture. The problem is
that on most architectures it is faster when the data are in L3 cache or
main memory, and I don't know at what size that usually starts to happen;
it may be at sizes much smaller than the cache size limit.

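As a starting point for that measurement, something along the following
lines could be used to locate the crossover on a given machine. This is
only a sketch: rep_memset, bench, the fixed 256 MB store budget and the
size list are my own and not part of the patch or of the benchmarks
linked above.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

/* Hypothetical rep stosb-based memset used only for the comparison.  */
static void
rep_memset (void *s, int c, size_t n)
{
  asm volatile ("rep stosb"
                : "+D" (s), "+c" (n)
                : "a" (c)
                : "memory");
}

static void
libc_memset (void *s, int c, size_t n)
{
  memset (s, c, n);
}

/* Time one implementation at size n and return nanoseconds per byte.  */
static double
bench (void (*f) (void *, int, size_t), char *buf, size_t n)
{
  size_t iters = (256u << 20) / n;      /* ~256 MB of stores per data point */
  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (size_t i = 0; i < iters; i++)
    {
      f (buf, 0, n);
      asm volatile ("" : : "r" (buf) : "memory");  /* keep the stores alive */
    }
  clock_gettime (CLOCK_MONOTONIC, &t1);
  double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
  return ns / ((double) iters * n);
}

int
main (void)
{
  static char buf[1 << 24];             /* 16 MB working set */
  for (size_t n = 256; n <= sizeof buf; n *= 2)
    printf ("%8zu bytes: memset %.3f ns/B, rep stosb %.3f ns/B\n",
            n, bench (libc_memset, buf, n), bench (rep_memset, buf, n));
  return 0;
}
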
I also included a counted-loop optimization, which helps the processor
correctly predict the loop end for sizes up to 512 bytes.

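In rough C the new loop structure for sizes above 256 bytes looks like
the sketch below; store16u/store16a are just thin wrappers I use here for
unaligned/aligned 16-byte stores, not names from the patch.

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

static inline void store16u (void *p, __m128i v) { _mm_storeu_si128 ((__m128i *) p, v); }
static inline void store16a (void *p, __m128i v) { _mm_store_si128 ((__m128i *) p, v); }

/* Rough C rendering of L(loop_start)/L(loop) for n > 256: write the head
   and the last 64 bytes with unaligned stores, then run an aligned 64-byte
   loop for an explicit iteration count, which (as noted above) helps the
   processor predict the loop end.  */
static void
memset_large (char *s, __m128i vc, size_t n)
{
  char *end = s + n;
  char *p = (char *) (((uintptr_t) s + 16) & ~(uintptr_t) 15);
  size_t iters = (size_t) (end - p) >> 6;       /* number of 64-byte blocks */

  store16u (end - 16, vc);
  store16u (end - 32, vc);
  store16u (end - 48, vc);
  store16u (end - 64, vc);
  store16u (s, vc);

  while (iters--)                               /* subq $1, %rsi; jne L(loop) */
    {
      store16a (p, vc);
      store16a (p + 16, vc);
      store16a (p + 32, vc);
      store16a (p + 48, vc);
      p += 64;
    }
}
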
Second, I extended the header to handle sizes up to 256 bytes, which
gives an additional one percent or so.

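The 128-256 byte case in the extended header is just sixteen straight-line
unaligned 16-byte stores, eight from each end; roughly, in C intrinsics
(illustration only, the assembly fully unrolls the loop):

#include <emmintrin.h>
#include <stddef.h>

/* Rough rendering of L(between_128_256_bytes): stores from the front and
   from the back overlap in the middle and cover any n in [128, 256].  */
static void
memset_128_256 (char *s, __m128i vc, size_t n)
{
  char *end = s + n;
  for (int i = 0; i < 128; i += 16)
    {
      _mm_storeu_si128 ((__m128i *) (s + i), vc);
      _mm_storeu_si128 ((__m128i *) (end - 16 - i), vc);
    }
}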

There is an additional microoptimization possible: I could save a few
cycles with the SSSE3 pshufb instruction. Should I add another variant
for that, surrounded by the standard #ifdef USE_SSSE3 pattern?

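For reference, assuming this refers to the byte broadcast in the entry
sequence, the two variants look roughly like this in intrinsics
(illustration only; the second needs -mssse3):

#include <emmintrin.h>   /* SSE2 */
#include <tmmintrin.h>   /* SSSE3: pshufb */

/* Current entry sequence: movd + punpcklbw + punpcklwd + pshufd.  */
static inline __m128i
broadcast_sse2 (int c)
{
  __m128i v = _mm_cvtsi32_si128 (c);
  v = _mm_unpacklo_epi8 (v, v);         /* punpcklbw */
  v = _mm_unpacklo_epi16 (v, v);        /* punpcklwd */
  return _mm_shuffle_epi32 (v, 0);      /* pshufd $0 */
}

/* SSSE3: pshufb with an all-zero mask copies byte 0 into every lane,
   replacing the three shuffles with one.  */
static inline __m128i
broadcast_ssse3 (int c)
{
  return _mm_shuffle_epi8 (_mm_cvtsi32_si128 (c), _mm_setzero_si128 ());
}
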
I haven't done more performance testing yet, as I plan to use this
pattern also for memcpy and strcpy.

So, comments?

	* sysdeps/x86_64/memset.S: Use conditional moves to improve
	performance.
  

Comments

H.J. Lu Aug. 27, 2015, 4:54 p.m. UTC | #1
On Thu, Aug 27, 2015 at 9:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index e496254..bea402d 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -24,7 +24,7 @@
>  ENTRY(__bzero)
>         movq    %rdi, %rax /* Set return value.  */
>         movq    %rsi, %rdx /* Set n.  */
> -       pxor    %xmm8, %xmm8
> +       pxor    %xmm0, %xmm0

The  %xmm8->%xmm0 changes are in master branch now.

>         jmp     L(entry_from_bzero)
>  END(__bzero)
>  weak_alias (__bzero, bzero)
> @@ -33,10 +33,10 @@ weak_alias (__bzero, bzero)
>  ENTRY(__memset_tail)
>         movq    %rcx, %rax /* Set return value.  */
>
> -       movd    %esi, %xmm8
> -       punpcklbw       %xmm8, %xmm8
> -       punpcklwd       %xmm8, %xmm8
> -       pshufd  $0, %xmm8, %xmm8
> +       movd    %esi, %xmm0
> +       punpcklbw %xmm0, %xmm0
> +       punpcklwd %xmm0, %xmm0
> +       pshufd  $0, %xmm0, %xmm0
>
>         jmp     L(entry_from_bzero)
>  END(__memset_tail)
> @@ -50,78 +50,116 @@ END_CHK (__memset_chk)
>  #endif
>
>  ENTRY (memset)
> -       movd    %esi, %xmm8
> +       movd    %esi, %xmm0
>         movq    %rdi, %rax
> -       punpcklbw       %xmm8, %xmm8
> -       punpcklwd       %xmm8, %xmm8
> -       pshufd  $0, %xmm8, %xmm8
> +       punpcklbw %xmm0, %xmm0
> +       punpcklwd %xmm0, %xmm0
> +       pshufd  $0, %xmm0, %xmm0
>  L(entry_from_bzero):
> -       cmpq    $64, %rdx
> +       cmpq    $256, %rdx
>         ja      L(loop_start)
> -       cmpq    $16, %rdx
> -       jbe     L(less_16_bytes)
> -       cmpq    $32, %rdx
> -       movdqu  %xmm8, (%rdi)
> -       movdqu  %xmm8, -16(%rdi,%rdx)
> -       ja      L(between_32_64_bytes)
> -L(return):
> -       rep
> +       cmp     $128, %edx
> +       jae     L(between_128_256_bytes)
> +
> +       cmp     $32, %edx
> +       jb      L(less_32_bytes)
> +
> +       lea     -32(%rdi), %rcx
> +       lea     32(%rdi), %rsi
> +
> +       cmp     $64, %edx
> +       cmovb   %rdi, %rcx
> +       cmovb   %rdi, %rsi
> +
> +       movdqu  %xmm0, (%rdi)
> +       movdqu  %xmm0, 16(%rdi)
> +       movdqu  %xmm0, (%rsi)
> +       movdqu  %xmm0, 16(%rsi)
> +
> +       movdqu  %xmm0, -32(%rcx, %rdx)
> +       movdqu  %xmm0, -16(%rcx, %rdx)
> +       movdqu  %xmm0, -32(%rdi, %rdx)
> +       movdqu  %xmm0, -16(%rdi, %rdx)
>         ret
> +
>         .p2align 4
> -L(between_32_64_bytes):
> -       movdqu  %xmm8, 16(%rdi)
> -       movdqu  %xmm8, -32(%rdi,%rdx)
> +L(between_128_256_bytes):
> +       lea     (%rdi, %rdx), %rsi
> +       movdqu  %xmm0, (%rdi)
> +       movdqu  %xmm0, 16(%rdi)
> +       movdqu  %xmm0, 32(%rdi)
> +       movdqu  %xmm0, 48(%rdi)
> +       movdqu  %xmm0, 64(%rdi)
> +       movdqu  %xmm0, 80(%rdi)
> +       movdqu  %xmm0, 96(%rdi)
> +       movdqu  %xmm0, 112(%rdi)
> +       movdqu  %xmm0, -128(%rsi)
> +       movdqu  %xmm0, -112(%rsi)
> +       movdqu  %xmm0, -96(%rsi)
> +       movdqu  %xmm0, -80(%rsi)
> +       movdqu  %xmm0, -64(%rsi)
> +       movdqu  %xmm0, -48(%rsi)
> +       movdqu  %xmm0, -32(%rsi)
> +       movdqu  %xmm0, -16(%rsi)
>         ret
> +
>         .p2align 4
>  L(loop_start):
> -       leaq    64(%rdi), %rcx
> -       movdqu  %xmm8, (%rdi)
> -       andq    $-64, %rcx
> -       movdqu  %xmm8, -16(%rdi,%rdx)
> -       movdqu  %xmm8, 16(%rdi)
> -       movdqu  %xmm8, -32(%rdi,%rdx)
> -       movdqu  %xmm8, 32(%rdi)
> -       movdqu  %xmm8, -48(%rdi,%rdx)
> -       movdqu  %xmm8, 48(%rdi)
> -       movdqu  %xmm8, -64(%rdi,%rdx)
> -       addq    %rdi, %rdx
> -       andq    $-64, %rdx
> -       cmpq    %rdx, %rcx
> -       je      L(return)
> -       .p2align 4
> +       lea     (%rdi, %rdx), %rsi
> +       leaq    16(%rdi), %rcx
> +       andq    $-16, %rcx
> +       movdqu  %xmm0, -16(%rsi)
> +       movdqu  %xmm0, -32(%rsi)
> +       movdqu  %xmm0, -48(%rsi)
> +       movdqu  %xmm0, -64(%rsi)
> +       movdqu  %xmm0, (%rdi)
> +       subq    %rcx, %rsi
> +       shrq    $6, %rsi
>  L(loop):
> -       movdqa  %xmm8, (%rcx)
> -       movdqa  %xmm8, 16(%rcx)
> -       movdqa  %xmm8, 32(%rcx)
> -       movdqa  %xmm8, 48(%rcx)
> +       movdqa  %xmm0, (%rcx)
> +       movdqa  %xmm0, 16(%rcx)
> +       movdqa  %xmm0, 32(%rcx)
> +       movdqa  %xmm0, 48(%rcx)
>         addq    $64, %rcx
> -       cmpq    %rcx, %rdx
> +       subq    $1, %rsi
>         jne     L(loop)
> -       rep
> -       ret
> -L(less_16_bytes):
> -       movq %xmm8, %rcx
> -       testb   $24, %dl
> -       jne     L(between8_16bytes)
> -       testb   $4, %dl
> -       jne     L(between4_7bytes)
> -       testb   $1, %dl
> -       je      L(odd_byte)
> -       movb    %cl, (%rdi)
> -L(odd_byte):
> -       testb   $2, %dl
> -       je      L(return)
> -       movw    %cx, -2(%rax,%rdx)
> -       ret
> -L(between4_7bytes):
> -       movl    %ecx, (%rdi)
> -       movl    %ecx, -4(%rdi,%rdx)
> -       ret
> -L(between8_16bytes):
> -       movq    %rcx, (%rdi)
> -       movq    %rcx, -8(%rdi,%rdx)
>         ret
>
> +L(less_32_bytes):
> +       movq    %xmm0, %rcx
> +       lea     -64(%rsp), %r10
> +       sub     %rax, %r10
> +       lea     64(%rdi), %r9
> +       lea     63(%rdi, %rdx), %rsi
> +       mov     %rdx, %r8
> +       mov     %rdx, %r11
> +       mov     %rdx, %rdi
> +       and     $16, %r8
> +       and     $24, %rdi
> +       and     $28, %r11
> +
> +       test    $16, %edx
> +       cmove   %rsp, %r9
> +
> +       test    $8, %edx
> +       cmove   %r10, %r8
> +
> +       test    $4, %edx
> +       cmove   %r10, %rdi
> +
> +       test    $2, %edx
> +       cmove   %r10, %r11
> +
> +       test    $1, %edx
> +       cmove   %rsp, %rsi
> +
> +       movdqu  %xmm0, -64(%r9)
> +       movq    %rcx, (%rax, %r8)
> +       movl    %ecx, (%rax, %rdi)
> +       movw    %cx, (%rax, %r11)
> +       movb    %cl, -64(%rsi)
> +
> +       ret
>  END (memset)
>  libc_hidden_builtin_def (memset)
>

They look OK to me.

Thanks.
  

Patch

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index e496254..bea402d 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -24,7 +24,7 @@ 
 ENTRY(__bzero)
 	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	jmp	L(entry_from_bzero)
 END(__bzero)
 weak_alias (__bzero, bzero)
@@ -33,10 +33,10 @@  weak_alias (__bzero, bzero)
 ENTRY(__memset_tail)
 	movq	%rcx, %rax /* Set return value.  */
 
-	movd	%esi, %xmm8
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	movd	%esi, %xmm0
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 
 	jmp	L(entry_from_bzero)
 END(__memset_tail)
@@ -50,78 +50,116 @@  END_CHK (__memset_chk)
 #endif
 
 ENTRY (memset)
-	movd	%esi, %xmm8
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 L(entry_from_bzero):
-	cmpq	$64, %rdx
+	cmpq	$256, %rdx
 	ja	L(loop_start)
-	cmpq	$16, %rdx
-	jbe	L(less_16_bytes)
-	cmpq	$32, %rdx
-	movdqu	%xmm8, (%rdi)
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	ja	L(between_32_64_bytes)
-L(return):
-	rep
+	cmp	$128, %edx
+	jae	L(between_128_256_bytes)
+
+	cmp	$32, %edx
+	jb	L(less_32_bytes)
+
+	lea	-32(%rdi), %rcx
+	lea	32(%rdi), %rsi
+
+	cmp	$64, %edx
+	cmovb	%rdi, %rcx
+	cmovb	%rdi, %rsi
+
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, (%rsi)
+	movdqu	%xmm0, 16(%rsi)
+
+	movdqu	%xmm0, -32(%rcx, %rdx)
+	movdqu	%xmm0, -16(%rcx, %rdx)
+	movdqu	%xmm0, -32(%rdi, %rdx)
+	movdqu	%xmm0, -16(%rdi, %rdx)
 	ret
+
 	.p2align 4
-L(between_32_64_bytes):
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
+L(between_128_256_bytes):
+	lea	(%rdi, %rdx), %rsi
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm0, 48(%rdi)
+	movdqu	%xmm0, 64(%rdi)
+	movdqu	%xmm0, 80(%rdi)
+	movdqu	%xmm0, 96(%rdi)
+	movdqu	%xmm0, 112(%rdi)
+	movdqu	%xmm0, -128(%rsi)
+	movdqu	%xmm0, -112(%rsi)
+	movdqu	%xmm0, -96(%rsi)
+	movdqu	%xmm0, -80(%rsi)
+	movdqu	%xmm0, -64(%rsi)
+	movdqu	%xmm0, -48(%rsi)
+	movdqu	%xmm0, -32(%rsi)
+	movdqu	%xmm0, -16(%rsi)
 	ret
+
 	.p2align 4
 L(loop_start):
-	leaq	64(%rdi), %rcx
-	movdqu	%xmm8, (%rdi)
-	andq	$-64, %rcx
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
-	movdqu	%xmm8, 32(%rdi)
-	movdqu	%xmm8, -48(%rdi,%rdx)
-	movdqu	%xmm8, 48(%rdi)
-	movdqu	%xmm8, -64(%rdi,%rdx)
-	addq	%rdi, %rdx
-	andq	$-64, %rdx
-	cmpq	%rdx, %rcx
-	je	L(return)
-	.p2align 4
+	lea	(%rdi, %rdx), %rsi
+	leaq	16(%rdi), %rcx
+	andq	$-16, %rcx
+	movdqu	%xmm0, -16(%rsi)
+	movdqu	%xmm0, -32(%rsi)
+	movdqu	%xmm0, -48(%rsi)
+	movdqu	%xmm0, -64(%rsi)
+	movdqu	%xmm0, (%rdi)
+	subq	%rcx, %rsi
+	shrq	$6, %rsi
 L(loop):
-	movdqa	%xmm8, (%rcx)
-	movdqa	%xmm8, 16(%rcx)
-	movdqa	%xmm8, 32(%rcx)
-	movdqa	%xmm8, 48(%rcx)
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	movdqa	%xmm0, 32(%rcx)
+	movdqa	%xmm0, 48(%rcx)
 	addq	$64, %rcx
-	cmpq	%rcx, %rdx
+	subq	$1, %rsi
 	jne	L(loop)
-	rep
-	ret
-L(less_16_bytes):
-	movq %xmm8, %rcx
-	testb	$24, %dl
-	jne	L(between8_16bytes)
-	testb	$4, %dl
-	jne	L(between4_7bytes)
-	testb	$1, %dl
-	je	L(odd_byte)
-	movb	%cl, (%rdi)
-L(odd_byte):
-	testb	$2, %dl
-	je	L(return)
-	movw	%cx, -2(%rax,%rdx)
-	ret
-L(between4_7bytes):
-	movl	%ecx, (%rdi)
-	movl	%ecx, -4(%rdi,%rdx)
-	ret
-L(between8_16bytes):
-	movq	%rcx, (%rdi)
-	movq	%rcx, -8(%rdi,%rdx)
 	ret
 
+L(less_32_bytes):
+	movq	%xmm0, %rcx
+	lea	-64(%rsp), %r10
+	sub	%rax, %r10
+	lea	64(%rdi), %r9
+	lea	63(%rdi, %rdx), %rsi
+	mov	%rdx, %r8
+	mov	%rdx, %r11
+	mov	%rdx, %rdi
+	and	$16, %r8
+	and	$24, %rdi
+	and	$28, %r11
+
+	test	$16, %edx
+	cmove	%rsp, %r9
+
+	test	$8, %edx
+	cmove	%r10, %r8
+
+	test	$4, %edx
+	cmove	%r10, %rdi
+
+	test	$2, %edx
+	cmove	%r10, %r11
+
+	test	$1, %edx
+	cmove	%rsp, %rsi
+
+	movdqu	%xmm0, -64(%r9)
+	movq	%rcx, (%rax, %r8)
+	movl	%ecx, (%rax, %rdi)
+	movw	%cx, (%rax, %r11)
+	movb	%cl, -64(%rsi)
+
+	ret
 END (memset)
 libc_hidden_builtin_def (memset)