[v3,2/5] AArch64: Improve A64FX memset

Message ID VE1PR08MB559938483A0630B16E8C64B383E49@VE1PR08MB5599.eurprd08.prod.outlook.com
State Superseded
Series [v3,1/5] AArch64: Improve A64FX memset

Commit Message

Wilco Dijkstra July 22, 2021, 4 p.m. UTC
  Improve performance of large memsets. Simplify alignment code. For zero memset use DC ZVA,
which almost doubles performance. For non-zero memsets use the unroll8 loop which is about 10% faster.
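
For reference, a rough C-level sketch of what the new L2 zero-fill path
does (illustrative only: the helper name and zva_block parameter are made
up here, and __builtin_memset stands in for the SVE head/tail stores in
the real assembly):

  #include <stddef.h>
  #include <stdint.h>

  /* Sketch of the zero-fill L2 path: cover the unaligned head, round dst
     up to the next cacheline, clear whole cachelines with DC ZVA, then
     leave the tail to the ordinary store code.  zva_block is assumed to
     equal the 256-byte A64FX cacheline; count is at least L2_SIZE here.  */
  static void
  zero_fill_l2_sketch (char *dst, size_t count, size_t zva_block)
  {
    /* Head: the real code stores four SVE vectors here.  */
    size_t head = (-(uintptr_t) dst) & (zva_block - 1);
    __builtin_memset (dst, 0, head);
    dst += head;
    count -= head;

    /* Body: DC ZVA zeroes one whole block per iteration.  */
    while (count >= zva_block)
      {
        __asm__ volatile ("dc zva, %0" :: "r" (dst) : "memory");
        dst += zva_block;
        count -= zva_block;
      }

    /* Tail: handled by L(last) in the real implementation.  */
    __builtin_memset (dst, 0, count);
  }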

---
  

Comments

develop--- via Libc-alpha Aug. 2, 2021, 1:29 p.m. UTC | #1
Hi Wilco,

Thank you for the patch.

I confirmed that the performance is improved compared to master, as shown
in the graphs [1][2].

Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
Tested-by: Naohiro Tamura <naohirot@fujitsu.com>

[1] https://drive.google.com/file/d/1RxdIlJa2Wvl8eT5_TRVkvbkyS6bfZObx/view?usp=sharing
[2] https://drive.google.com/file/d/1xCLsa7qweovdQpWtfnNZZwcEi7W7z3Ok/view?usp=sharing

There is one comment and one question below.


> Subject: [PATCH v3 2/5] AArch64: Improve A64FX memset

How about "AArch64: Improve A64FX memset for more than 8MB"?

> 
> Improve performance of large memsets. Simplify alignment code. For zero memset use DC ZVA,
> which almost doubles performance. For non-zero memsets use the unroll8 loop which is about 10% faster.
> 
> ---
> 
> diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> index f7fcc7b323e1553f50a2e005b8ccef344a08127d..608e0e2e2ff5259178e2fdadf1eea8816194d879 100644
> --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> @@ -30,10 +30,8 @@
>  #define L2_SIZE         (8*1024*1024)	// L2 8MB - 1MB
>  #define CACHE_LINE_SIZE	256
>  #define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
> -#define rest		x8
> +#define rest		x2
>  #define vector_length	x9
> -#define vl_remainder	x10	// vector_length remainder
> -#define cl_remainder	x11	// CACHE_LINE_SIZE remainder
> 
>  #if HAVE_AARCH64_SVE_ASM
>  # if IS_IN (libc)
> @@ -41,14 +39,6 @@
> 
>  	.arch armv8.2-a+sve
> 
> -	.macro dc_zva times
> -	dc	zva, tmp1
> -	add	tmp1, tmp1, CACHE_LINE_SIZE
> -	.if \times-1
> -	dc_zva "(\times-1)"
> -	.endif
> -	.endm
> -
>  	.macro st1b_unroll first=0, last=7
>  	st1b	z0.b, p0, [dst, \first, mul vl]
>  	.if \last-\first
> @@ -187,54 +177,29 @@ L(L1_prefetch): // if rest >= L1_SIZE
>  	cbnz	rest, L(unroll32)
>  	ret
> 
> +	// count >= L2_SIZE
>  L(L2):
> -	// align dst address at vector_length byte boundary
> -	sub	tmp1, vector_length, 1
> -	ands	tmp2, dst, tmp1
> -	// if vl_remainder == 0
> -	b.eq	1f
> -	sub	vl_remainder, vector_length, tmp2
> -	// process remainder until the first vector_length boundary
> -	whilelt	p2.b, xzr, vl_remainder
> -	st1b	z0.b, p2, [dst]
> -	add	dst, dst, vl_remainder
> -	sub	rest, rest, vl_remainder
> -	// align dstin address at CACHE_LINE_SIZE byte boundary
> -1:	mov	tmp1, CACHE_LINE_SIZE
> -	ands	tmp2, dst, CACHE_LINE_SIZE - 1
> -	// if cl_remainder == 0
> -	b.eq	L(L2_dc_zva)
> -	sub	cl_remainder, tmp1, tmp2
> -	// process remainder until the first CACHE_LINE_SIZE boundary
> -	mov	tmp1, xzr       // index
> -2:	whilelt	p2.b, tmp1, cl_remainder
> -	st1b	z0.b, p2, [dst, tmp1]
> -	incb	tmp1
> -	cmp	tmp1, cl_remainder
> -	b.lo	2b
> -	add	dst, dst, cl_remainder
> -	sub	rest, rest, cl_remainder
> -
> -L(L2_dc_zva):
> -	// zero fill
> -	mov	tmp1, dst
> -	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
> -	mov	zva_len, ZF_DIST
> -	add	tmp1, zva_len, CACHE_LINE_SIZE * 2
> -	// unroll
> -	.p2align 3
> -1:	st1b_unroll 0, 3
> -	add	tmp2, dst, zva_len
> -	dc	 zva, tmp2
> -	st1b_unroll 4, 7
> -	add	tmp2, tmp2, CACHE_LINE_SIZE
> -	dc	zva, tmp2
> -	add	dst, dst, CACHE_LINE_SIZE * 2
> -	sub	rest, rest, CACHE_LINE_SIZE * 2
> -	cmp	rest, tmp1	// ZF_DIST + CACHE_LINE_SIZE * 2
> -	b.ge	1b
> -	cbnz	rest, L(unroll8)
> -	ret
> +	tst	valw, 255
> +	b.ne	L(unroll8)
> +        // align dst to CACHE_LINE_SIZE byte boundary
> +	and	tmp2, dst, CACHE_LINE_SIZE - 1
> +	sub	tmp2, tmp2, CACHE_LINE_SIZE
> +	st1b	z0.b, p0, [dst, 0, mul vl]
> +	st1b	z0.b, p0, [dst, 1, mul vl]
> +	st1b	z0.b, p0, [dst, 2, mul vl]
> +	st1b	z0.b, p0, [dst, 3, mul vl]
> +	sub	dst, dst, tmp2
> +	add	count, count, tmp2
> +
> +	// clear cachelines using DC ZVA
> +	sub	count, count, CACHE_LINE_SIZE
> +	.p2align 4
> +1:	dc	zva, dst

Here DC ZVA is called if the buffer size is more than 8MB and the fill
data is zero.  In the case of __memset_generic, DC ZVA is called if the
buffer size is more than 256B and the fill data is zero.  The latter was
implemented by you [3].
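
In other words, schematically (pseudo-C using just the thresholds quoted
above; this is not the real dispatch code, and the helper names are made
up):

  #include <stddef.h>

  /* Schematic only: when the zero-fill DC ZVA path is taken.  */
  static int uses_dc_zva_generic (size_t size, int value)
  { return value == 0 && size > 256; }

  static int uses_dc_zva_a64fx (size_t size, int value)   /* this patch */
  { return value == 0 && size >= 8 * 1024 * 1024; }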

With the v3 patch 02, __memset_a64fx (green line) recorded performance
very close to __memset_generic (red line) for zero fill [4].
With the v3 patch 05, __memset_a64fx (green line) recorded almost the
same performance as __memset_generic (red line) for zero fill [5].

The X axis of graphs [4][5] runs from 256B to 64MB; they were created
with the following command.
$ cat bench-memset-zerofill.out | \
> jq -r 'del(.functions.memset.results[] | select(.char2 != 0))' | \
> plot_strings.py -l -p thru -v -

So the DC ZVA implementations for __memset_generic and __memset_a64fx
each seem appropriate.

But comparing the non-zero fill graph [6] with the zero fill graph [4],
why is DC ZVA only effective above 8MB for __memset_a64fx, even though
DC ZVA is effective from much smaller sizes for __memset_generic?

I still cannot understand the DC ZVA behavior.

[3] https://sourceware.org/git/?p=glibc.git;a=commit;f=sysdeps/aarch64/memset.S;h=a8c5a2a9521e105da6e96eaf4029b8e4d595e4f5
[4] https://drive.google.com/file/d/1f0_sTiujCcEZTfxbQ1UZdVwAvbMbP2ii/view?usp=sharing
[5] https://drive.google.com/file/d/1Wyp3GO-9ipcphwqOQOQ9a97EwFz90SPc/view?usp=sharing
[6] https://drive.google.com/file/d/1nZ_lfj6Kz5vFCR35O0q929SceUP-wbih/view?usp=sharing

Thanks.
Naohiro

> +	add	dst, dst, CACHE_LINE_SIZE
> +	subs	count, count, CACHE_LINE_SIZE
> +	b.hi	1b
> +	add	count, count, CACHE_LINE_SIZE
> +	b	L(last)
> 
>  END (MEMSET)
>  libc_hidden_builtin_def (MEMSET)
  
develop--- via Libc-alpha Aug. 3, 2021, 3:08 a.m. UTC | #2
Hi Wilco,

I found a typo of mine in the original code comment.
Could you fix it as follows?
> > -#define ZF_DIST                (CACHE_LINE_SIZE * 21)  // Zerofill distance

> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Monday, August 2, 2021 10:29 PM
> > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > index f7fcc7b323e1553f50a2e005b8ccef344a08127d..608e0e2e2ff5259178e2fdadf1eea8816194d879 100644
> > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > @@ -30,10 +30,8 @@
> >  #define L2_SIZE         (8*1024*1024)	// L2 8MB - 1MB

Wrong: // L2 8MB - 1MB
Right: // L2 8MB

> >  #define CACHE_LINE_SIZE	256
> >  #define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
> > -#define rest		x8
> > +#define rest		x2
> >  #define vector_length	x9
> > -#define vl_remainder	x10	// vector_length remainder
> > -#define cl_remainder	x11	// CACHE_LINE_SIZE remainder
> >

Thanks.
Naohiro
  
develop--- via Libc-alpha Aug. 3, 2021, 5:03 a.m. UTC | #3
Hi Wilco,

Sorry, I forgot to mention one thing about a readability matter.

> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Monday, August 2, 2021 10:29 PM

> > +	// count >= L2_SIZE
> >  L(L2):
> > -	// align dst address at vector_length byte boundary
> > -	sub	tmp1, vector_length, 1
> > -	ands	tmp2, dst, tmp1
> > -	// if vl_remainder == 0
> > -	b.eq	1f
> > -	sub	vl_remainder, vector_length, tmp2
> > -	// process remainder until the first vector_length boundary
> > -	whilelt	p2.b, xzr, vl_remainder
> > -	st1b	z0.b, p2, [dst]
> > -	add	dst, dst, vl_remainder
> > -	sub	rest, rest, vl_remainder
> > -	// align dstin address at CACHE_LINE_SIZE byte boundary
> > -1:	mov	tmp1, CACHE_LINE_SIZE
> > -	ands	tmp2, dst, CACHE_LINE_SIZE - 1
> > -	// if cl_remainder == 0
> > -	b.eq	L(L2_dc_zva)
> > -	sub	cl_remainder, tmp1, tmp2
> > -	// process remainder until the first CACHE_LINE_SIZE boundary
> > -	mov	tmp1, xzr       // index
> > -2:	whilelt	p2.b, tmp1, cl_remainder
> > -	st1b	z0.b, p2, [dst, tmp1]
> > -	incb	tmp1
> > -	cmp	tmp1, cl_remainder
> > -	b.lo	2b
> > -	add	dst, dst, cl_remainder
> > -	sub	rest, rest, cl_remainder
> > -
> > -L(L2_dc_zva):
> > -	// zero fill
> > -	mov	tmp1, dst
> > -	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
> > -	mov	zva_len, ZF_DIST
> > -	add	tmp1, zva_len, CACHE_LINE_SIZE * 2
> > -	// unroll
> > -	.p2align 3
> > -1:	st1b_unroll 0, 3
> > -	add	tmp2, dst, zva_len
> > -	dc	 zva, tmp2
> > -	st1b_unroll 4, 7
> > -	add	tmp2, tmp2, CACHE_LINE_SIZE
> > -	dc	zva, tmp2
> > -	add	dst, dst, CACHE_LINE_SIZE * 2
> > -	sub	rest, rest, CACHE_LINE_SIZE * 2
> > -	cmp	rest, tmp1	// ZF_DIST + CACHE_LINE_SIZE * 2
> > -	b.ge	1b
> > -	cbnz	rest, L(unroll8)
> > -	ret
> > +	tst	valw, 255
> > +	b.ne	L(unroll8)
> > +        // align dst to CACHE_LINE_SIZE byte boundary
> > +	and	tmp2, dst, CACHE_LINE_SIZE - 1
> > +	sub	tmp2, tmp2, CACHE_LINE_SIZE

"tmp2" always becomes a negative value.
I feel it would be more natural and easier to understand if the operands
were reversed like this:

	sub	tmp2, CACHE_LINE_SIZE, tmp2

> > +	st1b	z0.b, p0, [dst, 0, mul vl]
> > +	st1b	z0.b, p0, [dst, 1, mul vl]
> > +	st1b	z0.b, p0, [dst, 2, mul vl]
> > +	st1b	z0.b, p0, [dst, 3, mul vl]
> > +	sub	dst, dst, tmp2

"dst" needs to be incremented.
Here "dst" is in fact incremented by the "sub", because "tmp2" is negative.
So it would read more naturally if "tmp2" were a positive value, like this:

	add	dst, dst, tmp2

> > +	add	count, count, tmp2

"count" needs to be decremented.
Here "count" is in fact decremented by the "add", because "tmp2" is negative.
So it would read more naturally if "tmp2" were a positive value, like this:

	sub count, count, tmp2
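
To make the original code's arithmetic concrete (illustrative values
only): with CACHE_LINE_SIZE = 256 and dst = 0x1040, "and tmp2, dst, 255"
gives tmp2 = 0x40, the "sub" then gives tmp2 = 0x40 - 256 = -192, so
"sub dst, dst, tmp2" yields dst = 0x1100 (cacheline aligned) and
"add count, count, tmp2" reduces count by 192.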

Thanks.
Naohiro
  
Wilco Dijkstra Aug. 9, 2021, 4:16 p.m. UTC | #4
Hi Naohiro,

> > +        // align dst to CACHE_LINE_SIZE byte boundary
> > +   and     tmp2, dst, CACHE_LINE_SIZE - 1
> > +   sub     tmp2, tmp2, CACHE_LINE_SIZE

> "tmp2" always becomes a negative value.
> I feel it would be more natural and easier to understand if the operands
> were reversed like this:
>
>       sub     tmp2, CACHE_LINE_SIZE, tmp2

That's not a valid instruction though (SUB only accepts an immediate as its
second source operand, not the first). I've just removed it in v4 since we
can delay the cacheline adjustment of dst and count to later instructions.

> But comparing nonzero fill graph[6] with zero fill graph[4],
> why DC ZVA is only effective more than 8MB for __memset_a64fx in spite
> that DC ZVA is effective from smaller size for __memset_generic?

Well, it seems that on A64FX DC ZVA is faster only when the data is not in L1.
So it may be feasible to use DC ZVA for smaller sizes.

Cheers,
Wilco
  

Patch

diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index f7fcc7b323e1553f50a2e005b8ccef344a08127d..608e0e2e2ff5259178e2fdadf1eea8816194d879 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -30,10 +30,8 @@ 
 #define L2_SIZE         (8*1024*1024)	// L2 8MB - 1MB
 #define CACHE_LINE_SIZE	256
 #define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
-#define rest		x8
+#define rest		x2
 #define vector_length	x9
-#define vl_remainder	x10	// vector_length remainder
-#define cl_remainder	x11	// CACHE_LINE_SIZE remainder
 
 #if HAVE_AARCH64_SVE_ASM
 # if IS_IN (libc)
@@ -41,14 +39,6 @@ 
 
 	.arch armv8.2-a+sve
 
-	.macro dc_zva times
-	dc	zva, tmp1
-	add	tmp1, tmp1, CACHE_LINE_SIZE
-	.if \times-1
-	dc_zva "(\times-1)"
-	.endif
-	.endm
-
 	.macro st1b_unroll first=0, last=7
 	st1b	z0.b, p0, [dst, \first, mul vl]
 	.if \last-\first
@@ -187,54 +177,29 @@  L(L1_prefetch): // if rest >= L1_SIZE
 	cbnz	rest, L(unroll32)
 	ret
 
+	// count >= L2_SIZE
 L(L2):
-	// align dst address at vector_length byte boundary
-	sub	tmp1, vector_length, 1
-	ands	tmp2, dst, tmp1
-	// if vl_remainder == 0
-	b.eq	1f
-	sub	vl_remainder, vector_length, tmp2
-	// process remainder until the first vector_length boundary
-	whilelt	p2.b, xzr, vl_remainder
-	st1b	z0.b, p2, [dst]
-	add	dst, dst, vl_remainder
-	sub	rest, rest, vl_remainder
-	// align dstin address at CACHE_LINE_SIZE byte boundary
-1:	mov	tmp1, CACHE_LINE_SIZE
-	ands	tmp2, dst, CACHE_LINE_SIZE - 1
-	// if cl_remainder == 0
-	b.eq	L(L2_dc_zva)
-	sub	cl_remainder, tmp1, tmp2
-	// process remainder until the first CACHE_LINE_SIZE boundary
-	mov	tmp1, xzr       // index
-2:	whilelt	p2.b, tmp1, cl_remainder
-	st1b	z0.b, p2, [dst, tmp1]
-	incb	tmp1
-	cmp	tmp1, cl_remainder
-	b.lo	2b
-	add	dst, dst, cl_remainder
-	sub	rest, rest, cl_remainder
-
-L(L2_dc_zva):
-	// zero fill
-	mov	tmp1, dst
-	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
-	mov	zva_len, ZF_DIST
-	add	tmp1, zva_len, CACHE_LINE_SIZE * 2
-	// unroll
-	.p2align 3
-1:	st1b_unroll 0, 3
-	add	tmp2, dst, zva_len
-	dc	 zva, tmp2
-	st1b_unroll 4, 7
-	add	tmp2, tmp2, CACHE_LINE_SIZE
-	dc	zva, tmp2
-	add	dst, dst, CACHE_LINE_SIZE * 2
-	sub	rest, rest, CACHE_LINE_SIZE * 2
-	cmp	rest, tmp1	// ZF_DIST + CACHE_LINE_SIZE * 2
-	b.ge	1b
-	cbnz	rest, L(unroll8)
-	ret
+	tst	valw, 255
+	b.ne	L(unroll8)
+        // align dst to CACHE_LINE_SIZE byte boundary
+	and	tmp2, dst, CACHE_LINE_SIZE - 1
+	sub	tmp2, tmp2, CACHE_LINE_SIZE
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z0.b, p0, [dst, 1, mul vl]
+	st1b	z0.b, p0, [dst, 2, mul vl]
+	st1b	z0.b, p0, [dst, 3, mul vl]
+	sub	dst, dst, tmp2
+	add	count, count, tmp2
+
+	// clear cachelines using DC ZVA
+	sub	count, count, CACHE_LINE_SIZE
+	.p2align 4
+1:	dc	zva, dst
+	add	dst, dst, CACHE_LINE_SIZE
+	subs	count, count, CACHE_LINE_SIZE
+	b.hi	1b
+	add	count, count, CACHE_LINE_SIZE
+	b	L(last)
 
 END (MEMSET)
 libc_hidden_builtin_def (MEMSET)