[v3,5/5] AArch64: Improve A64FX memset

Message ID VE1PR08MB5599743405E22528D5F08D7283E49@VE1PR08MB5599.eurprd08.prod.outlook.com
State Superseded
Headers
Series [v3,1/5] AArch64: Improve A64FX memset |

Commit Message

Wilco Dijkstra July 22, 2021, 4:04 p.m. UTC
  Simplify the code for memsets smaller than L1. Improve the unroll8 and L1_prefetch loops.

---
  

Comments

develop--- via Libc-alpha Aug. 3, 2021, 11:22 a.m. UTC | #1
Hi Wilco,

Thank you for the patch.

I confirmed V3 Part 5 performance is better than the master
except 16KB dip [1].
See the comparison graphs between the master and V3 Part 5 [1][2][3].

And the 16KB dip can be fixed, see the below.

Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
Tested-by: Naohiro Tamura <naohirot@fujitsu.com>

[1] https://drive.google.com/file/d/10ujn5LNOqgI2VpynUc1Adt9U777_Ixxz/view?usp=sharing
[2] https://drive.google.com/file/d/14vCq_ng0tFDjo1BRqaMm9m9o3Kntjr0v/view?usp=sharing
[3] https://drive.google.com/file/d/1GBFk8czzJV5hB9sT93qB7Rw7pHQzEfRt/view?usp=sharing

> -----Original Message-----
> From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Sent: Friday, July 23, 2021 1:05 AM
> To: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Cc: 'GNU C Library' <libc-alpha@sourceware.org>
> Subject: [PATCH v3 5/5] AArch64: Improve A64FX memset

How about like this?
"AArch64: Improve A64FX memset by removing rest variable"

> 
> Simplify the code for memsets smaller than L1. Improve the unroll8 and L1_prefetch loops.
> 
> ---
> diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> index 8665c272431b46dadea53c63ab74829c3aa99312..36628e101db33a9a8ff5234b98dd5a3a5c9ed73c 100644
> --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> @@ -30,7 +30,6 @@
>  #define L2_SIZE         (8*1024*1024)	// L2 8MB - 1MB
>  #define CACHE_LINE_SIZE	256
>  #define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
> -#define rest		x2
>  #define vector_length	x9
> 
>  #if HAVE_AARCH64_SVE_ASM
> @@ -89,29 +88,19 @@ ENTRY (MEMSET)
> 
>  	.p2align 4
>  L(vl_agnostic): // VL Agnostic
> -	mov	rest, count
>  	mov	dst, dstin
> -	add	dstend, dstin, count
> -	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
> -	mov	tmp1, 64
> -	cmp	rest, L2_SIZE
> -	ccmp	vector_length, tmp1, 0, cs
> -	b.eq	L(L2)
> -	// if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch)
> -	cmp	rest, L1_SIZE
> -	ccmp	vector_length, tmp1, 0, cs
> -	b.eq	L(L1_prefetch)
> -
> +	cmp	count, L1_SIZE
> +	b.hi	L(L1_prefetch)
> 
> +	// count >= 8 * vector_length
>  L(unroll8):
> -	lsl	tmp1, vector_length, 3
> -	.p2align 3
> -1:	cmp	rest, tmp1
> -	b.cc	L(last)
> -	st1b_unroll
> +	sub	count, count, tmp1
> +	.p2align 4
> +1:	subs	count, count, tmp1
> +	st1b_unroll 0, 7
>  	add	dst, dst, tmp1
> -	sub	rest, rest, tmp1
> -	b	1b
> +	b.hi	1b
> +	add	count, count, tmp1
> 

Reverting unroll8 logic to V3 Part 4 fixed 16KB dip [4].
See the comparison graphs  between the master and V3 Part 5 fixed [4][5][6].

L(unroll8):
        lsl     tmp1, vector_length, 3
        .p2align 3
1:      cmp     count, tmp1
        b.cc    L(last)
        st1b_unroll
        add     dst, dst, tmp1
        sub     count, count, tmp1
        b       1b

[4] https://drive.google.com/file/d/1NfaEF24ud8JOpCktlzoeQ5VvyJc593lD/view?usp=sharing
[5] https://drive.google.com/file/d/1DfwPenANTwgLm2kqu_w9QugmYNqysOMF/view?usp=sharing
[6] https://drive.google.com/file/d/1OL6_gbdevwJmfEeRbEANJ4pVdqaSRZvV/view?usp=sharing

Thanks.
Naohiro

>  L(last):
>  	cmp	count, vector_length, lsl 1
> @@ -129,18 +118,22 @@ L(last):
>  	st1b	z0.b, p0, [dstend, -1, mul vl]
>  	ret
> 
> -L(L1_prefetch): // if rest >= L1_SIZE
> +	// count >= L1_SIZE
>  	.p2align 3
> +L(L1_prefetch):
> +	cmp	count, L2_SIZE
> +	b.hs	L(L2)
> +	cmp	vector_length, 64
> +	b.ne	L(unroll8)
>  1:	st1b_unroll 0, 3
>  	prfm	pstl1keep, [dst, PF_DIST_L1]
>  	st1b_unroll 4, 7
>  	prfm	pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
>  	add	dst, dst, CACHE_LINE_SIZE * 2
> -	sub	rest, rest, CACHE_LINE_SIZE * 2
> -	cmp	rest, L1_SIZE
> -	b.ge	1b
> -	cbnz	rest, L(unroll8)
> -	ret
> +	sub	count, count, CACHE_LINE_SIZE * 2
> +	cmp	count, PF_DIST_L1
> +	b.hs	1b
> +	b	L(unroll8)
> 
>  	// count >= L2_SIZE
>  L(L2):
  
Wilco Dijkstra Aug. 9, 2021, 2:52 p.m. UTC | #2
Hi Naohiro,

> Reverting unroll8 logic to V3 Part 4 fixed 16KB dip [4].
> See the comparison graphs  between the master and V3 Part 5 fixed [4][5][6].

I don't see an improvement from the old unroll8 loop - there is about 2%
benefit on 16KB, but all other sizes become slower. At size 1K it is 50%
slower... I tried some other variations and moving the SUBS to the end
of the loop appears slightly better overall, so I've done that for the v4 patch.

Cheers,
Wilco
  
develop--- via Libc-alpha Aug. 17, 2021, 6:40 a.m. UTC | #3
Hi Wilco,

Thank you for the comment and V4 Patch.

> From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Sent: Monday, 9 August 2021 23:52
> 
> > Reverting unroll8 logic to V3 Part 4 fixed 16KB dip [4].
> > See the comparison graphs  between the master and V3 Part 5 fixed [4][5][6].
> 
> I don't see an improvement from the old unroll8 loop - there is about 2%
> benefit on 16KB, but all other sizes become slower. At size 1K it is 50%
> slower... I tried some other variations and moving the SUBS to the end
> of the loop appears slightly better overall, so I've done that for the v4 patch.

If our results are different each other, that must be due to test
environment difference.

In my test environment, Patch V4 has a little bit lager 16KB dip than
Patch V3 [1].

So let me ask you the same question I asked in the mail [2] before.

How did you measure the performance? Did you use some kind of simulator?

Or do you have real A64FX environment?

[1] https://drive.google.com/file/d/1NajofMts3aPHwlvuXYXt48kyGPXPcJfk/view?usp=sharing
[2] https://sourceware.org/pipermail/libc-alpha/2021-July/128313.html

Thanks.
Naohiro
  
Wilco Dijkstra Aug. 19, 2021, 1:06 p.m. UTC | #4
Hi Naohiro,

> In my test environment, Patch V4 has a little bit lager 16KB dip than
> Patch V3 [1].

That's odd since nothing significant changed in the loop. The performance 
variations don't make much sense to me, it seems A64FX is very sensitive
to something, but it's not clear what exactly it is.

Note that I didn't see anything similar for 16KB in bench-memset-large or
bench-memset-walk. However if you'd like to get to the bottom of this, one
way would be to write a small benchmark that reproduces the dip (which is
likely non-trivial) and use perf stat to get performance counter results for the
different loops.

> So let me ask you the same question I asked in the mail [2] before.
> 
> How did you measure the performance? Did you use some kind of simulator?
> 
> Or do you have real A64FX environment?

Yes, I just ran it on an A64FX machine.

Cheers,
Wilco
  
develop--- via Libc-alpha Aug. 20, 2021, 5:41 a.m. UTC | #5
Hi Wilco,

> > In my test environment, Patch V4 has a little bit lager 16KB dip than
> > Patch V3 [1].
> 
> That's odd since nothing significant changed in the loop. The performance
> variations don't make much sense to me, it seems A64FX is very sensitive
> to something, but it's not clear what exactly it is.
> 

I feel it's odd too, but it's reality. The 16KB dip is reproducible in all of three
A64FX environments I have, FX1000 master node, FX1000 compute node, and FX700.

> Note that I didn't see anything similar for 16KB in bench-memset-large or
> bench-memset-walk. However if you'd like to get to the bottom of this, one
> way would be to write a small benchmark that reproduces the dip (which is
> likely non-trivial) and use perf stat to get performance counter results for the
> different loops.

I didn't see anything similar for 16KB in bench-memset-large or bench-memset-walk too. 

As I presented V3 patch fixed, reverting unroll8 code solved the 16KB dip.
The following two graphs show for V4 patch fixed that unroll8 code is reverted.
The first graph [1] shows comparison the master with V4 fixed.
The second graph [2] shows comparison V4 with V4 fixed.

[1] https://drive.google.com/file/d/19og4ZhU9itzFAVXX8TIzlpgiiukiXQbp/view?usp=sharing
[2] https://drive.google.com/file/d/1wQgPU6GyRQ_Z8ibsGja-NfdKhN5bz7I9/view?usp=sharing 

In my environment, I don't have any performance degradation by reverting unroll8,
but 16KB performance improvement as shown in the graphs.

In your environment, do you have any performance degradation by reverting unroll8?
If there is no disadvantage by reverting unroll8, why don't we revert it?

> > So let me ask you the same question I asked in the mail [2] before.
> >
> > How did you measure the performance? Did you use some kind of simulator?
> >
> > Or do you have real A64FX environment?
> 
> Yes, I just ran it on an A64FX machine.

Is it HPE Apollo 80 System?
Or does ARM Company have an account to Fujitsu FX1000 or FX700?

Thanks.
Naohiro
  
Wilco Dijkstra Aug. 23, 2021, 4:50 p.m. UTC | #6
Hi Naohiro,

> In my environment, I don't have any performance degradation by reverting unroll8,
> but 16KB performance improvement as shown in the graphs.

I still see a major regression at 1KB in the graph (it is larger relatively than the gain at 16KB),
plus many smaller regressions between 2KB-8KB.

> In your environment, do you have any performance degradation by reverting unroll8?
> If there is no disadvantage by reverting unroll8, why don't we revert it?

For me bench-memset shows a 50% regression with the unroll8 loop reverted plus
many smaller regressions. So I don't think reverting is a good idea.

I tried "perf stat" and oddly enough this loop causes a lot of branch mispredictions.
However if you add a branch at the top of the loop that is never taken (eg. blt and
ensuring the sub above it sets the flags), it becomes faster than the best results so far.
If you can reproduce that, it is probably the best workaround.

> Is it HPE Apollo 80 System?
> Or does ARM Company have an account to Fujitsu FX1000 or FX700?

It has 48 cores, that's all I know...

Cheers,
Wilco
  
develop--- via Libc-alpha Aug. 24, 2021, 7:56 a.m. UTC | #7
Hi Wilco,

> > In my environment, I don't have any performance degradation by reverting unroll8,
> > but 16KB performance improvement as shown in the graphs.
> 
> I still see a major regression at 1KB in the graph (it is larger relatively than the gain at 16KB),
> plus many smaller regressions between 2KB-8KB.

Are you talking about the regression between V4 and V4 fixed?
If so, that is also observed in my environment as shown in the graph [2].
But V4 fixed is not degraded than the master as shown in the graph [1].

I think we are getting almost same result each other, but not exactly same, right?

> > The first graph [1] shows comparison the master with V4 fixed.
> > The second graph [2] shows comparison V4 with V4 fixed.
> > 
> > [1] https://drive.google.com/file/d/19og4ZhU9itzFAVXX8TIzlpgiiukiXQbp/view?usp=sharing
> > [2] https://drive.google.com/file/d/1wQgPU6GyRQ_Z8ibsGja-NfdKhN5bz7I9/view?usp=sharing 
 
> > In your environment, do you have any performance degradation by reverting unroll8?
> > If there is no disadvantage by reverting unroll8, why don't we revert it?
> 
> For me bench-memset shows a 50% regression with the unroll8 loop reverted plus
> many smaller regressions. So I don't think reverting is a good idea.

If the 50% regression in your environment is at 1KB, the regression at 1KB happens
in my environment too as shown in the graph [4], but the rate seems less than 50%.

Both your result and my result are true and real.
I don't think it's rational to make decision by looking at only one environment result.

As I explained at the bottom of this mail, V4 code is tuned to Applo 80 and FX700.
So we need to take FX1000 into account too.

> I tried "perf stat" and oddly enough this loop causes a lot of branch mispredictions.
> However if you add a branch at the top of the loop that is never taken (eg. blt and
> ensuring the sub above it sets the flags), it becomes faster than the best results so far.
> If you can reproduce that, it is probably the best workaround.

Does "it becomes faster than the best results so far" mean faster than the master?
I think we should put the baseline or bottom line to the master performance.
If the workaround is not faster than or equal to the master at 16KB which has the peak
performance, reverting unroll8 is preferable. 

I'm not sure if I understood what the workaround code looks like, is it like this?

L(unroll8):
        sub     count, count, tmp1
        .p2align 4
1:      subs    tmp2, xzr, xzr
        b.lt    1b
        st1b_unroll 0, 7
        add     dst, dst, tmp1
        subs    count, count, tmp1
        b.hi    1b
        add     count, count, tmp1

> > Is it HPE Apollo 80 System?
> > Or does ARM Company have an account to Fujitsu FX1000 or FX700?
> 
> It has 48 cores, that's all I know...

I think your environment must be Applo 80 or FX700 which has 48 cores and 4 NUMA nodes.
FX1000 master node has 52 cores and FX1000 compute node has 50 cores.
OS sees FX1000 as if it has 8 NUMA nodes.

Thanks.
Naohiro
  
develop--- via Libc-alpha Aug. 24, 2021, 8:07 a.m. UTC | #8
Fixed a typo inline

> -----Original Message-----
> From: Libc-alpha <libc-alpha-bounces+naohirot=fujitsu.com@sourceware.org> On Behalf Of naohirot--- via Libc-alpha
> Sent: Tuesday, August 24, 2021 4:56 PM
> To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Cc: 'GNU C Library' <libc-alpha@sourceware.org>
> Subject: RE: [PATCH v3 5/5] AArch64: Improve A64FX memset
> 
> Hi Wilco,
> 
> > > In my environment, I don't have any performance degradation by reverting unroll8,
> > > but 16KB performance improvement as shown in the graphs.
> >
> > I still see a major regression at 1KB in the graph (it is larger relatively than the gain at 16KB),
> > plus many smaller regressions between 2KB-8KB.
> 
> Are you talking about the regression between V4 and V4 fixed?
> If so, that is also observed in my environment as shown in the graph [2].
> But V4 fixed is not degraded than the master as shown in the graph [1].
> 
> I think we are getting almost same result each other, but not exactly same, right?
> 
> > > The first graph [1] shows comparison the master with V4 fixed.
> > > The second graph [2] shows comparison V4 with V4 fixed.
> > >
> > > [1] https://drive.google.com/file/d/19og4ZhU9itzFAVXX8TIzlpgiiukiXQbp/view?usp=sharing
> > > [2] https://drive.google.com/file/d/1wQgPU6GyRQ_Z8ibsGja-NfdKhN5bz7I9/view?usp=sharing
> 
> > > In your environment, do you have any performance degradation by reverting unroll8?
> > > If there is no disadvantage by reverting unroll8, why don't we revert it?
> >
> > For me bench-memset shows a 50% regression with the unroll8 loop reverted plus
> > many smaller regressions. So I don't think reverting is a good idea.
> 
> If the 50% regression in your environment is at 1KB, the regression at 1KB happens
> in my environment too as shown in the graph [4], but the rate seems less than 50%.
> 
"the graph [4]" should be "the graph [2]".

Thanks.
Naohiro

> Both your result and my result are true and real.
> I don't think it's rational to make decision by looking at only one environment result.
> 
> As I explained at the bottom of this mail, V4 code is tuned to Applo 80 and FX700.
> So we need to take FX1000 into account too.
> 
> > I tried "perf stat" and oddly enough this loop causes a lot of branch mispredictions.
> > However if you add a branch at the top of the loop that is never taken (eg. blt and
> > ensuring the sub above it sets the flags), it becomes faster than the best results so far.
> > If you can reproduce that, it is probably the best workaround.
> 
> Does "it becomes faster than the best results so far" mean faster than the master?
> I think we should put the baseline or bottom line to the master performance.
> If the workaround is not faster than or equal to the master at 16KB which has the peak
> performance, reverting unroll8 is preferable.
> 
> I'm not sure if I understood what the workaround code looks like, is it like this?
> 
> L(unroll8):
>         sub     count, count, tmp1
>         .p2align 4
> 1:      subs    tmp2, xzr, xzr
>         b.lt    1b
>         st1b_unroll 0, 7
>         add     dst, dst, tmp1
>         subs    count, count, tmp1
>         b.hi    1b
>         add     count, count, tmp1
> 
> > > Is it HPE Apollo 80 System?
> > > Or does ARM Company have an account to Fujitsu FX1000 or FX700?
> >
> > It has 48 cores, that's all I know...
> 
> I think your environment must be Applo 80 or FX700 which has 48 cores and 4 NUMA nodes.
> FX1000 master node has 52 cores and FX1000 compute node has 50 cores.
> OS sees FX1000 as if it has 8 NUMA nodes.
> 
> Thanks.
> Naohiro
  
Wilco Dijkstra Aug. 24, 2021, 3:46 p.m. UTC | #9
Hi Naohiro,

> Are you talking about the regression between V4 and V4 fixed?
> If so, that is also observed in my environment as shown in the graph [2].

I was talking about your graph [2] - my results are below.

> But V4 fixed is not degraded than the master as shown in the graph [1].

That may be true but it has lost a lot of the performance gains of V4 just to improve
the 16KB datapoint in one benchmark. I don't believe that is a good tradeoff.

> I think we are getting almost same result each other, but not exactly same, right?
>
> If the 50% regression in your environment is at 1KB, the regression at 1KB happens
> in my environment too as shown in the graph [4], but the rate seems less than 50%.

Yes the differences are at similar sizes but with different magnitude.

> Both your result and my result are true and real.
> I don't think it's rational to make decision by looking at only one environment result.

It's odd the behaviour with the same CPU isn't identical. If there is a way to make them
behave more similarly, I would love to hear it! In any case it would be good to know
how the blt workaround works on your system.

> Does "it becomes faster than the best results so far" mean faster than the master?

With best result I mean fastest of V4 and V4 with unroll8.

These are the results I get for bench-memset compared to V4 (higher = faster):

       v4+blt  v4+unroll8
0-512  0.01%   0.00%
1K-4K -0.15%  -3.07%
4K-8K  0.11%  -0.04%
16K    3.56%   1.98%
32K    0.74%  -0.71%
64K    1.91%   0.53%
128K   0.23%   0.10%

So the blt workaround improves performance of larger sizes far more than unroll8,
and most importantly, it doesn't regress smaller sizes like unroll8.

> I think we should put the baseline or bottom line to the master performance.
> If the workaround is not faster than or equal to the master at 16KB which has the peak
> performance, reverting unroll8 is preferable. 

A new implementation does not need to beat a previous version on every single size.
It would be impossibly hard to achieve that - an endless game of whack-a-mole...
So I always look for better performance overall and for commonly used size ranges
(see above table, V4+blt is 0.6% faster overall than V4+unroll8).

We should avoid major regressions of course, so the question is whether we can tweak V4
a little so that it does better around 16KB without losing any of its performance gains.
​My results show that is possible with the blt workaround, but not with the unroll8 loop.

> I'm not sure if I understood what the workaround code looks like, is it like this?

It just injects a single blt at the top of the loop and changes the sub before the
loop to subs, so you get something like this:

        subs    count, count, tmp1
        .p2align 4
1:      b.lt    last

I can propose a patch for this workaround if it isn't clear.

> I think your environment must be Applo 80 or FX700 which has 48 cores and 4 NUMA nodes.
> FX1000 master node has 52 cores and FX1000 compute node has 50 cores.
> OS sees FX1000 as if it has 8 NUMA nodes.

I do see 4 NUMA nodes indeed, but performance isn't affected at all by which node you select
(at least on bench-memset since it runs from L1/L2).

Cheers,
Wilco
  
develop--- via Libc-alpha Aug. 26, 2021, 1:44 a.m. UTC | #10
Hi Wilco,

> It's odd the behaviour with the same CPU isn't identical. If there is a way to make them
> behave more similarly, I would love to hear it! In any case it would be good to know
> how the blt workaround works on your system.

You can see the difference between FX1000 and FX700 (==Apollo 80) [1].
The number of cores and clock are different at least.

And the blt workaround worked for only FX700 but not for FX1000 as explained below.

[1] https://www.fujitsu.com/global/products/computing/servers/supercomputer/specifications/

> > I'm not sure if I understood what the workaround code looks like, is it like this?
> 
> It just injects a single blt at the top of the loop and changes the sub before the
> loop to subs, so you get something like this:
> 
>         subs    count, count, tmp1
>         .p2align 4
> 1:      b.lt    last
> 
> I can propose a patch for this workaround if it isn't clear.

If you agree to the cmp and branch workaround (2 instructions at the beginning of the loop)
below, I'll submit a patch.

1) Result of the blt workaround (1 instruction at the beginning of the loop)

I tried two patterns,

        subs    count, count, tmp1
        .p2align 4
1:      b.lt    L(last)

and

        sub     count, count, tmp1
        .p2align 4
1:      cbnz    xzr, L(last)

Both patterns worked for only FX700, but not FX1000.

FX700 master vs v4fix 1 instruction [2]
FX700 v4 vs v4fix 1 instruction [3]
FX1000 master vs v4fix 1 instruction [4]
FX1000 v4 vs v4fix 1 instruction [5]

[2] https://drive.google.com/file/d/1IBsPYg2ia2t1YyMmaYVb7tFG89njO2aq/view?usp=sharing
[3] https://drive.google.com/file/d/1q44gqOWZvFhzKAe2di5y8EQrRwWkgxoU/view?usp=sharing
[4] https://drive.google.com/file/d/1P10oD0-WO8J5t7QiP7QwgqOqlAZ2I5hn/view?usp=sharing
[5] https://drive.google.com/file/d/1wKv-bPx20LgJyWl761gXiKLsPwhukEzx/view?usp=sharing

2) Result of the cmp and branch workaround (2 instructions at the beginning of the loop)

I tried two patterns,

        sub     count, count, tmp1
        .p2align 4
1:      subs    tmp2, xzr, xzr
        b.lt    1b

and

        sub     count, count, tmp1
        .p2align 4
1:      cmp     xzr, xzr
        b.ne    1b

Both patterns worked for FX700 and FX1000.

FX700 master vs v4fix 2 instructions [6]
FX700 v4 vs v4fix 2 instructions [7]
FX1000 master vs v4fix 2 instructions [8]
FX1000 v4 vs v4fix 2 instructions [9]

[6] https://drive.google.com/file/d/1B-CsRGT1rJFQCMHja78DEflQ-JHxSkGf/view?usp=sharing
[7] https://drive.google.com/file/d/1KCriikc1jIKEKLFoaTV0jYTqhtvmbblh/view?usp=sharing
[8] https://drive.google.com/file/d/1sunelmZ30jpd_aeWKXu65XNkS9X_akWb/view?usp=sharing
[9] https://drive.google.com/file/d/1JaJG0I79VMSTGy2PqaZf1SILujE69Gi2/view?usp=sharing

Thanks.
Naohiro
  
Wilco Dijkstra Aug. 26, 2021, 2:13 p.m. UTC | #11
Hi Naohiro,

> You can see the difference between FX1000 and FX700 (==Apollo 80) [1].
> The number of cores and clock are different at least.
>
> And the blt workaround worked for only FX700 but not for FX1000 as explained below.
> 
> [1] https://www.fujitsu.com/global/products/computing/servers/supercomputer/specifications/

So it looks like they are different silicon and likely slightly different microarchitectures
which would explain the different behaviour.

> If you agree to the cmp and branch workaround (2 instructions at the beginning of the loop)
> below, I'll submit a patch.

Yes, the 2 instruction workaround is clearly the best solution so far. It fixes the dips
around 16KB but doesn't regress anything else. The results v4 vs v4fix [9] show there
are even some uplifts in the 1-8KB range.

[9] https://drive.google.com/file/d/1JaJG0I79VMSTGy2PqaZf1SILujE69Gi2/view?usp=sharing

> 2) Result of the cmp and branch workaround (2 instructions at the beginning of the loop)

It's interesting this works on both systems, however it's still a mystery why...
It would be a good idea to ask your CPU team about this.

Cheers,
Wilco
  
develop--- via Libc-alpha Aug. 27, 2021, 5:05 a.m. UTC | #12
Hi Wilco,

> > If you agree to the cmp and branch workaround (2 instructions at the beginning of the loop)
> > below, I'll submit a patch.
> 
> Yes, the 2 instruction workaround is clearly the best solution so far. It fixes the dips
> around 16KB but doesn't regress anything else. The results v4 vs v4fix [9] show there
> are even some uplifts in the 1-8KB range.

Thank you for the review. I submitted a patch [1], please find it.

[1] https://sourceware.org/pipermail/libc-alpha/2021-August/130569.html

> > 2) Result of the cmp and branch workaround (2 instructions at the beginning of the loop)
> 
> It's interesting this works on both systems, however it's still a mystery why...
> It would be a good idea to ask your CPU team about this.

OK. In the meanwhile you can find the microarchitecture manual [2] if you're interested in.

[2] https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_Microarchitecture_Manual_en_1.5.pdf

Thanks.
Naohiro
  

Patch

diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index 8665c272431b46dadea53c63ab74829c3aa99312..36628e101db33a9a8ff5234b98dd5a3a5c9ed73c 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -30,7 +30,6 @@ 
 #define L2_SIZE         (8*1024*1024)	// L2 8MB - 1MB
 #define CACHE_LINE_SIZE	256
 #define PF_DIST_L1	(CACHE_LINE_SIZE * 16)	// Prefetch distance L1
-#define rest		x2
 #define vector_length	x9
 
 #if HAVE_AARCH64_SVE_ASM
@@ -89,29 +88,19 @@  ENTRY (MEMSET)
 
 	.p2align 4
 L(vl_agnostic): // VL Agnostic
-	mov	rest, count
 	mov	dst, dstin
-	add	dstend, dstin, count
-	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
-	mov	tmp1, 64
-	cmp	rest, L2_SIZE
-	ccmp	vector_length, tmp1, 0, cs
-	b.eq	L(L2)
-	// if rest >= L1_SIZE && vector_length == 64 then L(L1_prefetch)
-	cmp	rest, L1_SIZE
-	ccmp	vector_length, tmp1, 0, cs
-	b.eq	L(L1_prefetch)
-
+	cmp	count, L1_SIZE
+	b.hi	L(L1_prefetch)
 
+	// count >= 8 * vector_length
 L(unroll8):
-	lsl	tmp1, vector_length, 3
-	.p2align 3
-1:	cmp	rest, tmp1
-	b.cc	L(last)
-	st1b_unroll
+	sub	count, count, tmp1
+	.p2align 4
+1:	subs	count, count, tmp1
+	st1b_unroll 0, 7
 	add	dst, dst, tmp1
-	sub	rest, rest, tmp1
-	b	1b
+	b.hi	1b
+	add	count, count, tmp1
 
 L(last):
 	cmp	count, vector_length, lsl 1
@@ -129,18 +118,22 @@  L(last):
 	st1b	z0.b, p0, [dstend, -1, mul vl]
 	ret
 
-L(L1_prefetch): // if rest >= L1_SIZE
+	// count >= L1_SIZE
 	.p2align 3
+L(L1_prefetch):
+	cmp	count, L2_SIZE
+	b.hs	L(L2)
+	cmp	vector_length, 64
+	b.ne	L(unroll8)
 1:	st1b_unroll 0, 3
 	prfm	pstl1keep, [dst, PF_DIST_L1]
 	st1b_unroll 4, 7
 	prfm	pstl1keep, [dst, PF_DIST_L1 + CACHE_LINE_SIZE]
 	add	dst, dst, CACHE_LINE_SIZE * 2
-	sub	rest, rest, CACHE_LINE_SIZE * 2
-	cmp	rest, L1_SIZE
-	b.ge	1b
-	cbnz	rest, L(unroll8)
-	ret
+	sub	count, count, CACHE_LINE_SIZE * 2
+	cmp	count, PF_DIST_L1
+	b.hs	1b
+	b	L(unroll8)
 
 	// count >= L2_SIZE
 L(L2):