[v4] aarch64: thunderx2 memcpy optimizations for ext-based code path

Message ID 20190327142209.GA21425@bell-sw.com
State Superseded
Headers

Commit Message

Anton Youdkevitch March 27, 2019, 2:22 p.m. UTC
  Wilco,

Here is the updated patch.

The first three instructions of each chunk are forced
to be unaligned to make the loop header aligned on 16
bytes.


488:   d61f00c0        br      x6
48c:   d503201f        nop
490:   d503201f        nop

494:   6e037840        ext     v0.16b, v2.16b, v3.16b, #15
498:   6e047861        ext     v1.16b, v3.16b, v4.16b, #15
49c:   6e057887        ext     v7.16b, v4.16b, v5.16b, #15

4a0:   ac810460        stp     q0, q1, [x3], #32
4a4:   f9814021        prfm    pldl1strm, [x1, #640]
4a8:   acc10c22        ldp     q2, q3, [x1], #32
4ac:   6e0678b0        ext     v16.16b, v5.16b, v6.16b, #15
4b0:   ac814067        stp     q7, q16, [x3], #32
4b4:   6e0278c0        ext     v0.16b, v6.16b, v2.16b, #15
4b8:   6e037841        ext     v1.16b, v2.16b, v3.16b, #15
4bc:   acc11825        ldp     q5, q6, [x1], #32
4c0:   6e057867        ext     v7.16b, v3.16b, v5.16b, #15
4c4:   f1010042        subs    x2, x2, #0x40
4c8:   54fffeca        b.ge    4a0 <__memcpy_thunderx2+0x280>
4cc:   6e0678b0        ext     v16.16b, v5.16b, v6.16b, #15
4d0:   140000e1        b       854 <__memcpy_thunderx2+0x634>

4d4:   6e037040        ext     v0.16b, v2.16b, v3.16b, #14
4d8:   6e047061        ext     v1.16b, v3.16b, v4.16b, #14
4dc:   6e057087        ext     v7.16b, v4.16b, v5.16b, #14

4e0:   ac810460        stp     q0, q1, [x3], #32
4e4:   f9814021        prfm    pldl1strm, [x1, #640]
4e8:   acc10c22        ldp     q2, q3, [x1], #32
4ec:   6e0670b0        ext     v16.16b, v5.16b, v6.16b, #14
4f0:   ac814067        stp     q7, q16, [x3], #32
4f4:   6e0270c0        ext     v0.16b, v6.16b, v2.16b, #14
4f8:   6e037041        ext     v1.16b, v2.16b, v3.16b, #14
4fc:   acc11825        ldp     q5, q6, [x1], #32
500:   6e057067        ext     v7.16b, v3.16b, v5.16b, #14
504:   f1010042        subs    x2, x2, #0x40
508:   54fffeca        b.ge    4e0 <__memcpy_thunderx2+0x2c0> 
50c:   6e0670b0        ext     v16.16b, v5.16b, v6.16b, #14
510:   140000d1        b       854 <__memcpy_thunderx2+0x634>

Looks OK?


On Tue, Mar 26, 2019 at 08:40:34PM +0000, Wilco Dijkstra wrote:
> Hi Anton,
> 
> > I appreciate you comments very much. Here is the patch
> > considering the points you made.
> >
> > 1. Always taken conditional branch at the beginning is
> > removed.
> >
> > 2. Epilogue code is placed after the end of the loop to
> > reduce the number of branches.
> >
> > 3. The redundant "mov" instructions inside the loop are
> > gone due to the changed order of the registers in the ext
> > instructions inside the loop.
> >
> > 4. Invariant code in the loop epilogue is no more
> > repeated for each ext chunk.
> 
> That looks much better indeed! The alignment can still be improved
> though:
> 
>    819d0:       6e037840        ext     v0.16b, v2.16b, v3.16b, #15
>    819d4:       6e047861        ext     v1.16b, v3.16b, v4.16b, #15
>    819d8:       6e057887        ext     v7.16b, v4.16b, v5.16b, #15
>    819dc:       ac810460        stp     q0, q1, [x3], #32
>    819e0:       f9814021        prfm    pldl1strm, [x1, #640]
>    819e4:       acc10c22        ldp     q2, q3, [x1], #32
>    819e8:       6e0678b0        ext     v16.16b, v5.16b, v6.16b, #15
>    819ec:       ac814067        stp     q7, q16, [x3], #32
>    819f0:       6e0278c0        ext     v0.16b, v6.16b, v2.16b, #15
>    819f4:       6e037841        ext     v1.16b, v2.16b, v3.16b, #15
>    819f8:       acc11825        ldp     q5, q6, [x1], #32
>    819fc:       6e057867        ext     v7.16b, v3.16b, v5.16b, #15
>    81a00:       f1010042        subs    x2, x2, #0x40
>    81a04:       54fffeca        b.ge    819dc <__GI___memcpy_thunderx2+0x27c>
> 
> So rather than aligning the first instruction as currently done:
> 
> #define EXT_CHUNK(shft) \
> .p2align 4 ;\
> 
> Align the loop instead. If you also add 2 nops after the bx instruction then
> everything should work out perfectly.
> 
> Cheers,
> Wilco
>
  

Patch

diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index b2215c1..45e9a29 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -382,7 +382,8 @@  L(bytes_0_to_3):
 	strb    A_lw, [dstin]
 	strb    B_lw, [dstin, tmp1]
 	strb    A_hw, [dstend, -1]
-L(end): ret
+L(end):
+	ret
 
 	.p2align 4
 
@@ -544,43 +545,35 @@  L(dst_unaligned):
 	str     C_q, [dst], #16
 	ldp     F_q, G_q, [src], #32
 	bic	dst, dst, 15
+	subs    count, count, 32
 	adrp	tmp2, L(ext_table)
 	add	tmp2, tmp2, :lo12:L(ext_table)
 	add	tmp2, tmp2, tmp1, LSL #2
 	ldr	tmp3w, [tmp2]
 	add	tmp2, tmp2, tmp3w, SXTW
 	br	tmp2
-
-#define EXT_CHUNK(shft) \
 .p2align 4 ;\
+	nop
+#define EXT_CHUNK(shft) \
 L(ext_size_ ## shft):;\
 	ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ext     B_v.16b, D_v.16b, E_v.16b, 16-shft;\
-	subs    count, count, 32;\
-	b.ge    2f;\
-1:;\
-	stp     A_q, B_q, [dst], #32;\
 	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	stp     H_q, I_q, [dst], #16;\
-	add     dst, dst, tmp1;\
-	str     G_q, [dst], #16;\
-	b       L(copy_long_check32);\
-2:;\
+1:;\
 	stp     A_q, B_q, [dst], #32;\
 	prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
-	ldp     D_q, J_q, [src], #32;\
-	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+	ldp     C_q, D_q, [src], #32;\
 	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	mov     C_v.16b, G_v.16b;\
 	stp     H_q, I_q, [dst], #32;\
+	ext     A_v.16b, G_v.16b, C_v.16b, 16-shft;\
+	ext     B_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ldp     F_q, G_q, [src], #32;\
-	ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
-	ext     B_v.16b, D_v.16b, J_v.16b, 16-shft;\
-	mov     E_v.16b, J_v.16b;\
+	ext     H_v.16b, D_v.16b, F_v.16b, 16-shft;\
 	subs    count, count, 64;\
-	b.ge    2b;\
-	b	1b;\
+	b.ge    1b;\
+2:;\
+	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+	b	L(ext_tail);
 
 EXT_CHUNK(1)
 EXT_CHUNK(2)
@@ -598,6 +591,14 @@  EXT_CHUNK(13)
 EXT_CHUNK(14)
 EXT_CHUNK(15)
 
+L(ext_tail):
+	stp     A_q, B_q, [dst], #32
+	stp     H_q, I_q, [dst], #16
+	add     dst, dst, tmp1
+	str     G_q, [dst], #16
+	b       L(copy_long_check32)
+
+
 END (MEMCPY)
 	.section	.rodata
 	.p2align	4