From patchwork Wed Mar 27 14:22:12 2019
X-Patchwork-Submitter: Anton Youdkevitch
X-Patchwork-Id: 32008
Date: Wed, 27 Mar 2019 17:22:12 +0300
From: Anton Youdkevitch
To: Wilco Dijkstra
Cc: libc-alpha@sourceware.org
Subject: Re: [PATCH v4] aarch64: thunderx2 memcpy optimizations for ext-based code path
Message-ID: <20190327142209.GA21425@bell-sw.com>
References: <5C994D96.6000303@bell-sw.com>
User-Agent: Mutt/1.5.24 (2015-08-30)

Wilco,

Here is the updated patch. The first three instructions of each chunk
are intentionally left unaligned so that the loop header lands on a
16-byte boundary.
 488:	d61f00c0 	br	x6
 48c:	d503201f 	nop
 490:	d503201f 	nop
 494:	6e037840 	ext	v0.16b, v2.16b, v3.16b, #15
 498:	6e047861 	ext	v1.16b, v3.16b, v4.16b, #15
 49c:	6e057887 	ext	v7.16b, v4.16b, v5.16b, #15
 4a0:	ac810460 	stp	q0, q1, [x3], #32
 4a4:	f9814021 	prfm	pldl1strm, [x1, #640]
 4a8:	acc10c22 	ldp	q2, q3, [x1], #32
 4ac:	6e0678b0 	ext	v16.16b, v5.16b, v6.16b, #15
 4b0:	ac814067 	stp	q7, q16, [x3], #32
 4b4:	6e0278c0 	ext	v0.16b, v6.16b, v2.16b, #15
 4b8:	6e037841 	ext	v1.16b, v2.16b, v3.16b, #15
 4bc:	acc11825 	ldp	q5, q6, [x1], #32
 4c0:	6e057867 	ext	v7.16b, v3.16b, v5.16b, #15
 4c4:	f1010042 	subs	x2, x2, #0x40
 4c8:	54fffeca 	b.ge	4a0 <__memcpy_thunderx2+0x280>
 4cc:	6e0678b0 	ext	v16.16b, v5.16b, v6.16b, #15
 4d0:	140000e1 	b	854 <__memcpy_thunderx2+0x634>
 4d4:	6e037040 	ext	v0.16b, v2.16b, v3.16b, #14
 4d8:	6e047061 	ext	v1.16b, v3.16b, v4.16b, #14
 4dc:	6e057087 	ext	v7.16b, v4.16b, v5.16b, #14
 4e0:	ac810460 	stp	q0, q1, [x3], #32
 4e4:	f9814021 	prfm	pldl1strm, [x1, #640]
 4e8:	acc10c22 	ldp	q2, q3, [x1], #32
 4ec:	6e0670b0 	ext	v16.16b, v5.16b, v6.16b, #14
 4f0:	ac814067 	stp	q7, q16, [x3], #32
 4f4:	6e0270c0 	ext	v0.16b, v6.16b, v2.16b, #14
 4f8:	6e037041 	ext	v1.16b, v2.16b, v3.16b, #14
 4fc:	acc11825 	ldp	q5, q6, [x1], #32
 500:	6e057067 	ext	v7.16b, v3.16b, v5.16b, #14
 504:	f1010042 	subs	x2, x2, #0x40
 508:	54fffeca 	b.ge	4e0 <__memcpy_thunderx2+0x2c0>
 50c:	6e0670b0 	ext	v16.16b, v5.16b, v6.16b, #14
 510:	140000d1 	b	854 <__memcpy_thunderx2+0x634>

Looks OK?

On Tue, Mar 26, 2019 at 08:40:34PM +0000, Wilco Dijkstra wrote:
> Hi Anton,
>
> > I appreciate your comments very much. Here is the patch
> > considering the points you made.
> >
> > 1. The always-taken conditional branch at the beginning is
> > removed.
> >
> > 2. Epilogue code is placed after the end of the loop to
> > reduce the number of branches.
> >
> > 3. The redundant "mov" instructions inside the loop are
> > gone due to the changed order of the registers in the ext
> > instructions inside the loop.
> >
> > 4. Invariant code in the loop epilogue is no longer
> > repeated for each ext chunk.
>
> That looks much better indeed! The alignment can still be improved
> though:
>
> 819d0:	6e037840 	ext	v0.16b, v2.16b, v3.16b, #15
> 819d4:	6e047861 	ext	v1.16b, v3.16b, v4.16b, #15
> 819d8:	6e057887 	ext	v7.16b, v4.16b, v5.16b, #15
> 819dc:	ac810460 	stp	q0, q1, [x3], #32
> 819e0:	f9814021 	prfm	pldl1strm, [x1, #640]
> 819e4:	acc10c22 	ldp	q2, q3, [x1], #32
> 819e8:	6e0678b0 	ext	v16.16b, v5.16b, v6.16b, #15
> 819ec:	ac814067 	stp	q7, q16, [x3], #32
> 819f0:	6e0278c0 	ext	v0.16b, v6.16b, v2.16b, #15
> 819f4:	6e037841 	ext	v1.16b, v2.16b, v3.16b, #15
> 819f8:	acc11825 	ldp	q5, q6, [x1], #32
> 819fc:	6e057867 	ext	v7.16b, v3.16b, v5.16b, #15
> 81a00:	f1010042 	subs	x2, x2, #0x40
> 81a04:	54fffeca 	b.ge	819dc <__GI___memcpy_thunderx2+0x27c>
>
> So rather than aligning the first instruction as currently done:
>
> #define EXT_CHUNK(shft) \
> 	.p2align 4 ;\
>
> align the loop instead. If you also add 2 nops after the br instruction then
> everything should work out perfectly.
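The arithmetic behind Wilco's suggestion can be sanity-checked against the
disassembly at the top of this mail. The addresses and sizes below are read
off that dump ("br x6" at 0x488, two nops, EXT_CHUNK(1) at 0x494, 0x40 bytes
per chunk, three leading ext instructions) -- a minimal Python sketch, not
part of the patch:

```python
# With two nops after the br, each 0x40-byte EXT_CHUNK starts at an
# address == 4 (mod 16); the three 4-byte ext instructions then put the
# stp that heads the inner loop exactly on a 16-byte boundary.
br_addr = 0x488                     # "br x6" in the dump above
first_chunk = br_addr + 3 * 4       # br + two nops -> 0x494
for k in range(15):                 # EXT_CHUNK(1) .. EXT_CHUNK(15)
    chunk = first_chunk + k * 0x40  # each chunk assembles to 64 bytes
    loop_header = chunk + 3 * 4     # three ext insns precede the stp
    assert loop_header % 16 == 0, hex(loop_header)
print("all 15 loop headers are 16-byte aligned")
```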
>
> Cheers,
> Wilco
>

diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index b2215c1..45e9a29 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -382,7 +382,8 @@ L(bytes_0_to_3):
 	strb	A_lw, [dstin]
 	strb	B_lw, [dstin, tmp1]
 	strb	A_hw, [dstend, -1]
-L(end): ret
+L(end):
+	ret
 
 	.p2align 4
@@ -544,43 +545,35 @@ L(dst_unaligned):
 	str	C_q, [dst], #16
 	ldp	F_q, G_q, [src], #32
 	bic	dst, dst, 15
+	subs	count, count, 32
 	adrp	tmp2, L(ext_table)
 	add	tmp2, tmp2, :lo12:L(ext_table)
 	add	tmp2, tmp2, tmp1, LSL #2
 	ldr	tmp3w, [tmp2]
 	add	tmp2, tmp2, tmp3w, SXTW
 	br	tmp2
-
-#define EXT_CHUNK(shft) \
 	.p2align 4 ;\
+	nop
+#define EXT_CHUNK(shft) \
 L(ext_size_ ## shft):;\
 	ext	A_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ext	B_v.16b, D_v.16b, E_v.16b, 16-shft;\
-	subs	count, count, 32;\
-	b.ge	2f;\
-1:;\
-	stp	A_q, B_q, [dst], #32;\
 	ext	H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	stp	H_q, I_q, [dst], #16;\
-	add	dst, dst, tmp1;\
-	str	G_q, [dst], #16;\
-	b	L(copy_long_check32);\
-2:;\
+1:;\
 	stp	A_q, B_q, [dst], #32;\
 	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
-	ldp	D_q, J_q, [src], #32;\
-	ext	H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+	ldp	C_q, D_q, [src], #32;\
 	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	mov	C_v.16b, G_v.16b;\
 	stp	H_q, I_q, [dst], #32;\
+	ext	A_v.16b, G_v.16b, C_v.16b, 16-shft;\
+	ext	B_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ldp	F_q, G_q, [src], #32;\
-	ext	A_v.16b, C_v.16b, D_v.16b, 16-shft;\
-	ext	B_v.16b, D_v.16b, J_v.16b, 16-shft;\
-	mov	E_v.16b, J_v.16b;\
+	ext	H_v.16b, D_v.16b, F_v.16b, 16-shft;\
 	subs	count, count, 64;\
-	b.ge	2b;\
-	b	1b;\
+	b.ge	1b;\
+2:;\
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+	b	L(ext_tail);
 
 EXT_CHUNK(1)
 EXT_CHUNK(2)
@@ -598,6 +591,14 @@ EXT_CHUNK(13)
 EXT_CHUNK(14)
 EXT_CHUNK(15)
+L(ext_tail):
+	stp	A_q, B_q, [dst], #32
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+
+
 END (MEMCPY)
 	.section .rodata
 	.p2align 4
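For readers less familiar with EXT: each step of EXT_CHUNK relies on EXT
extracting a 16-byte window from the concatenation of two vector registers.
A byte-level model in Python (a sketch for illustration only; the register
comments mirror the macro above, and imm = 16-shft as in the patch):

```python
def ext16(vn: bytes, vm: bytes, imm: int) -> bytes:
    """Model of AArch64 EXT Vd.16B, Vn.16B, Vm.16B, #imm: the result is
    bytes imm..imm+15 of the 32-byte concatenation Vn:Vm (Vn is low)."""
    assert len(vn) == 16 and len(vm) == 16 and 0 <= imm < 16
    return (vn + vm)[imm:imm + 16]

# With two adjacent 16-byte loads and imm = 16 - shft (shft = 1, so
# imm = 15), EXT reproduces a 16-byte window of the source stream shifted
# by shft bytes -- which is how the loop realigns stores for an unaligned
# destination using only aligned ldp/stp.
lo = bytes(range(0, 16))    # e.g. C_q: source bytes 0..15
hi = bytes(range(16, 32))   # e.g. D_q: source bytes 16..31
assert ext16(lo, hi, 15) == bytes(range(15, 31))
```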