aarch64: thunderx2 memcpy implementation cleanup and streamlining

Message ID 20190320143141.GA13393@bell-sw.com
State Superseded

Commit Message

Anton Youdkevitch March 20, 2019, 2:31 p.m. UTC
Replaced named labels with numeric ones to reduce clutter.
Rewrote the branching scheme in the load-and-merge chunk so that
the most probable case takes the fall-through path.

ChangeLog:

	* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: cleanup
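
For context on the second point: "the most probable case takes the
fall-through path" refers to laying out the rewritten load-and-merge
loop (the EXT_CHUNK hunk in the patch below) so that the expected path,
another full iteration, falls straight through, while the rarely taken
tail is reached by a branch. A minimal standalone sketch of that layout,
not taken from the glibc code (hypothetical copy_chunks routine,
assuming the byte count in x2 is a non-negative multiple of 16):

        /* copy_chunks: hypothetical example, not part of memcpy_thunderx2.S.
           Copies x2 bytes (a multiple of 16) from x1 to x0 in 16-byte
           chunks; the likely "keep looping" path falls through, the
           rare "done" path takes the branch.  */
        .text
        .global copy_chunks
copy_chunks:
        subs    x2, x2, 16
        b.lt    2f                      /* uncommon: no full chunk left */
1:                                      /* common case falls into the loop */
        ldr     q0, [x1], 16
        str     q0, [x0], 16
        subs    x2, x2, 16
        b.ge    1b                      /* keep going while a chunk remains */
2:
        ret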
  

Comments

Patrick McGehearty March 20, 2019, 2:52 p.m. UTC | #1
I'll note that the named labels are helpful
for understanding the intent of the code
while the numeric labels make it more difficult
to find all the sources for a particular branch target.

Bottom line: numeric labels make assembly code
more difficult to comprehend and maintain if
you are not completely familiar with its logic.

I've done considerable work in hand optimization
of assembly language code (multiple architectures)
over the years and generally find numeric labels
to be troublesome, especially for larger routines.
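
For readers not steeped in this code: the named labels here are the
L(...) form glibc uses for local assembler labels, while the numeric
labels are GNU as local labels, where "1f" and "1b" resolve to the
nearest following or preceding "1:" and the same number can be reused
throughout a file. A small hypothetical fragment, not taken from the
patch, showing the same check in both styles:

        /* Hypothetical fragment, not from memcpy_thunderx2.S: a
           small-copy check written once with a named label and once
           with a GNU as numeric local label.  */
        .text
        .global copy_named
copy_named:
        cmp     x2, 8
        b.lo    .Lsmall                 /* named target: unique and greppable */
        ldr     x3, [x1]
        str     x3, [x0]
.Lsmall:
        ret

        .global copy_numeric
copy_numeric:
        cmp     x2, 8
        b.lo    1f                      /* "1f": nearest following "1:" */
        ldr     x3, [x1]
        str     x3, [x0]
1:                                      /* the number may be reused later on */
        ret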


On 3/20/2019 9:31 AM, Anton Youdkevitch wrote:
> Replaced named labels with numeric ones to reduce clutter.
> Rewrote the branching scheme in the load-and-merge chunk so that
> the most probable case takes the fall-through path.
>
> ChangeLog:
>
> 	* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: cleanup
>
  
Adhemerval Zanella Netto March 20, 2019, 5:46 p.m. UTC | #2
I agree; I see no point in just removing the branch names.
Also, for the branching rewrite you need to show whether it has
any performance implications, with at least the benchtests results
for memcpy.

On 20/03/2019 11:52, Patrick McGehearty wrote:
> I'll note that the named labels are helpful
> for understanding the intent of the code
> while the numeric labels make it more difficult
> to find all the sources for a particular branch target.
> 
> Bottom line: numeric labels make assembly code
> more difficult to comprehend and maintain if
> you are not completely familiar with its logic.
> 
> I've done considerable work in hand optimization
> of assembly language code (multiple architectures)
> over the years and generally find numeric labels
> to be troublesome, especially for larger routines.
> 
> 
> On 3/20/2019 9:31 AM, Anton Youdkevitch wrote:
>> Replaced named labels with numeric ones to reduce clutter.
>> Rewrote the branching scheme in the load-and-merge chunk so that
>> the most probable case takes the fall-through path.
>>
>> ChangeLog:
>>
>>     * sysdeps/aarch64/multiarch/memcpy_thunderx2.S: cleanup
>>
>
  

Patch

diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index b2215c1..b55dd52 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -352,7 +352,7 @@  L(bytes_32_to_48):
 	/* Small copies: 0..16 bytes.  */
 L(memcopy16):
 	cmp     count, 8
-	b.lo    L(bytes_0_to_8)
+	b.lo    1f
 	ldr     A_l, [src]
 	ldr     A_h, [srcend, -8]
 	add     dstend, dstin, count
@@ -361,8 +361,8 @@  L(memcopy16):
 	ret
 	.p2align 4
 
-L(bytes_0_to_8):
-	tbz     count, 2, L(bytes_0_to_3)
+1:
+	tbz     count, 2, 1f
 	ldr     A_lw, [src]
 	ldr     A_hw, [srcend, -4]
 	add     dstend, dstin, count
@@ -372,8 +372,8 @@  L(bytes_0_to_8):
 
 	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
 	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-L(bytes_0_to_3):
-	cbz     count, L(end)
+1:
+	cbz     count, 1f
 	lsr     tmp1, count, 1
 	ldrb    A_lw, [src]
 	ldrb    A_hw, [srcend, -1]
@@ -382,7 +382,8 @@  L(bytes_0_to_3):
 	strb    A_lw, [dstin]
 	strb    B_lw, [dstin, tmp1]
 	strb    A_hw, [dstend, -1]
-L(end): ret
+1:
+	ret
 
 	.p2align 4
 
@@ -404,7 +405,7 @@  L(memcpy_copy96):
 	   after tmp [0..15] gets added to it,
 	   count now is <bytes-left-to-load>+48 */
 	cmp     count, 80
-	b.gt    L(copy96_medium)
+	b.gt    1f
 	ldr     D_q, [src, 32]
 	stp     B_q, C_q, [dst, 16]
 	str     E_q, [dstend, -16]
@@ -412,17 +413,17 @@  L(memcpy_copy96):
 	ret
 
 	.p2align 4
-L(copy96_medium):
+1:
 	ldp     D_q, A_q, [src, 32]
 	str     B_q, [dst, 16]
 	cmp     count, 96
-	b.gt    L(copy96_large)
+	b.gt    1f
 	str     E_q, [dstend, -16]
 	stp     C_q, D_q, [dst, 32]
 	str     A_q, [dst, 64]
 	ret
 
-L(copy96_large):
+1:
 	ldr     F_q, [src, 64]
 	stp     C_q, D_q, [dst, 32]
 	str     E_q, [dstend, -16]
@@ -557,17 +558,9 @@  L(ext_size_ ## shft):;\
 	ext     A_v.16b, C_v.16b, D_v.16b, 16-shft;\
 	ext     B_v.16b, D_v.16b, E_v.16b, 16-shft;\
 	subs    count, count, 32;\
-	b.ge    2f;\
+	b.lt    2f;\
 1:;\
 	stp     A_q, B_q, [dst], #32;\
-	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
-	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
-	stp     H_q, I_q, [dst], #16;\
-	add     dst, dst, tmp1;\
-	str     G_q, [dst], #16;\
-	b       L(copy_long_check32);\
-2:;\
-	stp     A_q, B_q, [dst], #32;\
 	prfm    pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
 	ldp     D_q, J_q, [src], #32;\
 	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
@@ -579,8 +572,15 @@  L(ext_size_ ## shft):;\
 	ext     B_v.16b, D_v.16b, J_v.16b, 16-shft;\
 	mov     E_v.16b, J_v.16b;\
 	subs    count, count, 64;\
-	b.ge    2b;\
-	b	1b;\
+	b.ge    1b;\
+2:;\
+	stp     A_q, B_q, [dst], #32;\
+	ext     H_v.16b, E_v.16b, F_v.16b, 16-shft;\
+	ext     I_v.16b, F_v.16b, G_v.16b, 16-shft;\
+	stp     H_q, I_q, [dst], #16;\
+	add     dst, dst, tmp1;\
+	str     G_q, [dst], #16;\
+	b       L(copy_long_check32);\
 
 EXT_CHUNK(1)
 EXT_CHUNK(2)