Add ifunc memcpy and memmove for aarch64

Message ID: c0e44c88-244d-e327-de4b-02470c3a78cf@linaro.org
State: Dropped

Commit Message

Adhemerval Zanella Netto Feb. 7, 2017, 1:22 p.m. UTC
  On 07/02/2017 11:01, Siddhesh Poyarekar wrote:
> On Tuesday 07 February 2017 06:12 PM, Wilco Dijkstra wrote:
>> I agree we want to avoid using conditional compilation as much as possible.
>> On the other hand, duplication is a bad idea too; I've seen too many cases where
>> bugs were only fixed in one of the N duplicates.
> 
> Sure, but in that case the de-duplication must be done by identifying
> a logical code block and making that into a macro to override, not by
> arbitrarily injecting hunks of code.  In this case alternate
> implementations of copy_long could be sufficient: #define COPY_LONG in
> both memcpy_generic and memcpy_thunderx and have the parent (memcpy.S)
> use that macro.  In fact, that might even end up making the code a bit
> nicer to read.
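
Something like the following shape, as a rough sketch in C for brevity
(memcpy_body.h, MEMCPY_VARIANT and the loop bodies are illustrative
placeholders, not code from the patch):

/* --- memcpy_body.h: shared body; all names here are hypothetical --- */
#include <stddef.h>

#ifndef COPY_LONG
/* Default bulk-copy hook: plain loop, no prefetch.  */
# define COPY_LONG(d, s, n)			\
  do						\
    for (size_t i_ = 0; i_ < (n); i_++)		\
      (d)[i_] = (s)[i_];			\
  while (0)
#endif

void *
MEMCPY_VARIANT (void *dstin, const void *srcin, size_t count)
{
  unsigned char *dst = dstin;
  const unsigned char *src = srcin;
  COPY_LONG (dst, src, count);
  return dstin;
}

/* --- memcpy_thunderx.c: override the hook, reuse the rest --- */
#define COPY_LONG(d, s, n)				\
  do							\
    for (size_t i_ = 0; i_ < (n); i_++)			\
      {							\
	/* One software prefetch per 128 bytes.  */	\
	if ((i_ % 128) == 0)				\
	  __builtin_prefetch ((s) + i_ + 512, 0, 0);	\
	(d)[i_] = (s)[i_];				\
      }							\
  while (0)
#define MEMCPY_VARIANT __memcpy_thunderx
#include "memcpy_body.h"

That way each variant carries only the code that actually differs,
which also addresses the duplication concern above.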
> 
>> However I'm actually wondering whether we need an ifunc for this case.
>> For large copies from L2 I think adding a prefetch should be benign even on 
>> cores that don't need it, so if the benchmarks confirm this we should consider
>> updating the generic memcpy.
> 
> That is a call that ARM maintainers can take and is also another reason
> to separate the IFUNC infrastructure code from the thunderx change.
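
For context, the dispatch side of that infrastructure can be kept quite
small.  Roughly (a sketch only: cpu_is_thunderx and the variant names
are placeholders, not glibc's actual multiarch code, which keys off
MIDR_EL1):

#include <stddef.h>

/* Variant implementations, defined elsewhere (e.g. in .S files).  */
extern void *__memcpy_generic (void *, const void *, size_t);
extern void *__memcpy_thunderx (void *, const void *, size_t);

/* Placeholder CPU check; real code would compare the implementer and
   part-number fields of MIDR_EL1 as exposed by the kernel.  */
extern int cpu_is_thunderx (void);

/* The dynamic linker runs the resolver once at relocation time and
   binds memcpy to whichever implementation it returns.  */
void *(*select_memcpy (void)) (void *, const void *, size_t)
{
  return cpu_is_thunderx () ? __memcpy_thunderx : __memcpy_generic;
}

void *memcpy (void *, const void *, size_t)
  __attribute__ ((ifunc ("select_memcpy")));

With that split in place, adding or swapping variants never touches the
generic implementation itself.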

I checked only the memcpy change on an APM X-Gene 1, and the results
seem to show an improvement on aligned input, at least for sizes
shorter than 4MB.  I would like to check on more ARMv8 chips, but it
does look like a nice improvement over the generic implementation.
memcpy (patched, with prefetch; timings, lower is better):
Length 65543, alignment  0/ 0:	4553.71
Length 65551, alignment  0/ 3:	11239.8
Length 65567, alignment  3/ 0:	11201.6
Length 65599, alignment  3/ 5:	11221.2
Length 131079, alignment  0/ 0:	9023.67
Length 131087, alignment  0/ 3:	22489.5
Length 131103, alignment  3/ 0:	22439.6
Length 131135, alignment  3/ 5:	22426.3
Length 262151, alignment  0/ 0:	21198.5
Length 262159, alignment  0/ 3:	48474
Length 262175, alignment  3/ 0:	48292.3
Length 262207, alignment  3/ 5:	48545.1
Length 524295, alignment  0/ 0:	43480.7
Length 524303, alignment  0/ 3:	93729.3
Length 524319, alignment  3/ 0:	93706.8
Length 524351, alignment  3/ 5:	93809.2
Length 1048583, alignment  0/ 0:	86732.2
Length 1048591, alignment  0/ 3:	187419
Length 1048607, alignment  3/ 0:	187153
Length 1048639, alignment  3/ 5:	187384
Length 2097159, alignment  0/ 0:	173630
Length 2097167, alignment  0/ 3:	373671
Length 2097183, alignment  3/ 0:	373776
Length 2097215, alignment  3/ 5:	374153
Length 4194311, alignment  0/ 0:	383575
Length 4194319, alignment  0/ 3:	752044
Length 4194335, alignment  3/ 0:	750919
Length 4194367, alignment  3/ 5:	751680
Length 8388615, alignment  0/ 0:	1.24695e+06
Length 8388623, alignment  0/ 3:	1.6407e+06
Length 8388639, alignment  3/ 0:	1.63961e+06
Length 8388671, alignment  3/ 5:	1.6407e+06
Length 16777223, alignment  0/ 0:	2.7774e+06
Length 16777231, alignment  0/ 3:	3.34092e+06
Length 16777247, alignment  3/ 0:	3.33036e+06
Length 16777279, alignment  3/ 5:	3.33811e+06
Length 33554439, alignment  0/ 0:	5.4628e+06
Length 33554447, alignment  0/ 3:	6.56429e+06
Length 33554463, alignment  3/ 0:	6.56451e+06
Length 33554495, alignment  3/ 5:	6.5654e+06
memcpy (unpatched generic):
Length 65543, alignment  0/ 0:	5590.23
Length 65551, alignment  0/ 3:	11171
Length 65567, alignment  3/ 0:	11146.2
Length 65599, alignment  3/ 5:	11154.1
Length 131079, alignment  0/ 0:	11109
Length 131087, alignment  0/ 3:	22266.3
Length 131103, alignment  3/ 0:	22296.1
Length 131135, alignment  3/ 5:	22257.1
Length 262151, alignment  0/ 0:	22780.6
Length 262159, alignment  0/ 3:	46212.7
Length 262175, alignment  3/ 0:	45999.7
Length 262207, alignment  3/ 5:	46221.3
Length 524295, alignment  0/ 0:	47787.3
Length 524303, alignment  0/ 3:	93263.7
Length 524319, alignment  3/ 0:	93028.3
Length 524351, alignment  3/ 5:	93301.5
Length 1048583, alignment  0/ 0:	95413.2
Length 1048591, alignment  0/ 3:	186367
Length 1048607, alignment  3/ 0:	185780
Length 1048639, alignment  3/ 5:	186296
Length 2097159, alignment  0/ 0:	190546
Length 2097167, alignment  0/ 3:	372310
Length 2097183, alignment  3/ 0:	371187
Length 2097215, alignment  3/ 5:	372281
Length 4194311, alignment  0/ 0:	379009
Length 4194319, alignment  0/ 3:	736763
Length 4194335, alignment  3/ 0:	733672
Length 4194367, alignment  3/ 5:	736531
Length 8388615, alignment  0/ 0:	1.26684e+06
Length 8388623, alignment  0/ 3:	1.61883e+06
Length 8388639, alignment  3/ 0:	1.6062e+06
Length 8388671, alignment  3/ 5:	1.61872e+06
Length 16777223, alignment  0/ 0:	2.68259e+06
Length 16777231, alignment  0/ 3:	3.24415e+06
Length 16777247, alignment  3/ 0:	3.23356e+06
Length 16777279, alignment  3/ 5:	3.2449e+06
Length 33554439, alignment  0/ 0:	5.47245e+06
Length 33554447, alignment  0/ 3:	6.56719e+06
Length 33554463, alignment  3/ 0:	6.55255e+06
Length 33554495, alignment  3/ 5:	6.56698e+06

Patch

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 29af8b1..4742a01 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -158,10 +158,13 @@  L(copy96):
 
 	.p2align 4
 L(copy_long):
+	cmp	count, #32768
+	b.lo	L(copy_long_without_prefetch)
 	and	tmp1, dstin, 15
 	bic	dst, dstin, 15
 	ldp	D_l, D_h, [src]
 	sub	src, src, tmp1
+	prfm	pldl1strm, [src, 384]
 	add	count, count, tmp1	/* Count is now 16 too large.  */
 	ldp	A_l, A_h, [src, 16]
 	stp	D_l, D_h, [dstin]
@@ -169,7 +172,10 @@  L(copy_long):
 	ldp	C_l, C_h, [src, 48]
 	ldp	D_l, D_h, [src, 64]!
 	subs	count, count, 128 + 16	/* Test and readjust count.  */
-	b.ls	2f
+
+L(prefetch_loop64):
+	tbz	src, #6, 1f
+	prfm	pldl1strm, [src, 512]
 1:
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [src, 16]
@@ -180,12 +186,39 @@  L(copy_long):
 	stp	D_l, D_h, [dst, 64]!
 	ldp	D_l, D_h, [src, 64]!
 	subs	count, count, 64
-	b.hi	1b
+	b.hi	L(prefetch_loop64)
+	b	L(last64)
+
+L(copy_long_without_prefetch):
+
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(last64)
+L(loop64):
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(loop64)
 
 	/* Write the last full set of 64 bytes.  The remainder is at most 64
 	   bytes, so it is safe to always copy 64 bytes from the end even if
 	   there is just 1 byte left.  */
-2:
+L(last64):
 	ldp	E_l, E_h, [srcend, -64]
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [srcend, -48]
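
For readers not fluent in the assembly, the patched L(copy_long)
behaves roughly like the C sketch below.  The function and variable
names are mine, and the byte loops stand in for the four ldp/stp pairs
that move 64 bytes per iteration in the real code:

#include <stddef.h>
#include <stdint.h>

void
copy_long_sketch (unsigned char *dst, const unsigned char *src,
		  size_t count)
{
  /* cmp count, #32768; b.lo: small copies keep the old loop.  */
  if (count < 32768)
    {
      /* L(copy_long_without_prefetch): the pre-patch code path.  */
      for (size_t i = 0; i < count; i++)
	dst[i] = src[i];
      return;
    }

  /* prfm pldl1strm, [src, 384]: prime the L1 streaming prefetch.  */
  __builtin_prefetch (src + 384, 0, 0);

  for (size_t i = 0; i < count; i += 64)
    {
      /* tbz src, #6, 1f: bit 6 of the source address toggles every
	 64 bytes, so this issues one prefetch per 128 bytes copied,
	 512 bytes ahead of the current read position.  */
      if (((uintptr_t) (src + i) >> 6) & 1)
	__builtin_prefetch (src + i + 512, 0, 0);

      /* One L(prefetch_loop64)/L(loop64) step: copy a 64-byte block.  */
      for (size_t j = 0; j < 64 && i + j < count; j++)
	dst[i + j] = src[i + j];
    }
}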