Add ifunc memcpy and memmove for aarch64
Commit Message
On 07/02/2017 11:01, Siddhesh Poyarekar wrote:
> On Tuesday 07 February 2017 06:12 PM, Wilco Dijkstra wrote:
>> I agree we want to avoid using conditional compilation as much as possible.
>> On the other hand duplication is a bad idea too, I've seen too many cases where
>> bugs were only fixed in one of the N duplicates.
>
> Sure, but then in that case the de-duplication must be done by
> identifying a logical code block and make that into a macro to override
> and not just arbitrarily inject hunks of code. So in this case it could
> be alternate implementations of copy_long that is sufficient so #define
> COPY_LONG in both memcpy_generic and memcpy_thunderx and have the parent
> (memcpy.S) use that macro. In fact, that might even end up making the
> code a bit nicer to read.
>
>> However I'm actually wondering whether we need an ifunc for this case.
>> For large copies from L2 I think adding a prefetch should be benign even on
>> cores that don't need it, so if the benchmarks confirm this we should consider
>> updating the generic memcpy.
>
> That is a call that ARM maintainers can take and is also another reason
> to separate the IFUNC infrastructure code from the thunderx change.
I checked only the memcpy change on an APM X-Gene 1 and the results seem to show
improvements on aligned input, at least for sizes shorter than 4MB. I would
like to check on more armv8 chips, but it does seem a nice improvement
over the generic implementation.
memcpy
Length 65543, alignment 0/ 0: 4553.71
Length 65551, alignment 0/ 3: 11239.8
Length 65567, alignment 3/ 0: 11201.6
Length 65599, alignment 3/ 5: 11221.2
Length 131079, alignment 0/ 0: 9023.67
Length 131087, alignment 0/ 3: 22489.5
Length 131103, alignment 3/ 0: 22439.6
Length 131135, alignment 3/ 5: 22426.3
Length 262151, alignment 0/ 0: 21198.5
Length 262159, alignment 0/ 3: 48474
Length 262175, alignment 3/ 0: 48292.3
Length 262207, alignment 3/ 5: 48545.1
Length 524295, alignment 0/ 0: 43480.7
Length 524303, alignment 0/ 3: 93729.3
Length 524319, alignment 3/ 0: 93706.8
Length 524351, alignment 3/ 5: 93809.2
Length 1048583, alignment 0/ 0: 86732.2
Length 1048591, alignment 0/ 3: 187419
Length 1048607, alignment 3/ 0: 187153
Length 1048639, alignment 3/ 5: 187384
Length 2097159, alignment 0/ 0: 173630
Length 2097167, alignment 0/ 3: 373671
Length 2097183, alignment 3/ 0: 373776
Length 2097215, alignment 3/ 5: 374153
Length 4194311, alignment 0/ 0: 383575
Length 4194319, alignment 0/ 3: 752044
Length 4194335, alignment 3/ 0: 750919
Length 4194367, alignment 3/ 5: 751680
Length 8388615, alignment 0/ 0: 1.24695e+06
Length 8388623, alignment 0/ 3: 1.6407e+06
Length 8388639, alignment 3/ 0: 1.63961e+06
Length 8388671, alignment 3/ 5: 1.6407e+06
Length 16777223, alignment 0/ 0: 2.7774e+06
Length 16777231, alignment 0/ 3: 3.34092e+06
Length 16777247, alignment 3/ 0: 3.33036e+06
Length 16777279, alignment 3/ 5: 3.33811e+06
Length 33554439, alignment 0/ 0: 5.4628e+06
Length 33554447, alignment 0/ 3: 6.56429e+06
Length 33554463, alignment 3/ 0: 6.56451e+06
Length 33554495, alignment 3/ 5: 6.5654e+06
memcpy
Length 65543, alignment 0/ 0: 5590.23
Length 65551, alignment 0/ 3: 11171
Length 65567, alignment 3/ 0: 11146.2
Length 65599, alignment 3/ 5: 11154.1
Length 131079, alignment 0/ 0: 11109
Length 131087, alignment 0/ 3: 22266.3
Length 131103, alignment 3/ 0: 22296.1
Length 131135, alignment 3/ 5: 22257.1
Length 262151, alignment 0/ 0: 22780.6
Length 262159, alignment 0/ 3: 46212.7
Length 262175, alignment 3/ 0: 45999.7
Length 262207, alignment 3/ 5: 46221.3
Length 524295, alignment 0/ 0: 47787.3
Length 524303, alignment 0/ 3: 93263.7
Length 524319, alignment 3/ 0: 93028.3
Length 524351, alignment 3/ 5: 93301.5
Length 1048583, alignment 0/ 0: 95413.2
Length 1048591, alignment 0/ 3: 186367
Length 1048607, alignment 3/ 0: 185780
Length 1048639, alignment 3/ 5: 186296
Length 2097159, alignment 0/ 0: 190546
Length 2097167, alignment 0/ 3: 372310
Length 2097183, alignment 3/ 0: 371187
Length 2097215, alignment 3/ 5: 372281
Length 4194311, alignment 0/ 0: 379009
Length 4194319, alignment 0/ 3: 736763
Length 4194335, alignment 3/ 0: 733672
Length 4194367, alignment 3/ 5: 736531
Length 8388615, alignment 0/ 0: 1.26684e+06
Length 8388623, alignment 0/ 3: 1.61883e+06
Length 8388639, alignment 3/ 0: 1.6062e+06
Length 8388671, alignment 3/ 5: 1.61872e+06
Length 16777223, alignment 0/ 0: 2.68259e+06
Length 16777231, alignment 0/ 3: 3.24415e+06
Length 16777247, alignment 3/ 0: 3.23356e+06
Length 16777279, alignment 3/ 5: 3.2449e+06
Length 33554439, alignment 0/ 0: 5.47245e+06
Length 33554447, alignment 0/ 3: 6.56719e+06
Length 33554463, alignment 3/ 0: 6.55255e+06
Length 33554495, alignment 3/ 5: 6.56698e+06
@@ -158,10 +158,13 @@ L(copy96):
.p2align 4
L(copy_long):
+ cmp count, #32768
+ b.lo L(copy_long_without_prefetch)
and tmp1, dstin, 15
bic dst, dstin, 15
ldp D_l, D_h, [src]
sub src, src, tmp1
+ prfm pldl1strm, [src, 384]
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
stp D_l, D_h, [dstin]
@@ -169,7 +172,10 @@ L(copy_long):
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls 2f
+
+L(prefetch_loop64):
+ tbz src, #6, 1f
+ prfm pldl1strm, [src, 512]
1:
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
@@ -180,12 +186,39 @@ L(copy_long):
stp D_l, D_h, [dst, 64]!
ldp D_l, D_h, [src, 64]!
subs count, count, 64
- b.hi 1b
+ b.hi L(prefetch_loop64)
+ b L(last64)
+
+L(copy_long_without_prefetch):
+
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(last64)
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
/* Write the last full set of 64 bytes. The remainder is at most 64
bytes, so it is safe to always copy 64 bytes from the end even if
there is just 1 byte left. */
-2:
+L(last64):
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]