S390: Optimize __memcpy_z196.
Commit Message
This patch introduces an extra loop without pfd instructions,
as it turned out that the pfd instructions are useful
for copies >=64KB but are counterproductive for smaller copies.
---
sysdeps/s390/memcpy-z900.S | 21 +++++++++++++++------
1 file changed, 15 insertions(+), 6 deletions(-)
Comments
Just as information, if nobody opposes, I'll commit this patch tomorrow.
On 6/19/20 3:49 PM, Stefan Liebler wrote:
> This patch introduces an extra loop without pfd instructions,
> as it turned out that the pfd instructions are useful
> for copies >=64KB but are counterproductive for smaller copies.
> ---
> sysdeps/s390/memcpy-z900.S | 21 +++++++++++++++------
> 1 file changed, 15 insertions(+), 6 deletions(-)
>
> diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S
> index f2e9aaeb2d..dc2f491ec3 100644
> --- a/sysdeps/s390/memcpy-z900.S
> +++ b/sysdeps/s390/memcpy-z900.S
> @@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196)
> je .L_Z196_4
> .L_Z196_start2:
> aghi %r4,-1
> - srlg %r5,%r4,8
> - ltgr %r5,%r5
> + risbg %r5,%r4,8,128+63,56 # r0 = r5 / 256
> jne .L_Z196_5
> .L_Z196_3:
> exrl %r4,.L_Z196_14
> .L_Z196_4:
> br %r14
> .L_Z196_5:
> - cgfi %r5,262144 # Switch to mvcle for copies >64MB
> - jh __memcpy_mvcle
> + cgfi %r5,255 # Switch to loop with pfd for copies >=64kB
> + jh .L_Z196_6
> .L_Z196_2:
> - pfd 1,768(%r3)
> - pfd 2,768(%r1)
> mvc 0(256,%r1),0(%r3)
> aghi %r5,-1
> la %r1,256(%r1)
> la %r3,256(%r3)
> jne .L_Z196_2
> j .L_Z196_3
> +.L_Z196_6:
> + cgfi %r5,262144 # Switch to mvcle for copies >64MB
> + jh __memcpy_mvcle
> +.L_Z196_7:
> + pfd 1,1024(%r3)
> + pfd 2,1024(%r1)
> + mvc 0(256,%r1),0(%r3)
> + aghi %r5,-1
> + la %r1,256(%r1)
> + la %r3,256(%r3)
> + jne .L_Z196_7
> + j .L_Z196_3
> .L_Z196_14:
> mvc 0(1,%r1),0(%r3)
> END(MEMCPY_Z196)
>
committed
On 6/25/20 10:18 AM, Stefan Liebler via Libc-alpha wrote:
> Just as information, if nobody opposes, I'll commit this patch tomorrow.
>
> On 6/19/20 3:49 PM, Stefan Liebler wrote:
>> This patch introduces an extra loop without pfd instructions,
>> as it turned out that the pfd instructions are useful
>> for copies >=64KB but are counterproductive for smaller copies.
>> ---
>> sysdeps/s390/memcpy-z900.S | 21 +++++++++++++++------
>> 1 file changed, 15 insertions(+), 6 deletions(-)
>>
>> diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S
>> index f2e9aaeb2d..dc2f491ec3 100644
>> --- a/sysdeps/s390/memcpy-z900.S
>> +++ b/sysdeps/s390/memcpy-z900.S
>> @@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196)
>> je .L_Z196_4
>> .L_Z196_start2:
>> aghi %r4,-1
>> - srlg %r5,%r4,8
>> - ltgr %r5,%r5
>> + risbg %r5,%r4,8,128+63,56 # r0 = r5 / 256
>> jne .L_Z196_5
>> .L_Z196_3:
>> exrl %r4,.L_Z196_14
>> .L_Z196_4:
>> br %r14
>> .L_Z196_5:
>> - cgfi %r5,262144 # Switch to mvcle for copies >64MB
>> - jh __memcpy_mvcle
>> + cgfi %r5,255 # Switch to loop with pfd for copies >=64kB
>> + jh .L_Z196_6
>> .L_Z196_2:
>> - pfd 1,768(%r3)
>> - pfd 2,768(%r1)
>> mvc 0(256,%r1),0(%r3)
>> aghi %r5,-1
>> la %r1,256(%r1)
>> la %r3,256(%r3)
>> jne .L_Z196_2
>> j .L_Z196_3
>> +.L_Z196_6:
>> + cgfi %r5,262144 # Switch to mvcle for copies >64MB
>> + jh __memcpy_mvcle
>> +.L_Z196_7:
>> + pfd 1,1024(%r3)
>> + pfd 2,1024(%r1)
>> + mvc 0(256,%r1),0(%r3)
>> + aghi %r5,-1
>> + la %r1,256(%r1)
>> + la %r3,256(%r3)
>> + jne .L_Z196_7
>> + j .L_Z196_3
>> .L_Z196_14:
>> mvc 0(1,%r1),0(%r3)
>> END(MEMCPY_Z196)
>>
>
@@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196)
je .L_Z196_4
.L_Z196_start2:
aghi %r4,-1
- srlg %r5,%r4,8
- ltgr %r5,%r5
+ risbg %r5,%r4,8,128+63,56 # r0 = r5 / 256
jne .L_Z196_5
.L_Z196_3:
exrl %r4,.L_Z196_14
.L_Z196_4:
br %r14
.L_Z196_5:
- cgfi %r5,262144 # Switch to mvcle for copies >64MB
- jh __memcpy_mvcle
+ cgfi %r5,255 # Switch to loop with pfd for copies >=64kB
+ jh .L_Z196_6
.L_Z196_2:
- pfd 1,768(%r3)
- pfd 2,768(%r1)
mvc 0(256,%r1),0(%r3)
aghi %r5,-1
la %r1,256(%r1)
la %r3,256(%r3)
jne .L_Z196_2
j .L_Z196_3
+.L_Z196_6:
+ cgfi %r5,262144 # Switch to mvcle for copies >64MB
+ jh __memcpy_mvcle
+.L_Z196_7:
+ pfd 1,1024(%r3)
+ pfd 2,1024(%r1)
+ mvc 0(256,%r1),0(%r3)
+ aghi %r5,-1
+ la %r1,256(%r1)
+ la %r3,256(%r3)
+ jne .L_Z196_7
+ j .L_Z196_3
.L_Z196_14:
mvc 0(1,%r1),0(%r3)
END(MEMCPY_Z196)