S390: Optimize __memcpy_z196.

Message ID 20200619134944.29699-1-stli@linux.ibm.com
State Committed
Headers
Series S390: Optimize __memcpy_z196. |

Commit Message

Stefan Liebler June 19, 2020, 1:49 p.m. UTC
  This patch introduces an extra loop without pfd instructions
as it turned out that the pfd instructions are useful
for copies >=64KB but are counterproductive for smaller copies.
---
 sysdeps/s390/memcpy-z900.S | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)
  

Comments

Stefan Liebler June 25, 2020, 8:18 a.m. UTC | #1
Just as information, if nobody opposes, I'll commit this patch tomorrow.

On 6/19/20 3:49 PM, Stefan Liebler wrote:
> This patch introduces an extra loop without pfd instructions
> as it turned out that the pfd instructions are useful
> for copies >=64KB but are counterproductive for smaller copies.
> ---
>  sysdeps/s390/memcpy-z900.S | 21 +++++++++++++++------
>  1 file changed, 15 insertions(+), 6 deletions(-)
> 
> diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S
> index f2e9aaeb2d..dc2f491ec3 100644
> --- a/sysdeps/s390/memcpy-z900.S
> +++ b/sysdeps/s390/memcpy-z900.S
> @@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196)
>  	je      .L_Z196_4
>  .L_Z196_start2:
>  	aghi    %r4,-1
> -	srlg    %r5,%r4,8
> -	ltgr    %r5,%r5
> +	risbg	%r5,%r4,8,128+63,56 # r5 = r4 / 256
>  	jne     .L_Z196_5
>  .L_Z196_3:
>  	exrl    %r4,.L_Z196_14
>  .L_Z196_4:
>  	br      %r14
>  .L_Z196_5:
> -	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
> -	jh      __memcpy_mvcle
> +	cgfi	%r5,255		# Switch to loop with pfd for copies >=64kB
> +	jh	.L_Z196_6
>  .L_Z196_2:
> -	pfd     1,768(%r3)
> -	pfd     2,768(%r1)
>  	mvc     0(256,%r1),0(%r3)
>  	aghi    %r5,-1
>  	la      %r1,256(%r1)
>  	la      %r3,256(%r3)
>  	jne     .L_Z196_2
>  	j       .L_Z196_3
> +.L_Z196_6:
> +	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
> +	jh      __memcpy_mvcle
> +.L_Z196_7:
> +	pfd     1,1024(%r3)
> +	pfd     2,1024(%r1)
> +	mvc     0(256,%r1),0(%r3)
> +	aghi    %r5,-1
> +	la      %r1,256(%r1)
> +	la      %r3,256(%r3)
> +	jne     .L_Z196_7
> +	j       .L_Z196_3
>  .L_Z196_14:
>  	mvc     0(1,%r1),0(%r3)
>  END(MEMCPY_Z196)
>
  
Stefan Liebler June 26, 2020, 7:47 a.m. UTC | #2
committed

On 6/25/20 10:18 AM, Stefan Liebler via Libc-alpha wrote:
> Just as information, if nobody opposes, I'll commit this patch tomorrow.
> 
> On 6/19/20 3:49 PM, Stefan Liebler wrote:
>> This patch introduces an extra loop without pfd instructions
>> as it turned out that the pfd instructions are useful
>> for copies >=64KB but are counterproductive for smaller copies.
>> ---
>>  sysdeps/s390/memcpy-z900.S | 21 +++++++++++++++------
>>  1 file changed, 15 insertions(+), 6 deletions(-)
>>
>> diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S
>> index f2e9aaeb2d..dc2f491ec3 100644
>> --- a/sysdeps/s390/memcpy-z900.S
>> +++ b/sysdeps/s390/memcpy-z900.S
>> @@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196)
>>  	je      .L_Z196_4
>>  .L_Z196_start2:
>>  	aghi    %r4,-1
>> -	srlg    %r5,%r4,8
>> -	ltgr    %r5,%r5
>> +	risbg	%r5,%r4,8,128+63,56 # r5 = r4 / 256
>>  	jne     .L_Z196_5
>>  .L_Z196_3:
>>  	exrl    %r4,.L_Z196_14
>>  .L_Z196_4:
>>  	br      %r14
>>  .L_Z196_5:
>> -	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
>> -	jh      __memcpy_mvcle
>> +	cgfi	%r5,255		# Switch to loop with pfd for copies >=64kB
>> +	jh	.L_Z196_6
>>  .L_Z196_2:
>> -	pfd     1,768(%r3)
>> -	pfd     2,768(%r1)
>>  	mvc     0(256,%r1),0(%r3)
>>  	aghi    %r5,-1
>>  	la      %r1,256(%r1)
>>  	la      %r3,256(%r3)
>>  	jne     .L_Z196_2
>>  	j       .L_Z196_3
>> +.L_Z196_6:
>> +	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
>> +	jh      __memcpy_mvcle
>> +.L_Z196_7:
>> +	pfd     1,1024(%r3)
>> +	pfd     2,1024(%r1)
>> +	mvc     0(256,%r1),0(%r3)
>> +	aghi    %r5,-1
>> +	la      %r1,256(%r1)
>> +	la      %r3,256(%r3)
>> +	jne     .L_Z196_7
>> +	j       .L_Z196_3
>>  .L_Z196_14:
>>  	mvc     0(1,%r1),0(%r3)
>>  END(MEMCPY_Z196)
>>
>
  

Patch

diff --git a/sysdeps/s390/memcpy-z900.S b/sysdeps/s390/memcpy-z900.S
index f2e9aaeb2d..dc2f491ec3 100644
--- a/sysdeps/s390/memcpy-z900.S
+++ b/sysdeps/s390/memcpy-z900.S
@@ -184,25 +184,34 @@  ENTRY(MEMCPY_Z196)
 	je      .L_Z196_4
 .L_Z196_start2:
 	aghi    %r4,-1
-	srlg    %r5,%r4,8
-	ltgr    %r5,%r5
+	risbg	%r5,%r4,8,128+63,56 # r5 = r4 / 256
 	jne     .L_Z196_5
 .L_Z196_3:
 	exrl    %r4,.L_Z196_14
 .L_Z196_4:
 	br      %r14
 .L_Z196_5:
-	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
-	jh      __memcpy_mvcle
+	cgfi	%r5,255		# Switch to loop with pfd for copies >=64kB
+	jh	.L_Z196_6
 .L_Z196_2:
-	pfd     1,768(%r3)
-	pfd     2,768(%r1)
 	mvc     0(256,%r1),0(%r3)
 	aghi    %r5,-1
 	la      %r1,256(%r1)
 	la      %r3,256(%r3)
 	jne     .L_Z196_2
 	j       .L_Z196_3
+.L_Z196_6:
+	cgfi    %r5,262144      # Switch to mvcle for copies >64MB
+	jh      __memcpy_mvcle
+.L_Z196_7:
+	pfd     1,1024(%r3)
+	pfd     2,1024(%r1)
+	mvc     0(256,%r1),0(%r3)
+	aghi    %r5,-1
+	la      %r1,256(%r1)
+	la      %r3,256(%r3)
+	jne     .L_Z196_7
+	j       .L_Z196_3
 .L_Z196_14:
 	mvc     0(1,%r1),0(%r3)
 END(MEMCPY_Z196)