AArch64: Optimize strlen

Message ID PAWPR08MB8982F6211E8FDA162ECB72D883FD9@PAWPR08MB8982.eurprd08.prod.outlook.com
State Committed
Commit 03c8ce5000198947a4dd7b2c14e5131738fda62b
Series AArch64: Optimize strlen

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Wilco Dijkstra Jan. 12, 2023, 3:53 p.m. UTC
  Optimize strlen by unrolling the main loop. Large strings are 64% faster on
modern CPUs.  Passes regress.

---
  

Comments

Szabolcs Nagy Jan. 13, 2023, 12:26 p.m. UTC | #1
The 01/12/2023 15:53, Wilco Dijkstra wrote:
> Optimize strlen by unrolling the main loop. Large strings are 64% faster on
> modern CPUs.  Passes regress.

please commit it, thanks.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>


> 
> ---
> 
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index b3c92d9dc9b3c52e29e05ebbb89b929f177dc2cf..133ef933425fa260e61642a7840d73391168507d 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -43,12 +43,9 @@
>  #define dend           d2
> 
>  /* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> -   per byte. We take 4 bits of every comparison byte with shift right and narrow
> -   by 4 instruction. Since the bits in the nibble mask reflect the order in
> -   which things occur in the original string, counting trailing zeros identifies
> -   exactly which byte matched.  */
> +   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
> +   four bits per byte using the shrn instruction. A count trailing zeros then
> +   identifies the first zero byte.  */
> 
>  ENTRY (STRLEN)
>         PTR_ARG (0)
> @@ -68,18 +65,25 @@ ENTRY (STRLEN)
> 
>         .p2align 5
>  L(loop):
> -       ldr     data, [src, 16]!
> +       ldr     data, [src, 16]
> +       cmeq    vhas_nul.16b, vdata.16b, 0
> +       umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
> +       fmov    synd, dend
> +       cbnz    synd, L(loop_end)
> +       ldr     data, [src, 32]!
>         cmeq    vhas_nul.16b, vdata.16b, 0
>         umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
>         fmov    synd, dend
>         cbz     synd, L(loop)
> -
> +       sub     src, src, 16
> +L(loop_end):
>         shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>         sub     result, src, srcin
>         fmov    synd, dend
>  #ifndef __AARCH64EB__
>         rbit    synd, synd
>  #endif
> +       add     result, result, 16
>         clz     tmp, synd
>         add     result, result, tmp, lsr 2
>         ret
>
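For illustration only (not part of the patch or the thread): the nibble-mask technique that the updated comment describes can be sketched in C with NEON intrinsics roughly as below. The helper name is hypothetical and the sketch assumes little-endian; the assembly additionally handles big-endian via rbit before clz.

#include <arm_neon.h>
#include <stdint.h>

/* Rough sketch: index of the first zero byte in a 16-byte chunk, or -1 if
   none.  cmeq sets matching bytes to 0xff; shrn #4 narrows each 16-bit lane
   to 8 bits, which keeps 4 bits of every comparison byte and yields a 64-bit
   mask with 4 bits per input byte, so ctz/4 recovers the byte index.  */
static inline int
first_zero_byte (const uint8_t *chunk)
{
  uint8x16_t data = vld1q_u8 (chunk);
  uint8x16_t cmp = vceqzq_u8 (data);                      /* cmeq v, v, 0 */
  uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  uint64_t mask = vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
  if (mask == 0)
    return -1;
  return __builtin_ctzll (mask) >> 2;                     /* 4 mask bits per byte */
}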
  
Cupertino Miranda March 2, 2023, 11:49 a.m. UTC | #2
Hi Szabolcs,

I am attempting to reproduce the presented performance improvements on an
Ampere Altra processor.
Can you please give some more detail on your setup and how you
measured it?

BTW, I am not looking to discredit the work, but rather to be able
to replicate the results on our end to evaluate backporting your patches.

Best regards,
Cupertino

Szabolcs Nagy via Libc-alpha writes:

> The 01/12/2023 15:53, Wilco Dijkstra wrote:
>> Optimize strlen by unrolling the main loop. Large strings are 64% faster on
>> modern CPUs.  Passes regress.
>
> please commit it, thanks.
>
> Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
>
>
>>
>> ---
>>
>> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
>> index b3c92d9dc9b3c52e29e05ebbb89b929f177dc2cf..133ef933425fa260e61642a7840d73391168507d 100644
>> --- a/sysdeps/aarch64/strlen.S
>> +++ b/sysdeps/aarch64/strlen.S
>> @@ -43,12 +43,9 @@
>>  #define dend           d2
>>
>>  /* Core algorithm:
>> -
>> -   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
>> -   per byte. We take 4 bits of every comparison byte with shift right and narrow
>> -   by 4 instruction. Since the bits in the nibble mask reflect the order in
>> -   which things occur in the original string, counting trailing zeros identifies
>> -   exactly which byte matched.  */
>> +   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
>> +   four bits per byte using the shrn instruction. A count trailing zeros then
>> +   identifies the first zero byte.  */
>>
>>  ENTRY (STRLEN)
>>         PTR_ARG (0)
>> @@ -68,18 +65,25 @@ ENTRY (STRLEN)
>>
>>         .p2align 5
>>  L(loop):
>> -       ldr     data, [src, 16]!
>> +       ldr     data, [src, 16]
>> +       cmeq    vhas_nul.16b, vdata.16b, 0
>> +       umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
>> +       fmov    synd, dend
>> +       cbnz    synd, L(loop_end)
>> +       ldr     data, [src, 32]!
>>         cmeq    vhas_nul.16b, vdata.16b, 0
>>         umaxp   vend.16b, vhas_nul.16b, vhas_nul.16b
>>         fmov    synd, dend
>>         cbz     synd, L(loop)
>> -
>> +       sub     src, src, 16
>> +L(loop_end):
>>         shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>>         sub     result, src, srcin
>>         fmov    synd, dend
>>  #ifndef __AARCH64EB__
>>         rbit    synd, synd
>>  #endif
>> +       add     result, result, 16
>>         clz     tmp, synd
>>         add     result, result, tmp, lsr 2
>>         ret
>>
  
Wilco Dijkstra March 2, 2023, 1:07 p.m. UTC | #3
Hi Cupertino,

> I am attempting to reproduce the presented performance improvements on an
> Ampere Altra processor.
> Can you please give some more detail on your setup and how you
> measured it?

I measured it on large strings (e.g. 1024 bytes) on Neoverse V1. You can run
benchtests/bench-strlen.c and look at the results for larger strings. However,
Altra is half as wide as V1, so it may not see much (or any) benefit. Also note
that by default it uses the multiarch/strlen_asimd.S version, not strlen.S.

> BTW, I am not looking to discredit the work, but rather to be able
> to replicate the results on our end to evaluate backporting your patches.

Yes it has been a while since we backported string improvements, so it's a good
idea. Are you talking about the official GLIBC release branches or a private branch?

Cheers,
Wilco
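For readers without a glibc build tree handy, a very rough standalone timing loop (emphatically not benchtests/bench-strlen, and it measures whichever strlen the installed libc's ifunc selection picks) might look like the sketch below. It assumes a POSIX clock_gettime and uses a 1024-byte string as in the discussion above.

#include <stdio.h>
#include <string.h>
#include <time.h>

int
main (void)
{
  static char buf[1025];                    /* zero-initialized: buf[1024] is the NUL */
  memset (buf, 'a', 1024);                  /* 1024 non-zero bytes */

  char *volatile p = buf;                   /* keep strlen from being constant-folded */
  size_t total = 0;
  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (long i = 0; i < 10000000; i++)
    total += strlen (p);
  clock_gettime (CLOCK_MONOTONIC, &t1);

  double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
  printf ("len=%zu  %.2f ns/call\n", total / 10000000, ns / 1e7);
  return 0;
}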
  
Cupertino Miranda March 2, 2023, 2:29 p.m. UTC | #4
Wilco Dijkstra writes:

> Hi Cupertino,
>
>> I am attempting to reproduce the presented performance improvements on an
>> Ampere Altra processor.
>> Can you please give some more detail on your setup and how you
>> measured it?
>
> I measured it on large strings (e.g. 1024 bytes) on Neoverse V1. You can run
> benchtests/bench-strlen.c and look at the results for larger strings. However,
> Altra is half as wide as V1, so it may not see much (or any) benefit.
Ok, will check.
> Also note that by default
> it uses the multiarch/strlen_asimd.S version, not strlen.S.
Oh, I did not know that. Thanks!
>
>> BTW, I am not looking to discredit the work, but rather to be able
>> to replicate the results on our end to evaluate backporting your patches.
>
> Yes it has been a while since we backported string improvements, so it's a good
> idea. Are you talking about the official GLIBC release branches or a private branch?
I am referring to a private branch.
>
> Cheers,
> Wilco

Thanks,
Cupertino
  

Patch

diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index b3c92d9dc9b3c52e29e05ebbb89b929f177dc2cf..133ef933425fa260e61642a7840d73391168507d 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -43,12 +43,9 @@ 
 #define dend		d2
 
 /* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
-   per byte. We take 4 bits of every comparison byte with shift right and narrow
-   by 4 instruction. Since the bits in the nibble mask reflect the order in
-   which things occur in the original string, counting trailing zeros identifies
-   exactly which byte matched.  */
+   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+   four bits per byte using the shrn instruction. A count trailing zeros then
+   identifies the first zero byte.  */
 
 ENTRY (STRLEN)
 	PTR_ARG (0)
@@ -68,18 +65,25 @@  ENTRY (STRLEN)
 
 	.p2align 5
 L(loop):
-	ldr	data, [src, 16]!
+	ldr	data, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loop_end)
+	ldr	data, [src, 32]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbz	synd, L(loop)
-
+	sub	src, src, 16
+L(loop_end):
 	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	sub	result, src, srcin
 	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
 #endif
+	add	result, result, 16
 	clz	tmp, synd
 	add	result, result, tmp, lsr 2
 	ret
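
For reference, a portable C sketch of the control flow after this patch (structure and names are hypothetical; the real code performs both checks with NEON as in the diff above, aligns src first, and compensates with the sub/add of 16 rather than recomputing offsets):

#include <stddef.h>

/* Stand-in for the cmeq + umaxp + fmov + cbnz quick check.  */
static int
chunk_has_zero (const char *p)
{
  for (int i = 0; i < 16; i++)
    if (p[i] == 0)
      return 1;
  return 0;
}

/* Stand-in for the shrn + rbit + clz sequence at L(loop_end); only called
   on a chunk known to contain a zero byte.  */
static size_t
first_zero_in_chunk (const char *p)
{
  size_t i = 0;
  while (p[i] != 0)
    i++;
  return i;
}

/* Main loop unrolled by two: check the chunks at src+16 and src+32 each
   iteration and advance src by 32, as the patched assembly does.  */
static size_t
strlen_unrolled_sketch (const char *s)
{
  const char *src = s;
  if (chunk_has_zero (src))          /* first chunk, handled before the loop */
    return first_zero_in_chunk (src);
  for (;;)
    {
      if (chunk_has_zero (src + 16))                 /* ldr [src, 16]; cbnz */
        return (size_t) (src + 16 - s) + first_zero_in_chunk (src + 16);
      if (chunk_has_zero (src + 32))                 /* ldr [src, 32]!; cbz loops back */
        return (size_t) (src + 32 - s) + first_zero_in_chunk (src + 32);
      src += 32;                                     /* two chunks per iteration */
    }
}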