Checks
| Context | Check | Description |
|---|---|---|
| dj/TryBot-apply_patch | success | Patch applied to master at the time it was sent |
| dj/TryBot-32bit | success | Build for i686 |
Commit Message
Unroll the main loop of the AArch64 strcpy. Copying large strings is around 20% faster on modern CPUs.
Passes regression testing.
---
Comments
On 01/12/2023 16:01, Wilco Dijkstra wrote:
> Unroll the main loop. Large strings are around 20% faster on modern CPUs.
> Passes regress.
please commit it, thanks.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
>
> ---
>
> diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> index bc14dd9f79d1d0d17727cad522761807e7eef5b8..8b045a92c2b329c6351d523d9375a799d38d168e 100644
> --- a/sysdeps/aarch64/strcpy.S
> +++ b/sysdeps/aarch64/strcpy.S
> @@ -30,7 +30,6 @@
> * MTE compatible.
> */
>
> -/* Arguments and results. */
> #define dstin x0
> #define srcin x1
> #define result x0
> @@ -76,14 +75,14 @@ ENTRY (STRCPY)
> ld1 {vdata.16b}, [src]
> cmeq vhas_nul.16b, vdata.16b, 0
> lsl shift, srcin, 2
> - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> + shrn vend.8b, vhas_nul.8h, 4
> fmov synd, dend
> lsr synd, synd, shift
> cbnz synd, L(tail)
>
> ldr dataq, [src, 16]!
> cmeq vhas_nul.16b, vdata.16b, 0
> - shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> + shrn vend.8b, vhas_nul.8h, 4
> fmov synd, dend
> cbz synd, L(start_loop)
>
> @@ -102,13 +101,10 @@ ENTRY (STRCPY)
> IFSTPCPY (add result, dstin, len)
> ret
>
> - .p2align 4,,8
> L(tail):
> rbit synd, synd
> clz len, synd
> lsr len, len, 2
> -
> - .p2align 4
> L(less16):
> tbz len, 3, L(less8)
> sub tmp, len, 7
> @@ -141,31 +137,37 @@ L(zerobyte):
>
> .p2align 4
> L(start_loop):
> - sub len, src, srcin
> + sub tmp, srcin, dstin
> ldr dataq2, [srcin]
> - add dst, dstin, len
> + sub dst, src, tmp
> str dataq2, [dstin]
> -
> - .p2align 5
> L(loop):
> - str dataq, [dst], 16
> - ldr dataq, [src, 16]!
> + str dataq, [dst], 32
> + ldr dataq, [src, 16]
> + cmeq vhas_nul.16b, vdata.16b, 0
> + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
> + fmov synd, dend
> + cbnz synd, L(loopend)
> + str dataq, [dst, -16]
> + ldr dataq, [src, 32]!
> cmeq vhas_nul.16b, vdata.16b, 0
> umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
> fmov synd, dend
> cbz synd, L(loop)
> -
> + add dst, dst, 16
> +L(loopend):
> shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> + sub dst, dst, 31
> #ifndef __AARCH64EB__
> rbit synd, synd
> #endif
> clz len, synd
> lsr len, len, 2
> - sub tmp, len, 15
> - ldr dataq, [src, tmp]
> - str dataq, [dst, tmp]
> - IFSTPCPY (add result, dst, len)
> + add dst, dst, len
> + ldr dataq, [dst, tmp]
> + str dataq, [dst]
> + IFSTPCPY (add result, dst, 15)
> ret
>
> END (STRCPY)
@@ -30,7 +30,6 @@
* MTE compatible.
*/
-/* Arguments and results. */
#define dstin x0
#define srcin x1
#define result x0
@@ -76,14 +75,14 @@ ENTRY (STRCPY)
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
lsr synd, synd, shift
cbnz synd, L(tail)
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
cbz synd, L(start_loop)
@@ -102,13 +101,10 @@ ENTRY (STRCPY)
IFSTPCPY (add result, dstin, len)
ret
- .p2align 4,,8
L(tail):
rbit synd, synd
clz len, synd
lsr len, len, 2
-
- .p2align 4
L(less16):
tbz len, 3, L(less8)
sub tmp, len, 7
@@ -141,31 +137,37 @@ L(zerobyte):
.p2align 4
L(start_loop):
- sub len, src, srcin
+ sub tmp, srcin, dstin
ldr dataq2, [srcin]
- add dst, dstin, len
+ sub dst, src, tmp
str dataq2, [dstin]
-
- .p2align 5
L(loop):
- str dataq, [dst], 16
- ldr dataq, [src, 16]!
+ str dataq, [dst], 32
+ ldr dataq, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbnz synd, L(loopend)
+ str dataq, [dst, -16]
+ ldr dataq, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
-
+ add dst, dst, 16
+L(loopend):
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
+ sub dst, dst, 31
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz len, synd
lsr len, len, 2
- sub tmp, len, 15
- ldr dataq, [src, tmp]
- str dataq, [dst, tmp]
- IFSTPCPY (add result, dst, len)
+ add dst, dst, len
+ ldr dataq, [dst, tmp]
+ str dataq, [dst]
+ IFSTPCPY (add result, dst, 15)
ret
END (STRCPY)