AArch64: Improve strrchr

Message ID PAWPR08MB898215EB1E88D436D9EF27D283FD9@PAWPR08MB8982.eurprd08.prod.outlook.com
State Committed
Commit 55599d480437dcf129b41b95be32b48f2a9e5da9
Series AArch64: Improve strrchr

Checks

Context                Check    Description
dj/TryBot-apply_patch  success  Patch applied to master at the time it was sent
dj/TryBot-32bit        success  Build for i686

Commit Message

Wilco Dijkstra Jan. 12, 2023, 4:05 p.m. UTC
Use shrn for narrowing the mask, which simplifies the code and speeds up small
strings. Unroll the first search loop to improve performance on large strings.
Passes regression testing.

---
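
The main trick is the mask narrowing: instead of materialising the 0x3003/0xf00f
repeating constants and reducing the comparison mask with and/addp, the new code
issues a single shrn by 4, which squeezes the 16-byte comparison mask into a
64-bit syndrome with one nibble per input byte. A minimal standalone sketch of
that idea, assuming an AArch64 compiler with Advanced SIMD intrinsics (the
function name block_syndrome is illustrative, not part of the patch):

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative sketch only (not the glibc code): compute the 64-bit
   syndrome for one 16-byte block with the shrn trick.  Each input byte
   contributes one 4-bit nibble to the result.  */
static inline uint64_t
block_syndrome (const unsigned char *block, unsigned char c)
{
  uint8x16_t data = vld1q_u8 (block);
  /* 0xff in every byte that equals the searched character.  */
  uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (c));
  /* Shift each 16-bit lane right by 4 and narrow it to 8 bits: the low
     nibble of each result byte reflects the even input byte, the high
     nibble the odd one.  */
  uint8x8_t narrowed = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (narrowed), 0);
}

A count-leading-zeros on such a syndrome, divided by 4, then locates the last
matching byte of the block, which is how the L(tail) code in the patch recovers
its result.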
  

Comments

Szabolcs Nagy Jan. 13, 2023, 12:30 p.m. UTC | #1
The 01/12/2023 16:05, Wilco Dijkstra wrote:
> Use shrn for narrowing the mask, which simplifies the code and speeds up small
> strings. Unroll the first search loop to improve performance on large strings.
> Passes regression testing.

please commit it, thanks.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
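
The unrolling of the first search loop can be modelled in C as well. Each
iteration of L(loop1) now examines two 16-byte blocks, and a single cmhs per
block tests for "matches the character or is NUL": an unsigned >= compare of
the 0xff/0x00 match mask against the data is true for a matching byte and for
a zero byte. A hedged sketch under the same assumptions as the block above
(block_hit and scan_loop are illustrative names; the umaxp/fmov pair of the
assembly is replaced by a plain horizontal maximum for clarity):

#include <arm_neon.h>
#include <stdbool.h>

/* True if any byte of the 16-byte block matches c or is NUL.  */
static inline bool
block_hit (uint8x16_t data, uint8x16_t repchr)
{
  uint8x16_t has_chr = vceqq_u8 (data, repchr);
  /* Unsigned >= is true where has_chr is 0xff (character match) or where
     both sides are zero (NUL byte), so one compare covers both cases.  */
  uint8x16_t hit = vcgeq_u8 (has_chr, data);
  return vmaxvq_u8 (hit) != 0;
}

/* Scan forward from an already-checked, 16-byte aligned block, two blocks
   per iteration, returning the address of the first block that contains a
   character match or the terminating NUL.  Like the assembly, this reads
   whole aligned blocks and may therefore read past the end of the string.  */
static const unsigned char *
scan_loop (const unsigned char *src, unsigned char c)
{
  uint8x16_t repchr = vdupq_n_u8 (c);
  for (;;)
    {
      if (block_hit (vld1q_u8 (src + 16), repchr))
        return src + 16;
      src += 32;
      if (block_hit (vld1q_u8 (src), repchr))
        return src;
    }
}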



Patch

diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S
index b4b8b44aadf75d900589e25ebbc49640ef43f0bf..528034a5183b4479634dfb346426102d6aa4b64a 100644
--- a/sysdeps/aarch64/strrchr.S
+++ b/sysdeps/aarch64/strrchr.S
@@ -22,19 +22,16 @@ 
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
  * MTE compatible.
  */
 
-/* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
 #define result		x0
 
 #define src		x2
 #define tmp		x3
-#define wtmp		w3
 #define synd		x3
 #define shift		x4
 #define src_match	x4
@@ -46,7 +43,6 @@ 
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
-#define vrepmask2	v5
 #define vend		v5
 #define dend		d5
 
@@ -58,59 +54,71 @@ 
    the relevant byte matched the requested character; bits 2-3 are set
    if the relevant byte matched the NUL end of string.  */
 
-ENTRY(strrchr)
+ENTRY (strrchr)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0x3003
-	dup	vrepmask.8h, wtmp
-	tst	srcin, 15
-	beq	L(loop1)
-
-	ld1	{vdata.16b}, [src], 16
+	movi	vrepmask.16b, 0x33
+	ld1	{vdata.16b}, [src]
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	mov	wtmp, 0xf00f
-	dup	vrepmask2.8h, wtmp
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	lsl	shift, srcin, 2
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	lsl	synd, synd, shift
 	ands	nul_match, synd, 0xcccccccccccccccc
 	bne	L(tail)
-	cbnz	synd, L(loop2)
+	cbnz	synd, L(loop2_start)
 
-	.p2align 5
+	.p2align 4
 L(loop1):
-	ld1	{vdata.16b}, [src], 16
+	ldr	q1, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loop1_end)
+	ldr	q1, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbz	synd, L(loop1)
-
+	sub	src, src, 16
+L(loop1_end):
+	add	src, src, 16
 	cmeq	vhas_nul.16b, vdata.16b, 0
+#ifdef __AARCH64EB__
+	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+	rbit	synd, synd
+#else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	bic	vhas_nul.8h, 0x0f, lsl 8
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	fmov	synd, dend
+#endif
 	ands	nul_match, synd, 0xcccccccccccccccc
-	beq	L(loop2)
-
+	beq	L(loop2_start)
 L(tail):
 	sub	nul_match, nul_match, 1
 	and	chr_match, synd, 0x3333333333333333
 	ands	chr_match, chr_match, nul_match
-	sub	result, src, 1
+	add	result, src, 15
 	clz	tmp, chr_match
 	sub	result, result, tmp, lsr 2
 	csel	result, result, xzr, ne
 	ret
 
 	.p2align 4
+	nop
+	nop
+L(loop2_start):
+	add	src, src, 16
+	bic	vrepmask.8h, 0xf0
+
 L(loop2):
 	cmp	synd, 0
 	csel	src_match, src, src_match, ne
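
For reference, the tail computation can be modelled in C from the syndrome
layout described in the comment above (per input byte, bits 0-1 flag a
character match and bits 2-3 flag a NUL). This is a sketch of the idea only;
tail_result is an illustrative name, and the assembly reaches the same
address with the add/clz/sub/csel sequence in L(tail):

#include <stdint.h>
#include <stddef.h>

/* Illustrative model of L(tail): given the start of the 16-byte block that
   contains the terminating NUL and that block's 64-bit syndrome, return the
   last occurrence of the character before the NUL, or NULL if none.  */
static const char *
tail_result (const char *block, uint64_t synd)
{
  uint64_t nul_match = synd & 0xccccccccccccccccULL;  /* NUL flag bits */
  uint64_t chr_match = synd & 0x3333333333333333ULL;  /* char match bits */
  /* nul_match - 1 sets every bit below the lowest NUL flag, so the AND keeps
     only character matches that precede the terminating NUL.  */
  chr_match &= nul_match - 1;
  if (chr_match == 0)
    return NULL;
  /* The highest surviving bit belongs to the last match; its bit position
     divided by 4 is the byte index within the block.  */
  unsigned last_bit = 63 - __builtin_clzll (chr_match);
  return block + (last_bit >> 2);
}

In the assembly the same address is computed as src + 15 - (clz >> 2), and the
final csel turns the no-match case into a NULL result.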