AArch64: Optimize strchr

Message ID PAWPR08MB89822DA55C9346018CE24E2083FD9@PAWPR08MB8982.eurprd08.prod.outlook.com
State Committed
Commit 51541a229740801882490177fa178e49264b13fb
Series AArch64: Optimize strchr

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Wilco Dijkstra Jan. 12, 2023, 3:58 p.m. UTC
Simplify calculation of the mask using shrn. Unroll the main loop.
Small strings are 20% faster on modern CPUs.  Passes regress.
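
As a rough illustration of the shrn trick the commit message refers to, here is
an editor's sketch in C (not part of the patch): "shrn vd.8b, vn.8h, #4" shifts
each 16-bit lane right by four and keeps the low eight bits, so a 16-byte
comparison mask collapses into a 64-bit syndrome with one nibble per input
byte. The helper name shrn4 and the example mask bytes are invented for the
illustration, and the model assumes the little-endian path.

#include <stdint.h>
#include <stdio.h>

/* Scalar model of "shrn vend.8b, vhas_nul.8h, 4": each input byte has
   already been masked with 0x33/0xcc by bit/bif, so it is 0x00, 0x33,
   0xcc or 0xff.  */
static uint64_t shrn4(const uint8_t lanes[16])
{
    uint64_t out = 0;
    for (int h = 0; h < 8; h++) {                 /* 8 halfword lanes */
        uint16_t lane = (uint16_t)(lanes[2 * h] | (lanes[2 * h + 1] << 8));
        uint8_t narrowed = (uint8_t)(lane >> 4);  /* shift right, keep low byte */
        out |= (uint64_t)narrowed << (8 * h);
    }
    return out;
}

int main(void)
{
    uint8_t mask[16] = { 0 };
    mask[5] = 0x33;                 /* byte 5 matched the character */
    mask[9] = 0xcc;                 /* byte 9 is NUL */
    printf("syndrome = %016llx\n", (unsigned long long) shrn4(mask));
    /* Each input byte now owns one nibble of the syndrome:
       bits 0-1 = char match, bits 2-3 = NUL (or match).  */
    return 0;
}

This is what lets the patch replace the mov/dup of 0x3003 with a single movi of
0x33 and drop the 0xf00f mask and the and/addp pair: one shrn produces the
4-bit-per-byte syndrome directly.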

---
  

Comments

Szabolcs Nagy Jan. 13, 2023, 12:28 p.m. UTC | #1
The 01/12/2023 15:58, Wilco Dijkstra wrote:
> Simplify calculation of the mask using shrn. Unroll the main loop.
> Small strings are 20% faster on modern CPUs.  Passes regress.

please commit it, thanks.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>



Patch

diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S
index 900ef15944c2b8a82943cc0fbdaf0b40907c40e1..14ae1513a7330a62cf5985d06e1fb6a8bab78d63 100644
--- a/sysdeps/aarch64/strchr.S
+++ b/sysdeps/aarch64/strchr.S
@@ -32,8 +32,7 @@ 
 
 #define src		x2
 #define tmp1		x1
-#define wtmp2		w3
-#define tmp3		x3
+#define tmp2		x3
 
 #define vrepchr		v0
 #define vdata		v1
@@ -41,39 +40,30 @@ 
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
-#define vrepmask2	v5
-#define vend		v6
-#define dend		d6
+#define vend		v5
+#define dend		d5
 
 /* Core algorithm.
 
    For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
-   requested character, bits 2-3 are set if the byte is NUL (or matched), and
-   bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd
-   bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
-   in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   per byte. Bits 0-1 are set if the relevant byte matched the requested
+   character, bits 2-3 are set if the byte is NUL or matched. Count trailing
+   zeroes gives the position of the matching byte if it is a multiple of 4.
+   If it is not a multiple of 4, there was no match.  */
 
 ENTRY (strchr)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	wtmp2, 0x3003
-	dup	vrepmask.8h, wtmp2
+	movi	vrepmask.16b, 0x33
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	mov	wtmp2, 0xf00f
-	dup	vrepmask2.8h, wtmp2
-
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	lsl	tmp3, srcin, 2
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
-
+	lsl	tmp2, srcin, 2
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
-	lsr	tmp1, tmp1, tmp3
+	lsr	tmp1, tmp1, tmp2
 	cbz	tmp1, L(loop)
 
 	rbit	tmp1, tmp1
@@ -87,28 +77,34 @@  ENTRY (strchr)
 
 	.p2align 4
 L(loop):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	tmp1, dend
+	cbnz	tmp1, L(end)
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
+	sub	src, src, 16
+L(end):
 
 #ifdef __AARCH64EB__
 	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 #else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	rbit	tmp1, tmp1
 #endif
+	add	src, src, 16
 	clz	tmp1, tmp1
-	/* Tmp1 is an even multiple of 2 if the target character was
-	   found first. Otherwise we've found the end of string.  */
+	/* Tmp1 is a multiple of 4 if the target character was found.  */
 	tst	tmp1, 2
 	add	result, src, tmp1, lsr 2
 	csel	result, result, xzr, eq
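
For readers less familiar with the syndrome encoding, here is a hedged scalar
model in C of the logic above (an editor's illustration, not the glibc code;
the name strchr_model is invented, and unlike the assembly it does no alignment
handling, so it may read past the terminator within a chunk). Each byte
contributes one nibble to a 64-bit syndrome: bits 0-1 for a character match,
bits 2-3 for NUL-or-match. The assembly gets the NUL-or-match byte mask for
free from cmhs, since 0xff is unsigned higher-or-same than any byte and a NUL
byte satisfies 0 >= 0. Bit 1 of the trailing-zero count then tells a match
from end of string, which is exactly what the final tst/csel pair checks.

#include <stdint.h>
#include <stddef.h>

static const char *strchr_model(const char *s, int c)
{
    unsigned char ch = (unsigned char) c;

    for (;; s += 16) {
        uint64_t syndrome = 0;
        for (int i = 0; i < 16; i++) {
            unsigned char b = (unsigned char) s[i];
            if (b == ch)
                syndrome |= 0x3ULL << (4 * i);  /* bits 0-1: char match */
            if (b == 0 || b == ch)              /* what cmhs computes */
                syndrome |= 0xcULL << (4 * i);  /* bits 2-3: NUL or match */
        }
        if (syndrome) {
            int tz = __builtin_ctzll(syndrome);
            /* tz is a multiple of 4 iff the first event was a character
               match; tz == 4*i + 2 means the string ended first.  */
            return (tz & 2) == 0 ? s + tz / 4 : NULL;
        }
    }
}

Searching for '\0' still works: the terminator sets both halves of its nibble,
so tz is a multiple of 4 and a pointer to the terminator is returned, as
strchr requires.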