aarch64: Optimize string functions with shrn instruction

Message ID 20220627161213.1081512-1-danilak@google.com
State Committed
Commit 3c9980698988ef64072f1fac339b180f52792faf
Headers
Series aarch64: Optimize string functions with shrn instruction |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Danila Kutenin June 27, 2022, 4:12 p.m. UTC
  We found that string functions were using AND+ADDP
to compute the nibble/syndrome mask, but there is a simpler
option: `SHRN dst.8b, src.8h, 4` (shift every 2-byte
element right by 4 and narrow it to 1 byte), which has the
same latency as ADDP on all SIMD ARMv8 targets. There
is possibly room for similar improvement in memcmp, but
that's for another patch.

We see 10-20% savings for small to mid-size cases (<=128),
which are the primary cases for general workloads.
---
 sysdeps/aarch64/memchr.S    | 25 +++++++++----------------
 sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
 sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
 sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
 sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
 sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
 6 files changed, 59 insertions(+), 102 deletions(-)
  

Comments

Danila Kutenin July 6, 2022, 7:42 a.m. UTC | #1
Friendly ping

On Mon, Jun 27, 2022 at 5:12 PM Danila Kutenin <danilak@google.com> wrote:

> We found that string functions were using AND+ADDP
> to find the nibble/syndrome mask but there is an easier
> opportunity through `SHRN dst.8b, src.8h, 4` (shift
> right every 2 bytes by 4 and narrow to 1 byte) and has
> same latency on all SIMD ARMv8 targets as ADDP. There
> are also possible gaps for memcmp but that's for
> another patch.
>
> We see 10-20% savings for small-mid size cases (<=128)
> which are primary cases for general workloads.
> ---
>  sysdeps/aarch64/memchr.S    | 25 +++++++++----------------
>  sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
>  sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
>  sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
>  sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
>  sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
>  6 files changed, 59 insertions(+), 102 deletions(-)
>
> diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> index b060eee97d..2053a977b6 100644
> --- a/sysdeps/aarch64/memchr.S
> +++ b/sysdeps/aarch64/memchr.S
> @@ -41,24 +41,21 @@
>  #define synd           x5
>  #define shift          x6
>  #define        tmp             x7
> -#define wtmp           w7
>
>  #define vrepchr                v0
>  #define qdata          q1
>  #define vdata          v1
>  #define vhas_chr       v2
> -#define vrepmask       v3
> -#define vend           v4
> -#define dend           d4
> +#define vend           v3
> +#define dend           d3
>
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (MEMCHR)
>         PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (MEMCHR)
>         cbz     cntin, L(nomatch)
>         ld1     {vdata.16b}, [src]
>         dup     vrepchr.16b, chrin
> -       mov     wtmp, 0xf00f
> -       dup     vrepmask.8h, wtmp
>         cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>         lsl     shift, srcin, 2
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         lsr     synd, synd, shift
>         cbz     synd, L(start_loop)
> @@ -111,8 +105,7 @@ L(loop32_2):
>         fmov    synd, dend
>         cbz     synd, L(loop32)
>  L(end):
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         add     tmp, srcin, cntin
>         sub     cntrem, tmp, src
> diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> index e0efbad91c..5179320720 100644
> --- a/sysdeps/aarch64/memrchr.S
> +++ b/sysdeps/aarch64/memrchr.S
> @@ -37,7 +37,6 @@
>  #define synd           x5
>  #define shift          x6
>  #define        tmp             x7
> -#define wtmp           w7
>  #define end            x8
>  #define endm1          x9
>
> @@ -45,18 +44,16 @@
>  #define qdata          q1
>  #define vdata          v1
>  #define vhas_chr       v2
> -#define vrepmask       v3
> -#define vend           v4
> -#define dend           d4
> +#define vend           v3
> +#define dend           d3
>
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__memrchr)
>         PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (__memrchr)
>         cbz     cntin, L(nomatch)
>         ld1     {vdata.16b}, [src]
>         dup     vrepchr.16b, chrin
> -       mov     wtmp, 0xf00f
> -       dup     vrepmask.8h, wtmp
>         cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>         neg     shift, end, lsl 2
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         lsl     synd, synd, shift
>         cbz     synd, L(start_loop)
> @@ -109,8 +103,7 @@ L(loop32_2):
>         fmov    synd, dend
>         cbz     synd, L(loop32)
>  L(end):
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    synd, dend
>
>         add     tmp, src, 15
> diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> index 442726fd49..ee154ab74b 100644
> --- a/sysdeps/aarch64/strchrnul.S
> +++ b/sysdeps/aarch64/strchrnul.S
> @@ -33,38 +33,32 @@
>  #define src            x2
>  #define tmp1           x1
>  #define tmp2           x3
> -#define tmp2w          w3
>
>  #define vrepchr                v0
>  #define vdata          v1
>  #define qdata          q1
>  #define vhas_nul       v2
>  #define vhas_chr       v3
> -#define vrepmask       v4
> -#define vend           v5
> -#define dend           d5
> +#define vend           v4
> +#define dend           d4
>
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__strchrnul)
>         PTR_ARG (0)
>         bic     src, srcin, 15
>         dup     vrepchr.16b, chrin
>         ld1     {vdata.16b}, [src]
> -       mov     tmp2w, 0xf00f
> -       dup     vrepmask.8h, tmp2w
>         cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
>         cmhs    vhas_chr.16b, vhas_chr.16b, vdata.16b
>         lsl     tmp2, srcin, 2
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    tmp1, dend
>         lsr     tmp1, tmp1, tmp2        /* Mask padding bits.  */
>         cbz     tmp1, L(loop)
> @@ -83,8 +77,7 @@ L(loop):
>         fmov    tmp1, dend
>         cbz     tmp1, L(loop)
>
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    tmp1, dend
>  #ifndef __AARCH64EB__
>         rbit    tmp1, tmp1
> diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> index da53170ece..78d27b4aa6 100644
> --- a/sysdeps/aarch64/strcpy.S
> +++ b/sysdeps/aarch64/strcpy.S
> @@ -40,7 +40,6 @@
>  #define len            x4
>  #define synd           x4
>  #define        tmp             x5
> -#define wtmp           w5
>  #define shift          x5
>  #define data1          x6
>  #define dataw1         w6
> @@ -50,9 +49,8 @@
>  #define dataq          q0
>  #define vdata          v0
>  #define vhas_nul       v1
> -#define vrepmask       v2
> -#define vend           v3
> -#define dend           d3
> +#define vend           v2
> +#define dend           d2
>  #define dataq2         q1
>
>  #ifdef BUILD_STPCPY
> @@ -63,34 +61,29 @@
>  # define IFSTPCPY(X,...)
>  #endif
>
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (STRCPY)
>         PTR_ARG (0)
>         PTR_ARG (1)
>         bic     src, srcin, 15
> -       mov     wtmp, 0xf00f
>         ld1     {vdata.16b}, [src]
> -       dup     vrepmask.8h, wtmp
>         cmeq    vhas_nul.16b, vdata.16b, 0
>         lsl     shift, srcin, 2
> -       and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -       addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> +       shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         lsr     synd, synd, shift
>         cbnz    synd, L(tail)
>
>         ldr     dataq, [src, 16]!
>         cmeq    vhas_nul.16b, vdata.16b, 0
> -       and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -       addp    vend.16b, vhas_nul.16b, vhas_nul.16b
> +       shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         cbz     synd, L(start_loop)
>
> @@ -162,8 +155,7 @@ L(loop):
>         fmov    synd, dend
>         cbz     synd, L(loop)
>
> -       and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -       addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64 */
> +       shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>         fmov    synd, dend
>  #ifndef __AARCH64EB__
>         rbit    synd, synd
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index a2310871c2..3a5d088407 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -34,35 +34,29 @@
>  #define src            x1
>  #define        synd            x2
>  #define tmp            x3
> -#define wtmp           w3
>  #define shift          x4
>
>  #define data           q0
>  #define vdata          v0
>  #define vhas_nul       v1
> -#define vrepmask       v2
> -#define vend           v3
> -#define dend           d3
> +#define vend           v2
> +#define dend           d2
>
>  /* Core algorithm:
>
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (STRLEN)
>         PTR_ARG (0)
>         bic     src, srcin, 15
> -       mov     wtmp, 0xf00f
>         ld1     {vdata.16b}, [src]
> -       dup     vrepmask.8h, wtmp
>         cmeq    vhas_nul.16b, vdata.16b, 0
>         lsl     shift, srcin, 2
> -       and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -       addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64 */
> +       shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         lsr     synd, synd, shift
>         cbz     synd, L(loop)
> @@ -80,8 +74,7 @@ L(loop):
>         fmov    synd, dend
>         cbz     synd, L(loop)
>
> -       and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -       addp    vend.16b, vhas_nul.16b, vhas_nul.16b            /* 128->64 */
> +       shrn    vend.8b, vhas_nul.8h, 4         /* 128->64 */
>         sub     result, src, srcin
>         fmov    synd, dend
>  #ifndef __AARCH64EB__
> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 0dbecb0ce9..282bddc9aa 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -33,39 +33,33 @@
>  #define src            x2
>  #define synd           x3
>  #define        shift           x4
> -#define wtmp           w4
>  #define tmp            x4
>  #define cntrem         x5
>
>  #define qdata          q0
>  #define vdata          v0
>  #define vhas_chr       v1
> -#define vrepmask       v2
> -#define vend           v3
> -#define dend           d3
> +#define vend           v2
> +#define dend           d2
>
>  /*
>     Core algorithm:
>
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>
>  ENTRY (__strnlen)
>         PTR_ARG (0)
>         SIZE_ARG (1)
>         bic     src, srcin, 15
> -       mov     wtmp, 0xf00f
>         cbz     cntin, L(nomatch)
>         ld1     {vdata.16b}, [src], 16
> -       dup     vrepmask.8h, wtmp
>         cmeq    vhas_chr.16b, vdata.16b, 0
>         lsl     shift, srcin, 2
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         fmov    synd, dend
>         lsr     synd, synd, shift
>         cbz     synd, L(start_loop)
> @@ -103,8 +97,7 @@ L(loop32_2):
>         cbz     synd, L(loop32)
>
>  L(end):
> -       and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -       addp    vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +       shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
>         sub     src, src, 16
>         mov     synd, vend.d[0]
>         sub     result, src, srcin
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>
>
  
Szabolcs Nagy July 6, 2022, 8:30 a.m. UTC | #2
The 07/06/2022 08:42, Danila Kutenin wrote:
> Friendly ping
> 
> On Mon, Jun 27, 2022 at 5:12 PM Danila Kutenin <danilak@google.com> wrote:
> 
> > We found that string functions were using AND+ADDP
> > to find the nibble/syndrome mask but there is an easier
> > opportunity through `SHRN dst.8b, src.8h, 4` (shift
> > right every 2 bytes by 4 and narrow to 1 byte) and has
> > same latency on all SIMD ARMv8 targets as ADDP. There
> > are also possible gaps for memcmp but that's for
> > another patch.
> >
> > We see 10-20% savings for small-mid size cases (<=128)
> > which are primary cases for general workloads.


thanks for the patch, committed it now.
  

Patch

diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index b060eee97d..2053a977b6 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -41,24 +41,21 @@ 
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 
 #define vrepchr		v0
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (MEMCHR)
 	PTR_ARG (0)
@@ -67,12 +64,9 @@  ENTRY (MEMCHR)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -111,8 +105,7 @@  L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	add	tmp, srcin, cntin
 	sub	cntrem, tmp, src
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index e0efbad91c..5179320720 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -37,7 +37,6 @@ 
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 #define end		x8
 #define endm1		x9
 
@@ -45,18 +44,16 @@ 
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__memrchr)
 	PTR_ARG (0)
@@ -67,12 +64,9 @@  ENTRY (__memrchr)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	neg	shift, end, lsl 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsl	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -109,8 +103,7 @@  L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 
 	add	tmp, src, 15
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index 442726fd49..ee154ab74b 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -33,38 +33,32 @@ 
 #define src		x2
 #define tmp1		x1
 #define tmp2		x3
-#define tmp2w		w3
 
 #define vrepchr		v0
 #define vdata		v1
 #define qdata		q1
 #define vhas_nul	v2
 #define vhas_chr	v3
-#define vrepmask	v4
-#define vend		v5
-#define dend		d5
+#define vend		v4
+#define dend		d4
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strchrnul)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	tmp2w, 0xf00f
-	dup	vrepmask.8h, tmp2w
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
 	lsl	tmp2, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
 	cbz	tmp1, L(loop)
@@ -83,8 +77,7 @@  L(loop):
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
 
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 #ifndef __AARCH64EB__
 	rbit	tmp1, tmp1
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index da53170ece..78d27b4aa6 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -40,7 +40,6 @@ 
 #define len		x4
 #define synd		x4
 #define	tmp		x5
-#define wtmp		w5
 #define shift		x5
 #define data1		x6
 #define dataw1		w6
@@ -50,9 +49,8 @@ 
 #define dataq		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 #define dataq2		q1
 
 #ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@ 
 # define IFSTPCPY(X,...)
 #endif
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRCPY)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbnz	synd, L(tail)
 
 	ldr	dataq, [src, 16]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(start_loop)
 
@@ -162,8 +155,7 @@  L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index a2310871c2..3a5d088407 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -34,35 +34,29 @@ 
 #define src		x1
 #define	synd		x2
 #define tmp		x3
-#define wtmp		w3
 #define shift		x4
 
 #define data		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /* Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRLEN)
 	PTR_ARG (0)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(loop)
@@ -80,8 +74,7 @@  L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	sub	result, src, srcin
 	fmov	synd, dend
 #ifndef __AARCH64EB__
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 0dbecb0ce9..282bddc9aa 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -33,39 +33,33 @@ 
 #define src		x2
 #define synd		x3
 #define	shift		x4
-#define wtmp		w4
 #define tmp		x4
 #define cntrem		x5
 
 #define qdata		q0
 #define vdata		v0
 #define vhas_chr	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /*
    Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strnlen)
 	PTR_ARG (0)
 	SIZE_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src], 16
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -103,8 +97,7 @@  L(loop32_2):
 	cbz	synd, L(loop32)
 
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	sub	src, src, 16
 	mov	synd, vend.d[0]
 	sub	result, src, srcin