aarch64: Optimize string functions with shrn instruction

Message ID 20220620174628.2820531-1-danilak@google.com
State Superseded
Series aarch64: Optimize string functions with shrn instruction

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Danila Kutenin June 20, 2022, 5:46 p.m. UTC
  From: Danila Kutenin <kutdanila@yandex.ru>

We found that the string functions were using AND+ADDP
to compute the nibble/syndrome mask, but there is a simpler
alternative: `SHRN dst, src, 4`, which has the same latency
as ADDP on all SIMD ARMv8 targets. There are similar gaps
in memcmp, but that is probably for another patch.
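
For reference, the trick is roughly equivalent to the following C/NEON
intrinsics sketch (illustrative only; little-endian assumed, the names
are made up, and the actual code of course stays in assembly):

  #include <arm_neon.h>
  #include <stdint.h>

  /* Build the 64-bit nibble mask for one 16-byte chunk: 4 bits per input
     byte, set iff that byte equals c.  SHRN #4 shifts each 16-bit lane
     right by 4 and narrows it to 8 bits, keeping one nibble from each of
     the two comparison bytes in the lane, in byte order.  */
  static inline uint64_t
  nibble_mask (const uint8_t *chunk, uint8_t c)
  {
    uint8x16_t data = vld1q_u8 (chunk);
    uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (c));  /* 0xff per match */
    uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
    return vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
  }

  /* Each byte owns 4 bits of the mask, so on little-endian the index of
     the first matching byte is the trailing-zero count divided by 4
     (callers check mask != 0 first).  */
  static inline unsigned
  first_match (uint64_t mask)
  {
    return __builtin_ctzll (mask) >> 2;
  }

The assembly versions do the equivalent with cmeq + shrn + fmov and then
count leading/trailing zeros on the general-purpose register.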

We see 10-20% savings for the small and mid-size cases that are
the primary cases for general workloads: https://pastebin.com/hA5Fd8eM

I don't have commit rights, so I am asking the maintainers to commit this.

Signed-off-by: Danila Kutenin <danilak@google.com>
---
 sysdeps/aarch64/memchr.S    | 19 +++++++------------
 sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
 sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
 sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
 sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
 sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
 6 files changed, 57 insertions(+), 98 deletions(-)
  

Comments

Szabolcs Nagy June 21, 2022, 9:07 a.m. UTC | #1
The 06/20/2022 17:46, Danila Kutenin wrote:
> From: Danila Kutenin <kutdanila@yandex.ru>
> 
> We found that the string functions were using AND+ADDP
> to compute the nibble/syndrome mask, but there is a simpler
> alternative: `SHRN dst, src, 4`, which has the same latency
> as ADDP on all SIMD ARMv8 targets. There are similar gaps
> in memcmp, but that is probably for another patch.
> 
> We see 10-20% savings for the small and mid-size cases that are
> the primary cases for general workloads: https://pastebin.com/hA5Fd8eM
> 
> I don't have commit rights, so I am asking the maintainers to commit this.
> 
> Signed-off-by: Danila Kutenin <danilak@google.com>

is this a contribution from google or yandex or personal?

(e.g. if your company has a copyright assignment with the FSF then
you don't need a Signed-off-by, otherwise it's better to have
the email address consistent with the author address)

> ---
>  sysdeps/aarch64/memchr.S    | 19 +++++++------------
>  sysdeps/aarch64/memrchr.S   | 25 +++++++++----------------
>  sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
>  sysdeps/aarch64/strcpy.S    | 32 ++++++++++++--------------------
>  sysdeps/aarch64/strlen.S    | 25 +++++++++----------------
>  sysdeps/aarch64/strnlen.S   | 25 +++++++++----------------
>  6 files changed, 57 insertions(+), 98 deletions(-)
> 
> diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> index b060eee97d..b983489491 100644
> --- a/sysdeps/aarch64/memchr.S
> +++ b/sysdeps/aarch64/memchr.S
> @@ -53,12 +53,11 @@
>  
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (MEMCHR)
>  	PTR_ARG (0)
> @@ -67,12 +66,9 @@ ENTRY (MEMCHR)
>  	cbz	cntin, L(nomatch)
>  	ld1	{vdata.16b}, [src]
>  	dup	vrepchr.16b, chrin
> -	mov	wtmp, 0xf00f
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
>  	lsl	shift, srcin, 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbz	synd, L(start_loop)
> @@ -111,8 +107,7 @@ L(loop32_2):
>  	fmov	synd, dend
>  	cbz	synd, L(loop32)
>  L(end):
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	add	tmp, srcin, cntin
>  	sub	cntrem, tmp, src
> diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> index e0efbad91c..5179320720 100644
> --- a/sysdeps/aarch64/memrchr.S
> +++ b/sysdeps/aarch64/memrchr.S
> @@ -37,7 +37,6 @@
>  #define synd		x5
>  #define shift		x6
>  #define	tmp		x7
> -#define wtmp		w7
>  #define end		x8
>  #define endm1		x9
>  
> @@ -45,18 +44,16 @@
>  #define qdata		q1
>  #define vdata		v1
>  #define vhas_chr	v2
> -#define vrepmask	v3
> -#define vend		v4
> -#define dend		d4
> +#define vend		v3
> +#define dend		d3
>  
>  /*
>     Core algorithm:
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (__memrchr)
>  	PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (__memrchr)
>  	cbz	cntin, L(nomatch)
>  	ld1	{vdata.16b}, [src]
>  	dup	vrepchr.16b, chrin
> -	mov	wtmp, 0xf00f
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
>  	neg	shift, end, lsl 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsl	synd, synd, shift
>  	cbz	synd, L(start_loop)
> @@ -109,8 +103,7 @@ L(loop32_2):
>  	fmov	synd, dend
>  	cbz	synd, L(loop32)
>  L(end):
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  
>  	add	tmp, src, 15
> diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> index 442726fd49..ee154ab74b 100644
> --- a/sysdeps/aarch64/strchrnul.S
> +++ b/sysdeps/aarch64/strchrnul.S
> @@ -33,38 +33,32 @@
>  #define src		x2
>  #define tmp1		x1
>  #define tmp2		x3
> -#define tmp2w		w3
>  
>  #define vrepchr		v0
>  #define vdata		v1
>  #define qdata		q1
>  #define vhas_nul	v2
>  #define vhas_chr	v3
> -#define vrepmask	v4
> -#define vend		v5
> -#define dend		d5
> +#define vend		v4
> +#define dend		d4
>  
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (__strchrnul)
>  	PTR_ARG (0)
>  	bic	src, srcin, 15
>  	dup	vrepchr.16b, chrin
>  	ld1	{vdata.16b}, [src]
> -	mov	tmp2w, 0xf00f
> -	dup	vrepmask.8h, tmp2w
>  	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
>  	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
>  	lsl	tmp2, srcin, 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	tmp1, dend
>  	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
>  	cbz	tmp1, L(loop)
> @@ -83,8 +77,7 @@ L(loop):
>  	fmov	tmp1, dend
>  	cbz	tmp1, L(loop)
>  
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	tmp1, dend
>  #ifndef __AARCH64EB__
>  	rbit	tmp1, tmp1
> diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> index da53170ece..78d27b4aa6 100644
> --- a/sysdeps/aarch64/strcpy.S
> +++ b/sysdeps/aarch64/strcpy.S
> @@ -40,7 +40,6 @@
>  #define len		x4
>  #define synd		x4
>  #define	tmp		x5
> -#define wtmp		w5
>  #define shift		x5
>  #define data1		x6
>  #define dataw1		w6
> @@ -50,9 +49,8 @@
>  #define dataq		q0
>  #define vdata		v0
>  #define vhas_nul	v1
> -#define vrepmask	v2
> -#define vend		v3
> -#define dend		d3
> +#define vend		v2
> +#define dend		d2
>  #define dataq2		q1
>  
>  #ifdef BUILD_STPCPY
> @@ -63,34 +61,29 @@
>  # define IFSTPCPY(X,...)
>  #endif
>  
> -/* Core algorithm:
> -
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +/*
> +   Core algorithm:
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting leading zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (STRCPY)
>  	PTR_ARG (0)
>  	PTR_ARG (1)
>  	bic	src, srcin, 15
> -	mov	wtmp, 0xf00f
>  	ld1	{vdata.16b}, [src]
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_nul.16b, vdata.16b, 0
>  	lsl	shift, srcin, 2
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbnz	synd, L(tail)
>  
>  	ldr	dataq, [src, 16]!
>  	cmeq	vhas_nul.16b, vdata.16b, 0
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	cbz	synd, L(start_loop)
>  
> @@ -162,8 +155,7 @@ L(loop):
>  	fmov	synd, dend
>  	cbz	synd, L(loop)
>  
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  #ifndef __AARCH64EB__
>  	rbit	synd, synd
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index a2310871c2..3a5d088407 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -34,35 +34,29 @@
>  #define src		x1
>  #define	synd		x2
>  #define tmp		x3
> -#define wtmp		w3
>  #define shift		x4
>  
>  #define data		q0
>  #define vdata		v0
>  #define vhas_nul	v1
> -#define vrepmask	v2
> -#define vend		v3
> -#define dend		d3
> +#define vend		v2
> +#define dend		d2
>  
>  /* Core algorithm:
>  
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (STRLEN)
>  	PTR_ARG (0)
>  	bic	src, srcin, 15
> -	mov	wtmp, 0xf00f
>  	ld1	{vdata.16b}, [src]
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_nul.16b, vdata.16b, 0
>  	lsl	shift, srcin, 2
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbz	synd, L(loop)
> @@ -80,8 +74,7 @@ L(loop):
>  	fmov	synd, dend
>  	cbz	synd, L(loop)
>  
> -	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> -	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
>  	sub	result, src, srcin
>  	fmov	synd, dend
>  #ifndef __AARCH64EB__
> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 0dbecb0ce9..282bddc9aa 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -33,39 +33,33 @@
>  #define src		x2
>  #define synd		x3
>  #define	shift		x4
> -#define wtmp		w4
>  #define tmp		x4
>  #define cntrem		x5
>  
>  #define qdata		q0
>  #define vdata		v0
>  #define vhas_chr	v1
> -#define vrepmask	v2
> -#define vend		v3
> -#define dend		d3
> +#define vend		v2
> +#define dend		d2
>  
>  /*
>     Core algorithm:
>  
> -   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> -   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> -   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> -   set likewise for odd bytes so that adjacent bytes can be merged. Since the
> -   bits in the syndrome reflect the order in which things occur in the original
> -   string, counting trailing zeros identifies exactly which byte matched.  */
> +   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> +   per byte. We take 4 bits of every comparison byte with shift right and narrow
> +   by 4 instruction. Since the bits in the nibble mask reflect the order in
> +   which things occur in the original string, counting trailing zeros identifies
> +   exactly which byte matched.  */
>  
>  ENTRY (__strnlen)
>  	PTR_ARG (0)
>  	SIZE_ARG (1)
>  	bic	src, srcin, 15
> -	mov	wtmp, 0xf00f
>  	cbz	cntin, L(nomatch)
>  	ld1	{vdata.16b}, [src], 16
> -	dup	vrepmask.8h, wtmp
>  	cmeq	vhas_chr.16b, vdata.16b, 0
>  	lsl	shift, srcin, 2
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	fmov	synd, dend
>  	lsr	synd, synd, shift
>  	cbz	synd, L(start_loop)
> @@ -103,8 +97,7 @@ L(loop32_2):
>  	cbz	synd, L(loop32)
>  
>  L(end):
> -	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> -	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
> +	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
>  	sub	src, src, 16
>  	mov	synd, vend.d[0]
>  	sub	result, src, srcin
> -- 
> 2.37.0.rc0.104.g0611611a94-goog
>
  
Danila Kutenin June 21, 2022, 9:28 a.m. UTC | #2
It's a contribution from Google. Google has a copyright assignment with
the FSF, so I think this should cover it.

Sorry for the email confusion; I realized the mess quite late: my git
client was configured with my Yandex address while the commit was set up
with my Google account. Yandex has nothing to do with this work. If
needed, I can recreate the patch.

  
Szabolcs Nagy June 22, 2022, 6:48 a.m. UTC | #3
The 06/21/2022 10:28, Danila Kutenin wrote:
> It's a contribution from Google. Google has a copyright assignment with
> the FSF, so I think this should cover it.

note that if you are interested in getting the same improvements
into bionic, musl, llvm-libc, newlib, ...
then arm maintains an optimized-routines repo on github
(https://github.com/ARM-software/optimized-routines) for that
purpose and you are welcome to contribute your changes there too.

> 
> Sorry for the email confusion; I realized the mess quite late: my git
> client was configured with my Yandex address while the commit was set up
> with my Google account. Yandex has nothing to do with this work. If
> needed, I can recreate the patch.

please do, we should not commit pastebin links into the git history.
(just list the measured improvements over the old code in the commit
message, if it's too long then select representative measurements
or aggregate them in some other way.)

> 
> On Tue, Jun 21, 2022, 10:08 Szabolcs Nagy <Szabolcs.Nagy@arm.com> wrote:
> 
> > The 06/20/2022 17:46, Danila Kutenin wrote:
> > > From: Danila Kutenin <kutdanila@yandex.ru>
> > >
> > We found that the string functions were using AND+ADDP
> > to compute the nibble/syndrome mask, but there is a simpler
> > alternative: `SHRN dst, src, 4`, which has the same latency
> > as ADDP on all SIMD ARMv8 targets. There are similar gaps
> > in memcmp, but that is probably for another patch.
> >
> > We see 10-20% savings for the small and mid-size cases that are
> > the primary cases for general workloads: https://pastebin.com/hA5Fd8eM

this is a good improvement.
we will do some checks (on various cpus).

thanks.


  
Danila Kutenin June 22, 2022, 7:31 a.m. UTC | #4
>
> note that if you are interested in getting the same improvements
> into bionic, musl, llvm-libc, newlib,...
> then arm maintains an optimized-routines repo on github for that
> purpose and you are welcome to contribute your changes there too.


We have communicated with llvm-libc and bionic; we will reach out to the
others, thanks!

> please do, we should not commit pastebin links into the git history.
> (just list the measured improvements over the old code in the commit
> message, if it's too long then select representative measurements
> or aggregate them in some other way.)


Created a new one; this one should be abandoned. My employer still
recommended that I include a Signed-off-by line at the end.

> this is a good improvement.
> we will do some checks (on various cpus).


 Thanks!

On Wed, Jun 22, 2022 at 7:49 AM Szabolcs Nagy <Szabolcs.Nagy@arm.com> wrote:

> The 06/21/2022 10:28, Danila Kutenin wrote:
> > It's a contribution from Google. Google has a copyright assignment with
> > fsf, I think this should cover it.
>
> note that if you are interested in getting the same improvements
> into bionic, musl, llvm-libc, newlib,...
> then arm maintains an optimized-routines repo on github for that
> purpose and you are welcome to contribute your changes there too.
>
> >
> > Sorry for the email confusion, I realized the mess quite late: my git
> > client was configured with yandex and the commit was set up with the
> > google's account. Yandex has nothing to do with this work. If needed, I
> can
> > recreate the patch
>
> please do, we should not commit pastebin links into the git history.
> (just list the measured improvements over the old code in the commit
> message, if it's too long then select representative measurements
> or aggregate them in some other way.)
>
> >
> > On Tue, Jun 21, 2022, 10:08 Szabolcs Nagy <Szabolcs.Nagy@arm.com> wrote:
> >
> > > The 06/20/2022 17:46, Danila Kutenin wrote:
> > > > From: Danila Kutenin <kutdanila@yandex.ru>
> > > >
> > > > We found that string functions were using AND+ADDP
> > > > to find the nibble/syndrome mask but there is an easier
> > > > opportunity through `SHRN dst, src, 4` and has same
> > > > latency on all SIMD ARMv8 targets as ADDP. There are also
> > > > gaps for memcmp but that's probably for another patch
> > > >
> > > > We see 10-20% savings for small-mid size cases which are
> > > > primary cases for general workloads https://pastebin.com/hA5Fd8eM
>
> this is good improvement.
> we will do some checks (on various cpus).
>
> thanks.
>
>
> > > >
> > > > I don't have commit rights, asking maintainers to do that
> > > >
> > > > Signed-off-by: Danila Kutenin <danilak@google.com>
> > >
> > > is this a contribution from google or yandex or personal?
> > >
> > > (e.g. if your company has copyright assignment with fsf then
> > > you dont need signed-off-by, otherwise it's better to have
> > > the email address consistent with the author address)
> > >
> > > > [diffstat and full diff quoted here in the original mail; snipped --
> > > > identical to the patch shown in full in the Patch section below]
> > >
>
  
Szabolcs Nagy June 22, 2022, 8:40 a.m. UTC | #5
The 06/22/2022 08:31, Danila Kutenin wrote:
> >
> > note that if you are interested in getting the same improvements
> > into bionic, musl, llvm-libc, newlib,...
> > then arm maintains an optimized-routines repo on github for that
> > purpose and you are welcome to contribute your changes there too.
> 
> 
> We have communicated with llvm-libc and bionic; we will reach out to the
> others, thanks!

note, many projects take code from arm optimized-routines, we
don't have an exact list. e.g. bionic uses it directly, if you
fork that then merging fixes later will be more expensive.

>  please do, we should not commit pastebin links into the git history.
> > (just list the measured improvements over the old code in the commit
> > message, if it's too long then select representative measurements
> > or aggregate them in some other way.)
> 
> 
> Created a new one; this one should be abandoned. I was still advised by my
> employer to add a Signed-off-by line at the end.

i will wait for somebody from glibc or fsf to confirm that
this is ok to commit this way.
  

Patch

diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index b060eee97d..b983489491 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -53,12 +53,11 @@ 
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (MEMCHR)
 	PTR_ARG (0)
@@ -67,12 +66,9 @@  ENTRY (MEMCHR)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -111,8 +107,7 @@  L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	add	tmp, srcin, cntin
 	sub	cntrem, tmp, src
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
index e0efbad91c..5179320720 100644
--- a/sysdeps/aarch64/memrchr.S
+++ b/sysdeps/aarch64/memrchr.S
@@ -37,7 +37,6 @@ 
 #define synd		x5
 #define shift		x6
 #define	tmp		x7
-#define wtmp		w7
 #define end		x8
 #define endm1		x9
 
@@ -45,18 +44,16 @@ 
 #define qdata		q1
 #define vdata		v1
 #define vhas_chr	v2
-#define vrepmask	v3
-#define vend		v4
-#define dend		d4
+#define vend		v3
+#define dend		d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__memrchr)
 	PTR_ARG (0)
@@ -67,12 +64,9 @@  ENTRY (__memrchr)
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src]
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0xf00f
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	neg	shift, end, lsl 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsl	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -109,8 +103,7 @@  L(loop32_2):
 	fmov	synd, dend
 	cbz	synd, L(loop32)
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 
 	add	tmp, src, 15
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index 442726fd49..ee154ab74b 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -33,38 +33,32 @@ 
 #define src		x2
 #define tmp1		x1
 #define tmp2		x3
-#define tmp2w		w3
 
 #define vrepchr		v0
 #define vdata		v1
 #define qdata		q1
 #define vhas_nul	v2
 #define vhas_chr	v3
-#define vrepmask	v4
-#define vend		v5
-#define dend		d5
+#define vend		v4
+#define dend		d4
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strchrnul)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	tmp2w, 0xf00f
-	dup	vrepmask.8h, tmp2w
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
 	lsl	tmp2, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
 	cbz	tmp1, L(loop)
@@ -83,8 +77,7 @@  L(loop):
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
 
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 #ifndef __AARCH64EB__
 	rbit	tmp1, tmp1
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
index da53170ece..78d27b4aa6 100644
--- a/sysdeps/aarch64/strcpy.S
+++ b/sysdeps/aarch64/strcpy.S
@@ -40,7 +40,6 @@ 
 #define len		x4
 #define synd		x4
 #define	tmp		x5
-#define wtmp		w5
 #define shift		x5
 #define data1		x6
 #define dataw1		w6
@@ -50,9 +49,8 @@ 
 #define dataq		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 #define dataq2		q1
 
 #ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@ 
 # define IFSTPCPY(X,...)
 #endif
 
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRCPY)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbnz	synd, L(tail)
 
 	ldr	dataq, [src, 16]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	cbz	synd, L(start_loop)
 
@@ -162,8 +155,7 @@  L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 #ifndef __AARCH64EB__
 	rbit	synd, synd
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index a2310871c2..3a5d088407 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -34,35 +34,29 @@ 
 #define src		x1
 #define	synd		x2
 #define tmp		x3
-#define wtmp		w3
 #define shift		x4
 
 #define data		q0
 #define vdata		v0
 #define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /* Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRLEN)
 	PTR_ARG (0)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(loop)
@@ -80,8 +74,7 @@  L(loop):
 	fmov	synd, dend
 	cbz	synd, L(loop)
 
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	sub	result, src, srcin
 	fmov	synd, dend
 #ifndef __AARCH64EB__
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 0dbecb0ce9..282bddc9aa 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -33,39 +33,33 @@ 
 #define src		x2
 #define synd		x3
 #define	shift		x4
-#define wtmp		w4
 #define tmp		x4
 #define cntrem		x5
 
 #define qdata		q0
 #define vdata		v0
 #define vhas_chr	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
+#define vend		v2
+#define dend		d2
 
 /*
    Core algorithm:
 
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strnlen)
 	PTR_ARG (0)
 	SIZE_ARG (1)
 	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
 	cbz	cntin, L(nomatch)
 	ld1	{vdata.16b}, [src], 16
-	dup	vrepmask.8h, wtmp
 	cmeq	vhas_chr.16b, vdata.16b, 0
 	lsl	shift, srcin, 2
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	cbz	synd, L(start_loop)
@@ -103,8 +97,7 @@  L(loop32_2):
 	cbz	synd, L(loop32)
 
 L(end):
-	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	shrn	vend.8b, vhas_chr.8h, 4		/* 128->64 */
 	sub	src, src, 16
 	mov	synd, vend.d[0]
 	sub	result, src, srcin
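
For readers who want to see the nibble-mask trick outside of assembly, below is
a minimal C sketch using NEON intrinsics (little-endian AArch64 assumed;
function and variable names are illustrative, not part of the patch). It forms
the same 64-bit syndrome two ways -- the old AND+ADDP sequence and the new
SHRN #4 -- and extracts the index of the first matching byte:

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Old scheme: mask even bytes with 0x0f and odd bytes with 0xf0, then use a
   pairwise add (ADDP) to fold 128 bits down to a 64-bit nibble mask.  */
static inline uint64_t
nibble_mask_addp (uint8x16_t cmp)
{
  uint8x16_t repmask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
  uint8x16_t masked = vandq_u8 (cmp, repmask);
  uint8x16_t folded = vpaddq_u8 (masked, masked);	/* 128->64 in lane 0 */
  return vgetq_lane_u64 (vreinterpretq_u64_u8 (folded), 0);
}

/* New scheme: shift each 16-bit lane right by 4 and narrow (SHRN), which
   keeps 4 bits per byte and needs no constant register.  */
static inline uint64_t
nibble_mask_shrn (uint8x16_t cmp)
{
  uint8x8_t folded = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (folded), 0);
}

/* Index of the first byte equal to c in the 16-byte chunk at p, or 16 if
   there is no match.  Both mask helpers give the same answer here.  */
static inline size_t
first_match_index (const uint8_t *p, uint8_t c)
{
  uint8x16_t cmp = vceqq_u8 (vld1q_u8 (p), vdupq_n_u8 (c));
  uint64_t synd = nibble_mask_shrn (cmp);
  return synd ? (size_t) (__builtin_ctzll (synd) >> 2) : 16;
}

Both helpers produce four bits per byte in source order, so on little-endian
counting trailing zeros and dividing by four recovers the byte index (the patch
handles big-endian separately behind __AARCH64EB__ guards). The SHRN form
simply drops the mov/dup that materialized the 0xf00f constant and replaces the
two vector operations (AND, ADDP) with one.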