aarch64: Optimize string functions with shrn instruction
Checks
Context               | Check   | Description
dj/TryBot-apply_patch | success | Patch applied to master at the time it was sent
dj/TryBot-32bit       | success | Build for i686
Commit Message
From: Danila Kutenin <kutdanila@yandex.ru>
We found that the string functions were using AND+ADDP
to compute the nibble/syndrome mask, but there is a simpler
alternative: `SHRN dst, src, 4`, which has the same latency
as ADDP on all SIMD ARMv8 targets. There are also gaps in
memcmp, but that is probably for another patch.
We see 10-20% savings for the small and mid-size cases that
dominate general workloads: https://pastebin.com/hA5Fd8eM
I don't have commit rights, so I'm asking the maintainers to commit this.
Signed-off-by: Danila Kutenin <danilak@google.com>
---
sysdeps/aarch64/memchr.S | 19 +++++++------------
sysdeps/aarch64/memrchr.S | 25 +++++++++----------------
sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
sysdeps/aarch64/strcpy.S | 32 ++++++++++++--------------------
sysdeps/aarch64/strlen.S | 25 +++++++++----------------
sysdeps/aarch64/strnlen.S | 25 +++++++++----------------
6 files changed, 57 insertions(+), 98 deletions(-)
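[Editorial note, not part of the patch: the sketch below illustrates the
nibble-mask idea that `shrn vend.8b, vhas_chr.8h, 4` implements, written in C
with NEON intrinsics. The function and variable names are made up, and it
assumes a little-endian AArch64 target with <arm_neon.h>.]

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only; not the glibc implementation.
   Return the index of the first byte equal to c in the 16-byte chunk at p,
   or -1 if there is no match.  */
static ptrdiff_t
first_match_in_chunk (const uint8_t *p, uint8_t c)
{
  uint8x16_t data  = vld1q_u8 (p);
  uint8x16_t match = vceqq_u8 (data, vdupq_n_u8 (c)); /* 0xff per matching byte  */

  /* SHRN #4: view the comparison result as eight 16-bit lanes, shift each
     right by 4 and narrow to 8 bits.  That keeps 4 bits per input byte and
     collapses the 128-bit vector into a 64-bit nibble mask, replacing the
     old AND with 0xf00f followed by ADDP.  */
  uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (match), 4);
  uint64_t  synd    = vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);

  if (synd == 0)
    return -1;
  /* Four identical mask bits per byte, in string order on little-endian, so
     the number of trailing zero bits divided by 4 is the byte index.  */
  return __builtin_ctzll (synd) >> 2;
}

[The assembly counts the zeros with RBIT+CLZ rather than a CTZ builtin, but
the effect is the same.]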
Comments
The 06/20/2022 17:46, Danila Kutenin wrote:
> From: Danila Kutenin <kutdanila@yandex.ru>
>
> We found that string functions were using AND+ADDP
> to find the nibble/syndrome mask but there is an easier
> opportunity through `SHRN dst, src, 4` and has same
> latency on all SIMD ARMv8 targets as ADDP. There are also
> gaps for memcmp but that's probably for another patch
>
> We see 10-20% savings for small-mid size cases which are
> primary cases for general workloads https://pastebin.com/hA5Fd8eM
>
> I don't have commit rights, asking maintainers to do that
>
> Signed-off-by: Danila Kutenin <danilak@google.com>
is this a contribution from google or yandex or personal?
(e.g. if your company has copyright assignment with fsf then
you don't need signed-off-by, otherwise it's better to have
the email address consistent with the author address)
> ---
> sysdeps/aarch64/memchr.S | 19 +++++++------------
> sysdeps/aarch64/memrchr.S | 25 +++++++++----------------
> sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
> sysdeps/aarch64/strcpy.S | 32 ++++++++++++--------------------
> sysdeps/aarch64/strlen.S | 25 +++++++++----------------
> sysdeps/aarch64/strnlen.S | 25 +++++++++----------------
> 6 files changed, 57 insertions(+), 98 deletions(-)
>
> diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> index b060eee97d..b983489491 100644
> --- a/sysdeps/aarch64/memchr.S
> +++ b/sysdeps/aarch64/memchr.S
> @@ -53,12 +53,11 @@
>
> /*
> Core algorithm:
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (MEMCHR)
> PTR_ARG (0)
> @@ -67,12 +66,9 @@ ENTRY (MEMCHR)
> cbz cntin, L(nomatch)
> ld1 {vdata.16b}, [src]
> dup vrepchr.16b, chrin
> - mov wtmp, 0xf00f
> - dup vrepmask.8h, wtmp
> cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> lsl shift, srcin, 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbz synd, L(start_loop)
> @@ -111,8 +107,7 @@ L(loop32_2):
> fmov synd, dend
> cbz synd, L(loop32)
> L(end):
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> add tmp, srcin, cntin
> sub cntrem, tmp, src
> diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> index e0efbad91c..5179320720 100644
> --- a/sysdeps/aarch64/memrchr.S
> +++ b/sysdeps/aarch64/memrchr.S
> @@ -37,7 +37,6 @@
> #define synd x5
> #define shift x6
> #define tmp x7
> -#define wtmp w7
> #define end x8
> #define endm1 x9
>
> @@ -45,18 +44,16 @@
> #define qdata q1
> #define vdata v1
> #define vhas_chr v2
> -#define vrepmask v3
> -#define vend v4
> -#define dend d4
> +#define vend v3
> +#define dend d3
>
> /*
> Core algorithm:
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (__memrchr)
> PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (__memrchr)
> cbz cntin, L(nomatch)
> ld1 {vdata.16b}, [src]
> dup vrepchr.16b, chrin
> - mov wtmp, 0xf00f
> - dup vrepmask.8h, wtmp
> cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> neg shift, end, lsl 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> lsl synd, synd, shift
> cbz synd, L(start_loop)
> @@ -109,8 +103,7 @@ L(loop32_2):
> fmov synd, dend
> cbz synd, L(loop32)
> L(end):
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
>
> add tmp, src, 15
> diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> index 442726fd49..ee154ab74b 100644
> --- a/sysdeps/aarch64/strchrnul.S
> +++ b/sysdeps/aarch64/strchrnul.S
> @@ -33,38 +33,32 @@
> #define src x2
> #define tmp1 x1
> #define tmp2 x3
> -#define tmp2w w3
>
> #define vrepchr v0
> #define vdata v1
> #define qdata q1
> #define vhas_nul v2
> #define vhas_chr v3
> -#define vrepmask v4
> -#define vend v5
> -#define dend d5
> +#define vend v4
> +#define dend d4
>
> -/* Core algorithm:
> -
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> +/*
> + Core algorithm:
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (__strchrnul)
> PTR_ARG (0)
> bic src, srcin, 15
> dup vrepchr.16b, chrin
> ld1 {vdata.16b}, [src]
> - mov tmp2w, 0xf00f
> - dup vrepmask.8h, tmp2w
> cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
> lsl tmp2, srcin, 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov tmp1, dend
> lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
> cbz tmp1, L(loop)
> @@ -83,8 +77,7 @@ L(loop):
> fmov tmp1, dend
> cbz tmp1, L(loop)
>
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov tmp1, dend
> #ifndef __AARCH64EB__
> rbit tmp1, tmp1
> diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> index da53170ece..78d27b4aa6 100644
> --- a/sysdeps/aarch64/strcpy.S
> +++ b/sysdeps/aarch64/strcpy.S
> @@ -40,7 +40,6 @@
> #define len x4
> #define synd x4
> #define tmp x5
> -#define wtmp w5
> #define shift x5
> #define data1 x6
> #define dataw1 w6
> @@ -50,9 +49,8 @@
> #define dataq q0
> #define vdata v0
> #define vhas_nul v1
> -#define vrepmask v2
> -#define vend v3
> -#define dend d3
> +#define vend v2
> +#define dend d2
> #define dataq2 q1
>
> #ifdef BUILD_STPCPY
> @@ -63,34 +61,29 @@
> # define IFSTPCPY(X,...)
> #endif
>
> -/* Core algorithm:
> -
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> +/*
> + Core algorithm:
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (STRCPY)
> PTR_ARG (0)
> PTR_ARG (1)
> bic src, srcin, 15
> - mov wtmp, 0xf00f
> ld1 {vdata.16b}, [src]
> - dup vrepmask.8h, wtmp
> cmeq vhas_nul.16b, vdata.16b, 0
> lsl shift, srcin, 2
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbnz synd, L(tail)
>
> ldr dataq, [src, 16]!
> cmeq vhas_nul.16b, vdata.16b, 0
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> cbz synd, L(start_loop)
>
> @@ -162,8 +155,7 @@ L(loop):
> fmov synd, dend
> cbz synd, L(loop)
>
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> #ifndef __AARCH64EB__
> rbit synd, synd
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index a2310871c2..3a5d088407 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -34,35 +34,29 @@
> #define src x1
> #define synd x2
> #define tmp x3
> -#define wtmp w3
> #define shift x4
>
> #define data q0
> #define vdata v0
> #define vhas_nul v1
> -#define vrepmask v2
> -#define vend v3
> -#define dend d3
> +#define vend v2
> +#define dend d2
>
> /* Core algorithm:
>
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting trailing zeros identifies
> + exactly which byte matched. */
>
> ENTRY (STRLEN)
> PTR_ARG (0)
> bic src, srcin, 15
> - mov wtmp, 0xf00f
> ld1 {vdata.16b}, [src]
> - dup vrepmask.8h, wtmp
> cmeq vhas_nul.16b, vdata.16b, 0
> lsl shift, srcin, 2
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbz synd, L(loop)
> @@ -80,8 +74,7 @@ L(loop):
> fmov synd, dend
> cbz synd, L(loop)
>
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> sub result, src, srcin
> fmov synd, dend
> #ifndef __AARCH64EB__
> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 0dbecb0ce9..282bddc9aa 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -33,39 +33,33 @@
> #define src x2
> #define synd x3
> #define shift x4
> -#define wtmp w4
> #define tmp x4
> #define cntrem x5
>
> #define qdata q0
> #define vdata v0
> #define vhas_chr v1
> -#define vrepmask v2
> -#define vend v3
> -#define dend d3
> +#define vend v2
> +#define dend d2
>
> /*
> Core algorithm:
>
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting trailing zeros identifies
> + exactly which byte matched. */
>
> ENTRY (__strnlen)
> PTR_ARG (0)
> SIZE_ARG (1)
> bic src, srcin, 15
> - mov wtmp, 0xf00f
> cbz cntin, L(nomatch)
> ld1 {vdata.16b}, [src], 16
> - dup vrepmask.8h, wtmp
> cmeq vhas_chr.16b, vdata.16b, 0
> lsl shift, srcin, 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbz synd, L(start_loop)
> @@ -103,8 +97,7 @@ L(loop32_2):
> cbz synd, L(loop32)
>
> L(end):
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> sub src, src, 16
> mov synd, vend.d[0]
> sub result, src, srcin
> --
> 2.37.0.rc0.104.g0611611a94-goog
>
It's a contribution from Google. Google has a copyright assignment with the
FSF, so I think that should cover it.
Sorry for the email confusion, I realized the mess quite late: my git client
was configured with the Yandex address while the commit was set up with the
Google account. Yandex has nothing to do with this work. If needed, I can
recreate the patch.
On Tue, Jun 21, 2022, 10:08 Szabolcs Nagy <Szabolcs.Nagy@arm.com> wrote:
> [... full quoted message and patch trimmed; see the original message above ...]
The 06/21/2022 10:28, Danila Kutenin wrote:
> It's a contribution from Google. Google has a copyright assignment with
> fsf, I think this should cover it.
note that if you are interested in getting the same improvements
into bionic, musl, llvm-libc, newlib,...
then arm maintains an optimized-routines repo on github for that
purpose and you are welcome to contribute your changes there too.
>
> Sorry for the email confusion, I realized the mess quite late: my git
> client was configured with yandex and the commit was set up with the
> google's account. Yandex has nothing to do with this work. If needed, I can
> recreate the patch
please do, we should not commit pastebin links into the git history.
(just list the measured improvements over the old code in the commit
message, if it's too long then select representative measurements
or aggregate them in some other way.)
>
> On Tue, Jun 21, 2022, 10:08 Szabolcs Nagy <Szabolcs.Nagy@arm.com> wrote:
>
> > The 06/20/2022 17:46, Danila Kutenin wrote:
> > > From: Danila Kutenin <kutdanila@yandex.ru>
> > >
> > > We found that string functions were using AND+ADDP
> > > to find the nibble/syndrome mask but there is an easier
> > > opportunity through `SHRN dst, src, 4` and has same
> > > latency on all SIMD ARMv8 targets as ADDP. There are also
> > > gaps for memcmp but that's probably for another patch
> > >
> > > We see 10-20% savings for small-mid size cases which are
> > > primary cases for general workloads https://pastebin.com/hA5Fd8eM
this is good improvement.
we will do some checks (on various cpus).
thanks.
> > > [... remainder of quoted message and patch trimmed ...]
> note that if you are interested in getting the same improvements
> into bionic, musl, llvm-libc, newlib,...
> then arm maintains an optimized-routines repo on github for that
> purpose and you are welcome to contribute your changes there too.
We have communicated with llvm-libc and bionic; we will reach out to the
others, thanks!
> please do, we should not commit pastebin links into the git history.
> (just list the measured improvements over the old code in the commit
> message, if it's too long then select representative measurements
> or aggregate them in some other way.)
Created a new one; this one should be abandoned. I was still advised by my
employer to include a Signed-off-by line at the end.
> this is good improvement.
> we will do some checks (on various cpus).
Thanks!
On Wed, Jun 22, 2022 at 7:49 AM Szabolcs Nagy <Szabolcs.Nagy@arm.com> wrote:
> The 06/21/2022 10:28, Danila Kutenin wrote:
> > It's a contribution from Google. Google has a copyright assignment with
> > fsf, I think this should cover it.
>
> note that if you are interested in getting the same improvements
> into bionic, musl, llvm-libc, newlib,...
> then arm maintains an optimized-routines repo on github for that
> purpose and you are welcome to contribute your changes there too.
>
> >
> > Sorry for the email confusion, I realized the mess quite late: my git
> > client was configured with yandex and the commit was set up with the
> > google's account. Yandex has nothing to do with this work. If needed, I
> can
> > recreate the patch
>
> please do, we should not commit pastebin links into the git history.
> (just list the measured improvements over the old code in the commit
> message, if it's too long then select representative measurements
> or aggregate them in some other way.)
>
> >
> > On Tue, Jun 21, 2022, 10:08 Szabolcs Nagy <Szabolcs.Nagy@arm.com> wrote:
> >
> > > The 06/20/2022 17:46, Danila Kutenin wrote:
> > > > From: Danila Kutenin <kutdanila@yandex.ru>
> > > >
> > > > We found that string functions were using AND+ADDP
> > > > to find the nibble/syndrome mask but there is an easier
> > > > opportunity through `SHRN dst, src, 4` and has same
> > > > latency on all SIMD ARMv8 targets as ADDP. There are also
> > > > gaps for memcmp but that's probably for another patch
> > > >
> > > > We see 10-20% savings for small-mid size cases which are
> > > > primary cases for general workloads https://pastebin.com/hA5Fd8eM
>
> this is good improvement.
> we will do some checks (on various cpus).
>
> thanks.
>
>
> > > >
> > > > I don't have commit rights, asking maintainers to do that
> > > >
> > > > Signed-off-by: Danila Kutenin <danilak@google.com>
> > >
> > > is this a contribution from google or yandex or personal?
> > >
> > > (e.g. if your company has copyright assignment with fsf then
> > > you dont need signed-off-by, otherwise it's better to have
> > > the email address consistent with the author address)
> > >
> > > > ---
> > > > sysdeps/aarch64/memchr.S | 19 +++++++------------
> > > > sysdeps/aarch64/memrchr.S | 25 +++++++++----------------
> > > > sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
> > > > sysdeps/aarch64/strcpy.S | 32 ++++++++++++--------------------
> > > > sysdeps/aarch64/strlen.S | 25 +++++++++----------------
> > > > sysdeps/aarch64/strnlen.S | 25 +++++++++----------------
> > > > 6 files changed, 57 insertions(+), 98 deletions(-)
> > > >
> > > > diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> > > > index b060eee97d..b983489491 100644
> > > > --- a/sysdeps/aarch64/memchr.S
> > > > +++ b/sysdeps/aarch64/memchr.S
> > > > @@ -53,12 +53,11 @@
> > > >
> > > > /*
> > > > Core algorithm:
> > > > - For each 16-byte chunk we calculate a 64-bit syndrome value with
> > > four bits
> > > > - per byte. For even bytes, bits 0-3 are set if the relevant byte
> > > matched the
> > > > - requested character or the byte is NUL. Bits 4-7 must be zero.
> Bits
> > > 4-7 are
> > > > - set likewise for odd bytes so that adjacent bytes can be merged.
> > > Since the
> > > > - bits in the syndrome reflect the order in which things occur in
> the
> > > original
> > > > - string, counting trailing zeros identifies exactly which byte
> > > matched. */
> > > > + For each 16-byte chunk we calculate a 64-bit nibble mask value
> with
> > > four bits
> > > > + per byte. We take 4 bits of every comparison byte with shift
> right
> > > and narrow
> > > > + by 4 instruction. Since the bits in the nibble mask reflect the
> > > order in
> > > > + which things occur in the original string, counting leading zeros
> > > identifies
> > > > + exactly which byte matched. */
> > > >
> > > > ENTRY (MEMCHR)
> > > > PTR_ARG (0)
> > > > @@ -67,12 +66,9 @@ ENTRY (MEMCHR)
> > > > cbz cntin, L(nomatch)
> > > > ld1 {vdata.16b}, [src]
> > > > dup vrepchr.16b, chrin
> > > > - mov wtmp, 0xf00f
> > > > - dup vrepmask.8h, wtmp
> > > > cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> > > > lsl shift, srcin, 2
> > > > - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_chr.16b, vhas_chr.16b /*
> 128->64
> > > */
> > > > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> > > > fmov synd, dend
> > > > lsr synd, synd, shift
> > > > cbz synd, L(start_loop)
> > > > @@ -111,8 +107,7 @@ L(loop32_2):
> > > > fmov synd, dend
> > > > cbz synd, L(loop32)
> > > > L(end):
> > > > - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_chr.16b, vhas_chr.16b /*
> 128->64
> > > */
> > > > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> > > > fmov synd, dend
> > > > add tmp, srcin, cntin
> > > > sub cntrem, tmp, src
> > > > diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> > > > index e0efbad91c..5179320720 100644
> > > > --- a/sysdeps/aarch64/memrchr.S
> > > > +++ b/sysdeps/aarch64/memrchr.S
> > > > @@ -37,7 +37,6 @@
> > > > #define synd x5
> > > > #define shift x6
> > > > #define tmp x7
> > > > -#define wtmp w7
> > > > #define end x8
> > > > #define endm1 x9
> > > >
> > > > @@ -45,18 +44,16 @@
> > > > #define qdata q1
> > > > #define vdata v1
> > > > #define vhas_chr v2
> > > > -#define vrepmask v3
> > > > -#define vend v4
> > > > -#define dend d4
> > > > +#define vend v3
> > > > +#define dend d3
> > > >
> > > > /*
> > > > Core algorithm:
> > > > - For each 16-byte chunk we calculate a 64-bit syndrome value with
> > > four bits
> > > > - per byte. For even bytes, bits 0-3 are set if the relevant byte
> > > matched the
> > > > - requested character or the byte is NUL. Bits 4-7 must be zero.
> Bits
> > > 4-7 are
> > > > - set likewise for odd bytes so that adjacent bytes can be merged.
> > > Since the
> > > > - bits in the syndrome reflect the order in which things occur in
> the
> > > original
> > > > - string, counting trailing zeros identifies exactly which byte
> > > matched. */
> > > > + For each 16-byte chunk we calculate a 64-bit nibble mask value
> with
> > > four bits
> > > > + per byte. We take 4 bits of every comparison byte with shift
> right
> > > and narrow
> > > > + by 4 instruction. Since the bits in the nibble mask reflect the
> > > order in
> > > > + which things occur in the original string, counting leading zeros
> > > identifies
> > > > + exactly which byte matched. */
> > > >
> > > > ENTRY (__memrchr)
> > > > PTR_ARG (0)
> > > > @@ -67,12 +64,9 @@ ENTRY (__memrchr)
> > > > cbz cntin, L(nomatch)
> > > > ld1 {vdata.16b}, [src]
> > > > dup vrepchr.16b, chrin
> > > > - mov wtmp, 0xf00f
> > > > - dup vrepmask.8h, wtmp
> > > > cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> > > > neg shift, end, lsl 2
> > > > - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_chr.16b, vhas_chr.16b /*
> 128->64
> > > */
> > > > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> > > > fmov synd, dend
> > > > lsl synd, synd, shift
> > > > cbz synd, L(start_loop)
> > > > @@ -109,8 +103,7 @@ L(loop32_2):
> > > > fmov synd, dend
> > > > cbz synd, L(loop32)
> > > > L(end):
> > > > - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_chr.16b, vhas_chr.16b /*
> 128->64
> > > */
> > > > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> > > > fmov synd, dend
> > > >
> > > > add tmp, src, 15
> > > > diff --git a/sysdeps/aarch64/strchrnul.S
> b/sysdeps/aarch64/strchrnul.S
> > > > index 442726fd49..ee154ab74b 100644
> > > > --- a/sysdeps/aarch64/strchrnul.S
> > > > +++ b/sysdeps/aarch64/strchrnul.S
> > > > @@ -33,38 +33,32 @@
> > > > #define src x2
> > > > #define tmp1 x1
> > > > #define tmp2 x3
> > > > -#define tmp2w w3
> > > >
> > > > #define vrepchr v0
> > > > #define vdata v1
> > > > #define qdata q1
> > > > #define vhas_nul v2
> > > > #define vhas_chr v3
> > > > -#define vrepmask v4
> > > > -#define vend v5
> > > > -#define dend d5
> > > > +#define vend v4
> > > > +#define dend d4
> > > >
> > > > -/* Core algorithm:
> > > > -
> > > > - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> > > > - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> > > > - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> > > > - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> > > > - bits in the syndrome reflect the order in which things occur in the original
> > > > - string, counting trailing zeros identifies exactly which byte matched. */
> > > > +/*
> > > > + Core algorithm:
> > > > + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> > > > + per byte. We take 4 bits of every comparison byte with shift right and narrow
> > > > + by 4 instruction. Since the bits in the nibble mask reflect the order in
> > > > + which things occur in the original string, counting leading zeros identifies
> > > > + exactly which byte matched. */
> > > >
> > > > ENTRY (__strchrnul)
> > > > PTR_ARG (0)
> > > > bic src, srcin, 15
> > > > dup vrepchr.16b, chrin
> > > > ld1 {vdata.16b}, [src]
> > > > - mov tmp2w, 0xf00f
> > > > - dup vrepmask.8h, tmp2w
> > > > cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> > > > cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
> > > > lsl tmp2, srcin, 2
> > > > - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> > > > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> > > > fmov tmp1, dend
> > > > lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
> > > > cbz tmp1, L(loop)
> > > > @@ -83,8 +77,7 @@ L(loop):
> > > > fmov tmp1, dend
> > > > cbz tmp1, L(loop)
> > > >
> > > > - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> > > > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> > > > fmov tmp1, dend
> > > > #ifndef __AARCH64EB__
> > > > rbit tmp1, tmp1
> > > > diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> > > > index da53170ece..78d27b4aa6 100644
> > > > --- a/sysdeps/aarch64/strcpy.S
> > > > +++ b/sysdeps/aarch64/strcpy.S
> > > > @@ -40,7 +40,6 @@
> > > > #define len x4
> > > > #define synd x4
> > > > #define tmp x5
> > > > -#define wtmp w5
> > > > #define shift x5
> > > > #define data1 x6
> > > > #define dataw1 w6
> > > > @@ -50,9 +49,8 @@
> > > > #define dataq q0
> > > > #define vdata v0
> > > > #define vhas_nul v1
> > > > -#define vrepmask v2
> > > > -#define vend v3
> > > > -#define dend d3
> > > > +#define vend v2
> > > > +#define dend d2
> > > > #define dataq2 q1
> > > >
> > > > #ifdef BUILD_STPCPY
> > > > @@ -63,34 +61,29 @@
> > > > # define IFSTPCPY(X,...)
> > > > #endif
> > > >
> > > > -/* Core algorithm:
> > > > -
> > > > - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> > > > - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> > > > - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> > > > - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> > > > - bits in the syndrome reflect the order in which things occur in the original
> > > > - string, counting trailing zeros identifies exactly which byte matched. */
> > > > +/*
> > > > + Core algorithm:
> > > > + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> > > > + per byte. We take 4 bits of every comparison byte with shift right and narrow
> > > > + by 4 instruction. Since the bits in the nibble mask reflect the order in
> > > > + which things occur in the original string, counting leading zeros identifies
> > > > + exactly which byte matched. */
> > > >
> > > > ENTRY (STRCPY)
> > > > PTR_ARG (0)
> > > > PTR_ARG (1)
> > > > bic src, srcin, 15
> > > > - mov wtmp, 0xf00f
> > > > ld1 {vdata.16b}, [src]
> > > > - dup vrepmask.8h, wtmp
> > > > cmeq vhas_nul.16b, vdata.16b, 0
> > > > lsl shift, srcin, 2
> > > > - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_nul.16b, vhas_nul.16b
> > > > + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> > > > fmov synd, dend
> > > > lsr synd, synd, shift
> > > > cbnz synd, L(tail)
> > > >
> > > > ldr dataq, [src, 16]!
> > > > cmeq vhas_nul.16b, vdata.16b, 0
> > > > - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_nul.16b, vhas_nul.16b
> > > > + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> > > > fmov synd, dend
> > > > cbz synd, L(start_loop)
> > > >
> > > > @@ -162,8 +155,7 @@ L(loop):
> > > > fmov synd, dend
> > > > cbz synd, L(loop)
> > > >
> > > > - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> > > > + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> > > > fmov synd, dend
> > > > #ifndef __AARCH64EB__
> > > > rbit synd, synd
> > > > diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> > > > index a2310871c2..3a5d088407 100644
> > > > --- a/sysdeps/aarch64/strlen.S
> > > > +++ b/sysdeps/aarch64/strlen.S
> > > > @@ -34,35 +34,29 @@
> > > > #define src x1
> > > > #define synd x2
> > > > #define tmp x3
> > > > -#define wtmp w3
> > > > #define shift x4
> > > >
> > > > #define data q0
> > > > #define vdata v0
> > > > #define vhas_nul v1
> > > > -#define vrepmask v2
> > > > -#define vend v3
> > > > -#define dend d3
> > > > +#define vend v2
> > > > +#define dend d2
> > > >
> > > > /* Core algorithm:
> > > >
> > > > - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> > > > - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> > > > - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> > > > - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> > > > - bits in the syndrome reflect the order in which things occur in the original
> > > > - string, counting trailing zeros identifies exactly which byte matched. */
> > > > + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> > > > + per byte. We take 4 bits of every comparison byte with shift right and narrow
> > > > + by 4 instruction. Since the bits in the nibble mask reflect the order in
> > > > + which things occur in the original string, counting trailing zeros identifies
> > > > + exactly which byte matched. */
> > > >
> > > > ENTRY (STRLEN)
> > > > PTR_ARG (0)
> > > > bic src, srcin, 15
> > > > - mov wtmp, 0xf00f
> > > > ld1 {vdata.16b}, [src]
> > > > - dup vrepmask.8h, wtmp
> > > > cmeq vhas_nul.16b, vdata.16b, 0
> > > > lsl shift, srcin, 2
> > > > - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> > > > + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> > > > fmov synd, dend
> > > > lsr synd, synd, shift
> > > > cbz synd, L(loop)
> > > > @@ -80,8 +74,7 @@ L(loop):
> > > > fmov synd, dend
> > > > cbz synd, L(loop)
> > > >
> > > > - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> > > > + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> > > > sub result, src, srcin
> > > > fmov synd, dend
> > > > #ifndef __AARCH64EB__
> > > > diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> > > > index 0dbecb0ce9..282bddc9aa 100644
> > > > --- a/sysdeps/aarch64/strnlen.S
> > > > +++ b/sysdeps/aarch64/strnlen.S
> > > > @@ -33,39 +33,33 @@
> > > > #define src x2
> > > > #define synd x3
> > > > #define shift x4
> > > > -#define wtmp w4
> > > > #define tmp x4
> > > > #define cntrem x5
> > > >
> > > > #define qdata q0
> > > > #define vdata v0
> > > > #define vhas_chr v1
> > > > -#define vrepmask v2
> > > > -#define vend v3
> > > > -#define dend d3
> > > > +#define vend v2
> > > > +#define dend d2
> > > >
> > > > /*
> > > > Core algorithm:
> > > >
> > > > - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> > > > - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> > > > - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> > > > - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> > > > - bits in the syndrome reflect the order in which things occur in the original
> > > > - string, counting trailing zeros identifies exactly which byte matched. */
> > > > + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> > > > + per byte. We take 4 bits of every comparison byte with shift right and narrow
> > > > + by 4 instruction. Since the bits in the nibble mask reflect the order in
> > > > + which things occur in the original string, counting trailing zeros identifies
> > > > + exactly which byte matched. */
> > > >
> > > > ENTRY (__strnlen)
> > > > PTR_ARG (0)
> > > > SIZE_ARG (1)
> > > > bic src, srcin, 15
> > > > - mov wtmp, 0xf00f
> > > > cbz cntin, L(nomatch)
> > > > ld1 {vdata.16b}, [src], 16
> > > > - dup vrepmask.8h, wtmp
> > > > cmeq vhas_chr.16b, vdata.16b, 0
> > > > lsl shift, srcin, 2
> > > > - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> > > > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> > > > fmov synd, dend
> > > > lsr synd, synd, shift
> > > > cbz synd, L(start_loop)
> > > > @@ -103,8 +97,7 @@ L(loop32_2):
> > > > cbz synd, L(loop32)
> > > >
> > > > L(end):
> > > > - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> > > > - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> > > > + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> > > > sub src, src, 16
> > > > mov synd, vend.d[0]
> > > > sub result, src, srcin
> > > > --
> > > > 2.37.0.rc0.104.g0611611a94-goog
> > > >
> > >
>
The 06/22/2022 08:31, Danila Kutenin wrote:
> >
> > note that if you are interested in getting the same improvements
> > into bionic, musl, llvm-libc, newlib,...
> > then arm maintains an optimized-routines repo on github for that
> > purpose and you are welcome to contribute your changes there too.
>
>
> We have communicated with llvm-libc and bionic; we will reach out to the
> others, thanks!
note that many projects take code from arm optimized-routines; we
don't have an exact list. e.g. bionic uses it directly, so if you
fork that code, merging fixes later will be more expensive.
> please do, we should not commit pastebin links into the git history.
> > (just list the measured improvements over the old code in the commit
> > message, if it's too long then select representative measurements
> > or aggregate them in some other way.)
>
>
> Created a new one; this one should be abandoned. My employer still recommended
> that I add a Signed-off-by line at the end.
I will wait for somebody from glibc or the FSF to confirm that
it is ok to commit this way.
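
As an aside for readers following the algorithm comments in the patch hunks
below: here is a small illustrative C sketch (not part of the patch, and not
glibc code) of the SHRN-based nibble-mask reduction, written with ACLE NEON
intrinsics. The helper name first_match_index and the test buffer are invented
for this example, and a little-endian target is assumed; the real routines
additionally handle big-endian with RBIT and discard bytes before an unaligned
start by shifting the syndrome (the "lsl shift, srcin, 2" followed by
"lsr synd, synd, shift" visible in the hunks).

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Return the index of the first byte equal to c in a 16-byte chunk,
   or 16 if there is none, using the nibble-mask idea from the patch.  */
static int
first_match_index (const uint8_t *chunk, uint8_t c)
{
  uint8x16_t data = vld1q_u8 (chunk);
  uint8x16_t has_chr = vceqq_u8 (data, vdupq_n_u8 (c)); /* 0xff per match.  */
  /* SHRN #4: shift each 16-bit lane right by 4 and narrow to 8 bits,
     keeping 4 bits per source byte, in source order.  */
  uint8x8_t nibble_mask = vshrn_n_u16 (vreinterpretq_u16_u8 (has_chr), 4);
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nibble_mask), 0);
  if (synd == 0)
    return 16;
  return __builtin_ctzll (synd) >> 2; /* 4 syndrome bits per byte.  */
}

int
main (void)
{
  uint8_t buf[16] = "abcdefgXijklmno";
  printf ("%d\n", first_match_index (buf, 'X')); /* Prints 7.  */
  return 0;
}
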
diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
@@ -53,12 +53,11 @@
/*
Core algorithm:
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (MEMCHR)
PTR_ARG (0)
@@ -67,12 +66,9 @@ ENTRY (MEMCHR)
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
- mov wtmp, 0xf00f
- dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
lsl shift, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(start_loop)
@@ -111,8 +107,7 @@ L(loop32_2):
fmov synd, dend
cbz synd, L(loop32)
L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
add tmp, srcin, cntin
sub cntrem, tmp, src
diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
@@ -37,7 +37,6 @@
#define synd x5
#define shift x6
#define tmp x7
-#define wtmp w7
#define end x8
#define endm1 x9
@@ -45,18 +44,16 @@
#define qdata q1
#define vdata v1
#define vhas_chr v2
-#define vrepmask v3
-#define vend v4
-#define dend d4
+#define vend v3
+#define dend d3
/*
Core algorithm:
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__memrchr)
PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (__memrchr)
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
- mov wtmp, 0xf00f
- dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
neg shift, end, lsl 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsl synd, synd, shift
cbz synd, L(start_loop)
@@ -109,8 +103,7 @@ L(loop32_2):
fmov synd, dend
cbz synd, L(loop32)
L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
add tmp, src, 15
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
@@ -33,38 +33,32 @@
#define src x2
#define tmp1 x1
#define tmp2 x3
-#define tmp2w w3
#define vrepchr v0
#define vdata v1
#define qdata q1
#define vhas_nul v2
#define vhas_chr v3
-#define vrepmask v4
-#define vend v5
-#define dend d5
+#define vend v4
+#define dend d4
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+/*
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__strchrnul)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
- mov tmp2w, 0xf00f
- dup vrepmask.8h, tmp2w
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
lsl tmp2, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov tmp1, dend
lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
cbz tmp1, L(loop)
@@ -83,8 +77,7 @@ L(loop):
fmov tmp1, dend
cbz tmp1, L(loop)
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov tmp1, dend
#ifndef __AARCH64EB__
rbit tmp1, tmp1
diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
@@ -40,7 +40,6 @@
#define len x4
#define synd x4
#define tmp x5
-#define wtmp w5
#define shift x5
#define data1 x6
#define dataw1 w6
@@ -50,9 +49,8 @@
#define dataq q0
#define vdata v0
#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define vend v2
+#define dend d2
#define dataq2 q1
#ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@
# define IFSTPCPY(X,...)
#endif
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+/*
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (STRCPY)
PTR_ARG (0)
PTR_ARG (1)
bic src, srcin, 15
- mov wtmp, 0xf00f
ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbnz synd, L(tail)
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
cbz synd, L(start_loop)
@@ -162,8 +155,7 @@ L(loop):
fmov synd, dend
cbz synd, L(loop)
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
@@ -34,35 +34,29 @@
#define src x1
#define synd x2
#define tmp x3
-#define wtmp w3
#define shift x4
#define data q0
#define vdata v0
#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define vend v2
+#define dend d2
/* Core algorithm:
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting trailing zeros identifies
+ exactly which byte matched. */
ENTRY (STRLEN)
PTR_ARG (0)
bic src, srcin, 15
- mov wtmp, 0xf00f
ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(loop)
@@ -80,8 +74,7 @@ L(loop):
fmov synd, dend
cbz synd, L(loop)
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
sub result, src, srcin
fmov synd, dend
#ifndef __AARCH64EB__
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
@@ -33,39 +33,33 @@
#define src x2
#define synd x3
#define shift x4
-#define wtmp w4
#define tmp x4
#define cntrem x5
#define qdata q0
#define vdata v0
#define vhas_chr v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define vend v2
+#define dend d2
/*
Core algorithm:
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting trailing zeros identifies
+ exactly which byte matched. */
ENTRY (__strnlen)
PTR_ARG (0)
SIZE_ARG (1)
bic src, srcin, 15
- mov wtmp, 0xf00f
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src], 16
- dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, 0
lsl shift, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(start_loop)
@@ -103,8 +97,7 @@ L(loop32_2):
cbz synd, L(loop32)
L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
sub src, src, 16
mov synd, vend.d[0]
sub result, src, srcin
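
For completeness, an equally illustrative sketch (again NEON intrinsics, not
glibc code) of the AND+ADDP reduction that the patch removes, next to the SHRN
form that replaces it. On a little-endian target both produce the same 64-bit
syndrome with 4 bits per source byte in source order, which is why a single
SHRN can stand in for the MOV+DUP+AND+ADDP sequence.

#include <arm_neon.h>
#include <stdint.h>

/* has_chr holds 0xff for matching bytes and 0x00 otherwise.  */

/* Old reduction: mask with repeated 0xf00f, then pairwise-add (ADDP)
   adjacent bytes so each even/odd byte pair collapses into one byte
   carrying both nibbles.  */
static uint64_t
syndrome_and_addp (uint8x16_t has_chr)
{
  uint8x16_t repmask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
  uint8x16_t masked = vandq_u8 (has_chr, repmask);
  uint8x8_t merged = vpadd_u8 (vget_low_u8 (masked), vget_high_u8 (masked));
  return vget_lane_u64 (vreinterpret_u64_u8 (merged), 0);
}

/* New reduction: one shift-right-and-narrow by 4 over the 16-bit lanes.  */
static uint64_t
syndrome_shrn (uint8x16_t has_chr)
{
  uint8x8_t narrowed = vshrn_n_u16 (vreinterpretq_u16_u8 (has_chr), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (narrowed), 0);
}
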