[v12,18/31] arm: Add string-fza.h

Message ID 20230202181149.2181553-19-adhemerval.zanella@linaro.org
State Committed
Headers
Series Improve generic string routines |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Adhemerval Zanella Netto Feb. 2, 2023, 6:11 p.m. UTC
  From: Richard Henderson <richard.henderson@linaro.org>

While arm has the more important string functions in assembly,
there are still a few generic routines used.

Use the UQSUB8 insn for testing of zeros.

Checked on armv7-linux-gnueabihf
Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
---
 sysdeps/arm/armv6t2/string-fza.h | 68 ++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 sysdeps/arm/armv6t2/string-fza.h
  

Comments

Szabolcs Nagy Feb. 20, 2023, 1:24 p.m. UTC | #1
The 02/02/2023 15:11, Adhemerval Zanella via Libc-alpha wrote:
> From: Richard Henderson <richard.henderson@linaro.org>
> 
> While arm has the more important string functions in assembly,
> there are still a few generic routines used.
> 
> Use the UQSUB8 insn for testing of zeros.
> 
> Checked on armv7-linux-gnueabihf
> Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
> ---
>  sysdeps/arm/armv6t2/string-fza.h | 68 ++++++++++++++++++++++++++++++++
...
> +static __always_inline find_t
> +find_zero_all (op_t x)
> +{
> +  /* Use unsigned saturated subtraction from 1 in each byte.
> +     That leaves 1 for every byte that was zero.  */
> +  op_t ones = repeat_bytes (0x01);
> +  return __builtin_arm_uqsub8 (ones, x);
> +}

__builtin_arm_uqsub8 is only available >=gcc-10

so now the build fails with gcc-9

../sysdeps/arm/armv6t2/string-fza.h:36:10: error: implicit declaration of function ‘__builtin_arm_uqsub8’; did you mean ‘__builtin_arm_stc’? [-Werror=implicit-function-declaration]
   36 |   return __builtin_arm_uqsub8 (ones, x);

so this code should be conditional on gcc version.
  
Szabolcs Nagy Feb. 20, 2023, 1:45 p.m. UTC | #2
The 02/20/2023 13:24, Szabolcs Nagy via Libc-alpha wrote:
> The 02/02/2023 15:11, Adhemerval Zanella via Libc-alpha wrote:
> > +static __always_inline find_t
> > +find_zero_all (op_t x)
> > +{
> > +  /* Use unsigned saturated subtraction from 1 in each byte.
> > +     That leaves 1 for every byte that was zero.  */
> > +  op_t ones = repeat_bytes (0x01);
> > +  return __builtin_arm_uqsub8 (ones, x);
> > +}
> 
> __builtin_arm_uqsub8 is only available >=gcc-10
> 
> so now the build fails with gcc-9
> 
> ../sysdeps/arm/armv6t2/string-fza.h:36:10: error: implicit declaration of function ‘__builtin_arm_uqsub8’; did you mean ‘__builtin_arm_stc’? [-Werror=implicit-function-declaration]
>    36 |   return __builtin_arm_uqsub8 (ones, x);
> 
> so this code should be conditional on gcc version.
> 

i think

 asm ("uqsub8  %0, %0, %1" : "+r" (ones) : "r" (x));

should be a good fallback (untested).
  
Adhemerval Zanella Netto Feb. 20, 2023, 2:01 p.m. UTC | #3
On 20/02/23 10:45, Szabolcs Nagy wrote:
> The 02/20/2023 13:24, Szabolcs Nagy via Libc-alpha wrote:
>> The 02/02/2023 15:11, Adhemerval Zanella via Libc-alpha wrote:
>>> +static __always_inline find_t
>>> +find_zero_all (op_t x)
>>> +{
>>> +  /* Use unsigned saturated subtraction from 1 in each byte.
>>> +     That leaves 1 for every byte that was zero.  */
>>> +  op_t ones = repeat_bytes (0x01);
>>> +  return __builtin_arm_uqsub8 (ones, x);
>>> +}
>>
>> __builtin_arm_uqsub8 is only available >=gcc-10
>>
>> so now the build fails with gcc-9
>>
>> ../sysdeps/arm/armv6t2/string-fza.h:36:10: error: implicit declaration of function ‘__builtin_arm_uqsub8’; did you mean ‘__builtin_arm_stc’? [-Werror=implicit-function-declaration]
>>    36 |   return __builtin_arm_uqsub8 (ones, x);
>>
>> so this code should be conditional on gcc version.
>>
> 
> i think
> 
>  asm ("uqsub8  %0, %0, %1" : "+r" (ones) : "r" (x));
> 
> should be a good fallback (untested).

This is what we have on v7:

static __always_inline op_t
find_zero_all (op_t x)
{
  /* Use unsigned saturated subtraction from 1 in each byte.
     That leaves 1 for every byte that was zero.  */
  op_t ret, ones = repeat_bytes (0x01);
  asm ("uqsub8 %0,%1,%2" : "=r"(ret) : "r"(ones), "r"(x));
  return ret;
}

Maybe extend with:

static __always_inline op_t
find_zero_all (op_t x)
{
  op_t ones = repeat_bytes (0x01);
#if __GNUC_PREREQ (10, 0)
  return __builtin_arm_uqsub8 (ones, x);
#else
  op_t ret;
  asm ("uqsub8 %0,%1,%2" : "=r"(ret) : "r"(ones), "r"(x));
  return ret;
#endif
}
  
Szabolcs Nagy Feb. 20, 2023, 4:12 p.m. UTC | #4
The 02/20/2023 11:01, Adhemerval Zanella Netto wrote:
> 
> 
> On 20/02/23 10:45, Szabolcs Nagy wrote:
> > The 02/20/2023 13:24, Szabolcs Nagy via Libc-alpha wrote:
> >> The 02/02/2023 15:11, Adhemerval Zanella via Libc-alpha wrote:
> >>> +static __always_inline find_t
> >>> +find_zero_all (op_t x)
> >>> +{
> >>> +  /* Use unsigned saturated subtraction from 1 in each byte.
> >>> +     That leaves 1 for every byte that was zero.  */
> >>> +  op_t ones = repeat_bytes (0x01);
> >>> +  return __builtin_arm_uqsub8 (ones, x);
> >>> +}
> >>
> >> __builtin_arm_uqsub8 is only available >=gcc-10
> >>
> >> so now the build fails with gcc-9
> >>
> >> ../sysdeps/arm/armv6t2/string-fza.h:36:10: error: implicit declaration of function ‘__builtin_arm_uqsub8’; did you mean ‘__builtin_arm_stc’? [-Werror=implicit-function-declaration]
> >>    36 |   return __builtin_arm_uqsub8 (ones, x);
> >>
> >> so this code should be conditional on gcc version.
> >>
> > 
> > i think
> > 
> >  asm ("uqsub8  %0, %0, %1" : "+r" (ones) : "r" (x));
> > 
> > should be a good fallback (untested).
> 
> This is what we have on v7:
> 
> static __always_inline op_t
> find_zero_all (op_t x)
> {
>   /* Use unsigned saturated subtraction from 1 in each byte.
>      That leaves 1 for every byte that was zero.  */
>   op_t ret, ones = repeat_bytes (0x01);
>   asm ("uqsub8 %0,%1,%2" : "=r"(ret) : "r"(ones), "r"(x));
>   return ret;
> }
> 
> Maybe extend with:
> 
> static __always_inline op_t
> find_zero_all (op_t x)
> {
>   op_t ones = repeat_bytes (0x01);
> #if __GNUC_PREREQ (10, 0)
>   return __builtin_arm_uqsub8 (ones, x);
> #else
>   op_t ret;
>   asm ("uqsub8 %0,%1,%2" : "=r"(ret) : "r"(ones), "r"(x));
>   return ret;
> #endif
> }

yes this looks good to me.
  

Patch

diff --git a/sysdeps/arm/armv6t2/string-fza.h b/sysdeps/arm/armv6t2/string-fza.h
new file mode 100644
index 0000000000..50afaba038
--- /dev/null
+++ b/sysdeps/arm/armv6t2/string-fza.h
@@ -0,0 +1,68 @@ 
+/* Zero byte detection; basics.  ARM version.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _STRING_FZA_H
+#define _STRING_FZA_H 1
+
+#include <string-misc.h>
+#include <string-optype.h>
+
+/* Every find_* function in this file returns a byte mask of this type.  */
+typedef op_t find_t;
+
+/* Return a mask holding 0x01 in every byte position where X is zero.  */
+static __always_inline find_t
+find_zero_all (op_t x)
+{
+  /* UQSUB8 computes a saturating 1 - X per byte: 1 for a zero byte, else 0.
+     Use inline asm, since __builtin_arm_uqsub8 requires GCC 10 or later.  */
+  op_t ret, ones = repeat_bytes (0x01);
+  asm ("uqsub8 %0,%1,%2" : "=r"(ret) : "r"(ones), "r"(x));
+  return ret;
+}
+
+/* Identify bytes that are equal between X1 and X2 (their XOR is zero).  */
+static __always_inline find_t
+find_eq_all (op_t x1, op_t x2)
+{
+  return find_zero_all (x1 ^ x2);
+}
+
+/* Flag bytes that are zero in X1 or that match between X1 and X2.  */
+static __always_inline find_t
+find_zero_eq_all (op_t x1, op_t x2)
+{
+  return find_zero_all (x1) | find_eq_all (x1, x2);
+}
+
+/* Flag bytes that are zero in X1 or that differ between X1 and X2.  */
+static __always_inline find_t
+find_zero_ne_all (op_t x1, op_t x2)
+{
+  /* XOR with ONES flips the equality mask; ONES is already in a register.  */
+  op_t differ = find_eq_all (x1, x2) ^ repeat_bytes (0x01);
+  return find_zero_all (x1) | differ;
+}
+
+/* The "inexact" (_low) variants simply alias the exact (_all) versions.  */
+#define find_zero_low		find_zero_all
+#define find_eq_low		find_eq_all
+#define find_zero_eq_low	find_zero_eq_all
+#define find_zero_ne_low	find_zero_ne_all
+
+#endif /* _STRING_FZA_H */