[RFC,3/3] RISC-V: Implement CPU yielding for busy loops with Zihintpause/Zawrs

Message ID 20240418094635.3502009-4-christoph.muellner@vrull.eu
State Rejected
Headers
Series RISC-V: Use WRS.STO for atomic_spin_nop |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 fail Testing failed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm fail Testing failed

Commit Message

Christoph Müllner April 18, 2024, 9:46 a.m. UTC
  The macro atomic_spin_nop can be used to implement arch-specific
CPU yielding that is used in busy loops (e.g. in pthread_spin_lock).
This patch introduces an ifunc-based implementation for RISC-V,
that uses Zihintpause's PAUSE instruction for that matter (as PAUSE
is a HINT instruction, there is no dependency on Zihintpause at
runtime).  Further, we test for Zawrs via hwprobe() and if found
we use WRS.STO instead of PAUSE.

Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
---
 sysdeps/riscv/multiarch/cpu-relax_generic.S   | 31 +++++++++++++++
 sysdeps/riscv/multiarch/cpu-relax_zawrs.S     | 28 +++++++++++++
 .../unix/sysv/linux/riscv/atomic-machine.h    |  3 ++
 .../unix/sysv/linux/riscv/multiarch/Makefile  |  8 ++++
 .../sysv/linux/riscv/multiarch/cpu-relax.c    | 39 +++++++++++++++++++
 .../linux/riscv/multiarch/ifunc-impl-list.c   | 32 +++++++++++++--
 6 files changed, 137 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/riscv/multiarch/cpu-relax_generic.S
 create mode 100644 sysdeps/riscv/multiarch/cpu-relax_zawrs.S
 create mode 100644 sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c
  

Comments

Palmer Dabbelt April 18, 2024, 5:17 p.m. UTC | #1
On Thu, 18 Apr 2024 02:46:35 PDT (-0700), christoph.muellner@vrull.eu wrote:
> The macro atomic_spin_nop can be used to implement arch-specific
> CPU yielding that is used in busy loops (e.g. in pthread_spin_lock).
> This patch introduces an ifunc-based implementation for RISC-V,
> that uses Zihintpause's PAUSE instruction for that matter (as PAUSE
> is a HINT instruction there is not dependency to Zihintpause at
> runtime).  Further, we test for Zawrs via hwprobe() and if found
> we use WRS.STO instead of PAUSE.
>
> Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
> ---
>  sysdeps/riscv/multiarch/cpu-relax_generic.S   | 31 +++++++++++++++
>  sysdeps/riscv/multiarch/cpu-relax_zawrs.S     | 28 +++++++++++++
>  .../unix/sysv/linux/riscv/atomic-machine.h    |  3 ++
>  .../unix/sysv/linux/riscv/multiarch/Makefile  |  8 ++++
>  .../sysv/linux/riscv/multiarch/cpu-relax.c    | 39 +++++++++++++++++++
>  .../linux/riscv/multiarch/ifunc-impl-list.c   | 32 +++++++++++++--
>  6 files changed, 137 insertions(+), 4 deletions(-)
>  create mode 100644 sysdeps/riscv/multiarch/cpu-relax_generic.S
>  create mode 100644 sysdeps/riscv/multiarch/cpu-relax_zawrs.S
>  create mode 100644 sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c
>
> diff --git a/sysdeps/riscv/multiarch/cpu-relax_generic.S b/sysdeps/riscv/multiarch/cpu-relax_generic.S
> new file mode 100644
> index 0000000000..d3ccfdce84
> --- /dev/null
> +++ b/sysdeps/riscv/multiarch/cpu-relax_generic.S
> @@ -0,0 +1,31 @@
> +/* CPU strand yielding for busy loops.  RISC-V version.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include <sys/asm.h>
> +
> +.option push
> +.option arch, +zihintpause
> +ENTRY (__cpu_relax_generic)
> +	/* While we can use the `pause` instruction without
> +	   the need of Zihintpause (because it is a HINT instruction),
> +	   we still have to enable Zihintpause for the assembler.  */
> +	pause
> +	ret
> +END (__cpu_relax_generic)
> +.option pop
> diff --git a/sysdeps/riscv/multiarch/cpu-relax_zawrs.S b/sysdeps/riscv/multiarch/cpu-relax_zawrs.S
> new file mode 100644
> index 0000000000..6d27b354df
> --- /dev/null
> +++ b/sysdeps/riscv/multiarch/cpu-relax_zawrs.S
> @@ -0,0 +1,28 @@
> +/* CPU strand yielding for busy loops.  RISC-V version with Zawrs.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include <sys/asm.h>
> +
> +.option push
> +.option arch, +zawrs
> +ENTRY (__cpu_relax_zawrs)
> +	wrs.sto

This has the same forward progress/eventual success violation as the 
code you sent for GCC and Linux does.  It doesn't really matter if the 
user of the reservation is in a builtin, an asm block, or a function.  
The compiler just doesn't know about those reservation rules and isn't 
going to generate code that follows them.

That said, as far as I can tell the same grey area in the WRS spec that 
I pointed out during one of the Linux reviews still exists.   So if 
there's some hardware that has a more generous set of LR->WRS rules than 
the ISA-required LR->SC rules that's fine, we just need to know what the 
hardware actually is so we can write down the rules and then stick to 
them -- it's just a performance thing for WRS, so having weaker rules 
seems entirely plausible.

I don't know of any hardware with WRS, but sorry if I missed one.  Do 
you have something publicly available that behaves this way?

> +	ret
> +END (__cpu_relax_zawrs)
> +.option pop
> diff --git a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> index c1c9d949a0..02b9b7a421 100644
> --- a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> +++ b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> @@ -178,4 +178,7 @@
>  # error "ISAs that do not subsume the A extension are not supported"
>  #endif /* !__riscv_atomic */
>
> +extern void __cpu_relax (void);
> +#define atomic_spin_nop() __cpu_relax()
> +
>  #endif /* bits/atomic.h */
> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> index fcef5659d4..0cdf37a38b 100644
> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> @@ -1,3 +1,11 @@
> +# nscd uses atomic_spin_nop which in turn requires cpu_relax
> +ifeq ($(subdir),nscd)
> +sysdep_routines += \
> +  cpu-relax \
> +  cpu-relax_generic \
> +  cpu-relax_zawrs
> +endif
> +
>  ifeq ($(subdir),string)
>  sysdep_routines += \
>    memcpy \
> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c b/sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c
> new file mode 100644
> index 0000000000..5aeb120e21
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c
> @@ -0,0 +1,39 @@
> +/* Multiple versions of cpu-relax.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +# include <ifunc-init.h>
> +# include <riscv-ifunc.h>
> +# include <sys/hwprobe.h>
> +
> +void __cpu_relax (void);
> +extern __typeof (__cpu_relax) __cpu_relax_generic attribute_hidden;
> +extern __typeof (__cpu_relax) __cpu_relax_zawrs attribute_hidden;
> +
> +static inline __typeof (__cpu_relax) *
> +select_cpu_relax_ifunc (uint64_t dl_hwcap, __riscv_hwprobe_t hwprobe_func)
> +{
> +  unsigned long long int v;
> +  if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_IMA_EXT_0, &v) == 0
> +      && (v & RISCV_HWPROBE_EXT_ZAWRS))
> +    return __cpu_relax_zawrs;
> +
> +  return __cpu_relax_generic;
> +}
> +
> +riscv_libc_ifunc (__cpu_relax, select_cpu_relax_ifunc);
> diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> index 9f806d7a9e..9c7a8c2e1f 100644
> --- a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> @@ -20,24 +20,48 @@
>  #include <string.h>
>  #include <sys/hwprobe.h>
>
> +#define ARRAY_SIZE(A) (sizeof (A) / sizeof ((A)[0]))
> +
> +void __cpu_relax (void);
> +
>  size_t
>  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  			size_t max)
>  {
>    size_t i = max;
> +  struct riscv_hwprobe pairs[] = {
> +    { .key = RISCV_HWPROBE_KEY_IMA_EXT_0 },
> +    { .key = RISCV_HWPROBE_KEY_CPUPERF_0 },
> +  };
>
>    bool fast_unaligned = false;
> +  bool has_zawrs = false;
> +
> +  if (__riscv_hwprobe (pairs, ARRAY_SIZE (pairs), 0, NULL, 0) == 0)
> +    {
> +      struct riscv_hwprobe *pair;
>
> -  struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };
> -  if (__riscv_hwprobe (&pair, 1, 0, NULL, 0) == 0
> -      && (pair.value & RISCV_HWPROBE_MISALIGNED_MASK)
> +      /* RISCV_HWPROBE_KEY_IMA_EXT_0  */
> +      pair = &pairs[0];
> +      if (pair->value & RISCV_HWPROBE_EXT_ZAWRS)
> +	has_zawrs = true;
> +
> +      /* RISCV_HWPROBE_KEY_CPUPERF_0  */
> +      pair = &pairs[1];
> +      if ((pair->value & RISCV_HWPROBE_MISALIGNED_MASK)
>            == RISCV_HWPROBE_MISALIGNED_FAST)
> -    fast_unaligned = true;
> +	fast_unaligned = true;
> +    }
>
>    IFUNC_IMPL (i, name, memcpy,
>  	      IFUNC_IMPL_ADD (array, i, memcpy, fast_unaligned,
>  			      __memcpy_noalignment)
>  	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
>
> +  IFUNC_IMPL (i, name, __cpu_relax,
> +	      IFUNC_IMPL_ADD (array, i, __cpu_relax, has_zawrs,
> +			      __cpu_relax_zawrs)
> +	      IFUNC_IMPL_ADD (array, i, __cpu_relax, 1, __cpu_relax_generic))
> +
>    return 0;
>  }
  
Vineet Gupta April 18, 2024, 8:03 p.m. UTC | #2
Hi Christoph,

My 2 cents.

On 4/18/24 10:17, Palmer Dabbelt wrote:
> On Thu, 18 Apr 2024 02:46:35 PDT (-0700), christoph.muellner@vrull.eu wrote:
>> The macro atomic_spin_nop can be used to implement arch-specific
>> CPU yielding that is used in busy loops (e.g. in pthread_spin_lock).
>> This patch introduces an ifunc-based implementation for RISC-V,
>> that uses Zihintpause's PAUSE instruction for that matter (as PAUSE
>> is a HINT instruction there is not dependency to Zihintpause at
>> runtime).  Further, we test for Zawrs via hwprobe() and if found
>> we use WRS.STO instead of PAUSE.

[snip]

>> +ENTRY (__cpu_relax_generic)
>> +	/* While we can use the `pause` instruction without
>> +	   the need of Zihintpause (because it is a HINT instruction),
>> +	   we still have to enable Zihintpause for the assembler.  */
>> +	pause
>> +	ret
>> +END (__cpu_relax_generic)

[snip]

>> +.option push
>> +.option arch, +zawrs
>> +ENTRY (__cpu_relax_zawrs)
>> +	wrs.sto
> This has the same forward progress/eventual success violation as the 
> code you sent for GCC and Linux does.  It doesn't really matter if the 
> user of the reservation is in a builtin, an asm block, or a function.  
> The compiler just doesn't know about those reservation rules and isn't 
> going to generate code that follows them.

So, this routine maps to atomic_spin_nop () called in generic code in a
bunch of places.

The easiest case is nptl/pthread_spin_lock.c which looks something like this

__pthread_spin_lock (pthread_spinlock_t *lock)
...
   do
    {
      atomic_spin_nop ();
      val = atomic_load_relaxed (lock);
    }
  while (val != 0);

This works fine for a PAUSE based implementation which doesn't need a
prior reservation, but WRS does and even if both the load and spin are
inlined, reservation happens after WRS which is wrong.

So I think before we can implement this optimization for riscv, we need
a generic glibc change to replace of atomic_spin_nop () with a variant
which also takes the lock/memory under consideration.
The fallback implementation of atomic_load_and_spin_if_cond() will be
what the above loop does. For arches that do implement the API, based on
ISA semantics, they can choose to ignore the mem arg completely (RISC-V
PAUSE based implementation) or implement a little loop which explicitly
takes reservation on the mem/lock and calls WRS.

Does that make sense ?

We just need to sort out the 2nd use of atomic_spin_nop in
nptl/pthread_mutex_lock.c where this conversion might not be straight fwd.
The recent back off back spin looping is getting in the way of obvious
solution, however for discussion sake if we ignore it, the code looks
like this:

PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
...
       int cnt=0, max_cnt = something;
       do
        {
            if (cnt ++>= max_cnt)
            {
                 LLL_MUTEX_LOCK (mutex);
                 break;
            }
            atomic_spin_nop ();
       }
       while (LLL_MUTEX_READ_LOCK (mutex) != 0 || LLL_MUTEX_TRYLOCK
(mutex) != 0);

And this seems like it can be converted to a
atomic_load_and_spin_if_cond(mutex, !=0 ) kind of API.

Let me know if you want me to spin this - pun intended ;-)

-Vineet


> That said, as far as I can tell the same grey area in the WRS spec that 
> I pointed out during one of the Linux reviews still exists.   So if 
> there's some hardware that has a more generous set of LR->WRS rules than 
> the ISA-required LR->SC rules that's fine, we just need to know what the 
> hardware actually is so we can write down the rules and then stick to 
> them -- it's just a performance thing for WRS, so having weaker rules 
> seems entirely plausible.
>
> I don't know of any hardware with WRS, but sorry if I missed one.  Do 
> you have something publicly availiable that behaves this way?
  
Christoph Müllner April 18, 2024, 8:19 p.m. UTC | #3
On Thu, Apr 18, 2024 at 7:17 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Thu, 18 Apr 2024 02:46:35 PDT (-0700), christoph.muellner@vrull.eu wrote:
> > The macro atomic_spin_nop can be used to implement arch-specific
> > CPU yielding that is used in busy loops (e.g. in pthread_spin_lock).
> > This patch introduces an ifunc-based implementation for RISC-V,
> > that uses Zihintpause's PAUSE instruction for that matter (as PAUSE
> > is a HINT instruction there is not dependency to Zihintpause at
> > runtime).  Further, we test for Zawrs via hwprobe() and if found
> > we use WRS.STO instead of PAUSE.
> >
> > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>
> > ---
> >  sysdeps/riscv/multiarch/cpu-relax_generic.S   | 31 +++++++++++++++
> >  sysdeps/riscv/multiarch/cpu-relax_zawrs.S     | 28 +++++++++++++
> >  .../unix/sysv/linux/riscv/atomic-machine.h    |  3 ++
> >  .../unix/sysv/linux/riscv/multiarch/Makefile  |  8 ++++
> >  .../sysv/linux/riscv/multiarch/cpu-relax.c    | 39 +++++++++++++++++++
> >  .../linux/riscv/multiarch/ifunc-impl-list.c   | 32 +++++++++++++--
> >  6 files changed, 137 insertions(+), 4 deletions(-)
> >  create mode 100644 sysdeps/riscv/multiarch/cpu-relax_generic.S
> >  create mode 100644 sysdeps/riscv/multiarch/cpu-relax_zawrs.S
> >  create mode 100644 sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c
> >
> > diff --git a/sysdeps/riscv/multiarch/cpu-relax_generic.S b/sysdeps/riscv/multiarch/cpu-relax_generic.S
> > new file mode 100644
> > index 0000000000..d3ccfdce84
> > --- /dev/null
> > +++ b/sysdeps/riscv/multiarch/cpu-relax_generic.S
> > @@ -0,0 +1,31 @@
> > +/* CPU strand yielding for busy loops.  RISC-V version.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <sysdep.h>
> > +#include <sys/asm.h>
> > +
> > +.option push
> > +.option arch, +zihintpause
> > +ENTRY (__cpu_relax_generic)
> > +     /* While we can use the `pause` instruction without
> > +        the need of Zihintpause (because it is a HINT instruction),
> > +        we still have to enable Zihintpause for the assembler.  */
> > +     pause
> > +     ret
> > +END (__cpu_relax_generic)
> > +.option pop
> > diff --git a/sysdeps/riscv/multiarch/cpu-relax_zawrs.S b/sysdeps/riscv/multiarch/cpu-relax_zawrs.S
> > new file mode 100644
> > index 0000000000..6d27b354df
> > --- /dev/null
> > +++ b/sysdeps/riscv/multiarch/cpu-relax_zawrs.S
> > @@ -0,0 +1,28 @@
> > +/* CPU strand yielding for busy loops.  RISC-V version with Zawrs.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <sysdep.h>
> > +#include <sys/asm.h>
> > +
> > +.option push
> > +.option arch, +zawrs
> > +ENTRY (__cpu_relax_zawrs)
> > +     wrs.sto
>
> This has the same forward progress/eventual success violation as the
> code you sent for GCC and Linux does.  It doesn't really matter if the
> user of the reservation is in a builtin, an asm block, or a function.
> The compiler just doesn't know about those reservation rules and isn't
> going to generate code that follows them.

I see. The main issue is that we don't have a valid reservation when
calling WRS,
so the whole use of Zawrs instructions is pointless.
So the only way to move Zawrs forward would be to adjust the locking routines
(introducing new primitives that have to be implemented for all architectures).


>
> That said, as far as I can tell the same grey area in the WRS spec that
> I pointed out during one of the Linux reviews still exists.   So if
> there's some hardware that has a more generous set of LR->WRS rules than
> the ISA-required LR->SC rules that's fine, we just need to know what the
> hardware actually is so we can write down the rules and then stick to
> them -- it's just a performance thing for WRS, so having weaker rules
> seems entirely plausible.
>
> I don't know of any hardware with WRS, but sorry if I missed one.  Do
> you have something publicly availiable that behaves this way?
>
> > +     ret
> > +END (__cpu_relax_zawrs)
> > +.option pop
> > diff --git a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> > index c1c9d949a0..02b9b7a421 100644
> > --- a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> > +++ b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
> > @@ -178,4 +178,7 @@
> >  # error "ISAs that do not subsume the A extension are not supported"
> >  #endif /* !__riscv_atomic */
> >
> > +extern void __cpu_relax (void);
> > +#define atomic_spin_nop() __cpu_relax()
> > +
> >  #endif /* bits/atomic.h */
> > diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> > index fcef5659d4..0cdf37a38b 100644
> > --- a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> > +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
> > @@ -1,3 +1,11 @@
> > +# nscd uses atomic_spin_nop which in turn requires cpu_relax
> > +ifeq ($(subdir),nscd)
> > +sysdep_routines += \
> > +  cpu-relax \
> > +  cpu-relax_generic \
> > +  cpu-relax_zawrs
> > +endif
> > +
> >  ifeq ($(subdir),string)
> >  sysdep_routines += \
> >    memcpy \
> > diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c b/sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c
> > new file mode 100644
> > index 0000000000..5aeb120e21
> > --- /dev/null
> > +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c
> > @@ -0,0 +1,39 @@
> > +/* Multiple versions of cpu-relax.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2024 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +# include <ifunc-init.h>
> > +# include <riscv-ifunc.h>
> > +# include <sys/hwprobe.h>
> > +
> > +void __cpu_relax (void);
> > +extern __typeof (__cpu_relax) __cpu_relax_generic attribute_hidden;
> > +extern __typeof (__cpu_relax) __cpu_relax_zawrs attribute_hidden;
> > +
> > +static inline __typeof (__cpu_relax) *
> > +select_cpu_relax_ifunc (uint64_t dl_hwcap, __riscv_hwprobe_t hwprobe_func)
> > +{
> > +  unsigned long long int v;
> > +  if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_IMA_EXT_0, &v) == 0
> > +      && (v & RISCV_HWPROBE_EXT_ZAWRS))
> > +    return __cpu_relax_zawrs;
> > +
> > +  return __cpu_relax_generic;
> > +}
> > +
> > +riscv_libc_ifunc (__cpu_relax, select_cpu_relax_ifunc);
> > diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> > index 9f806d7a9e..9c7a8c2e1f 100644
> > --- a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
> > @@ -20,24 +20,48 @@
> >  #include <string.h>
> >  #include <sys/hwprobe.h>
> >
> > +#define ARRAY_SIZE(A) (sizeof (A) / sizeof ((A)[0]))
> > +
> > +void __cpu_relax (void);
> > +
> >  size_t
> >  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                       size_t max)
> >  {
> >    size_t i = max;
> > +  struct riscv_hwprobe pairs[] = {
> > +    { .key = RISCV_HWPROBE_KEY_IMA_EXT_0 },
> > +    { .key = RISCV_HWPROBE_KEY_CPUPERF_0 },
> > +  };
> >
> >    bool fast_unaligned = false;
> > +  bool has_zawrs = false;
> > +
> > +  if (__riscv_hwprobe (pairs, ARRAY_SIZE (pairs), 0, NULL, 0) == 0)
> > +    {
> > +      struct riscv_hwprobe *pair;
> >
> > -  struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };
> > -  if (__riscv_hwprobe (&pair, 1, 0, NULL, 0) == 0
> > -      && (pair.value & RISCV_HWPROBE_MISALIGNED_MASK)
> > +      /* RISCV_HWPROBE_KEY_IMA_EXT_0  */
> > +      pair = &pairs[0];
> > +      if (pair->value & RISCV_HWPROBE_EXT_ZAWRS)
> > +     has_zawrs = true;
> > +
> > +      /* RISCV_HWPROBE_KEY_CPUPERF_0  */
> > +      pair = &pairs[1];
> > +      if ((pair->value & RISCV_HWPROBE_MISALIGNED_MASK)
> >            == RISCV_HWPROBE_MISALIGNED_FAST)
> > -    fast_unaligned = true;
> > +     fast_unaligned = true;
> > +    }
> >
> >    IFUNC_IMPL (i, name, memcpy,
> >             IFUNC_IMPL_ADD (array, i, memcpy, fast_unaligned,
> >                             __memcpy_noalignment)
> >             IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
> >
> > +  IFUNC_IMPL (i, name, __cpu_relax,
> > +           IFUNC_IMPL_ADD (array, i, __cpu_relax, has_zawrs,
> > +                           __cpu_relax_zawrs)
> > +           IFUNC_IMPL_ADD (array, i, __cpu_relax, 1, __cpu_relax_generic))
> > +
> >    return 0;
> >  }
  
Christoph Müllner April 18, 2024, 8:25 p.m. UTC | #4
On Thu, Apr 18, 2024 at 10:03 PM Vineet Gupta <vineetg@rivosinc.com> wrote:
>
> Hi Christoph,
>
> My 2 cents.
>
> On 4/18/24 10:17, Palmer Dabbelt wrote:
> > On Thu, 18 Apr 2024 02:46:35 PDT (-0700), christoph.muellner@vrull.eu wrote:
> >> The macro atomic_spin_nop can be used to implement arch-specific
> >> CPU yielding that is used in busy loops (e.g. in pthread_spin_lock).
> >> This patch introduces an ifunc-based implementation for RISC-V,
> >> that uses Zihintpause's PAUSE instruction for that matter (as PAUSE
> >> is a HINT instruction there is not dependency to Zihintpause at
> >> runtime).  Further, we test for Zawrs via hwprobe() and if found
> >> we use WRS.STO instead of PAUSE.
>
> [snip]
>
> >> +ENTRY (__cpu_relax_generic)
> >> +    /* While we can use the `pause` instruction without
> >> +       the need of Zihintpause (because it is a HINT instruction),
> >> +       we still have to enable Zihintpause for the assembler.  */
> >> +    pause
> >> +    ret
> >> +END (__cpu_relax_generic)
>
> [snip]
>
> >> +.option push
> >> +.option arch, +zawrs
> >> +ENTRY (__cpu_relax_zawrs)
> >> +    wrs.sto
> > This has the same forward progress/eventual success violation as the
> > code you sent for GCC and Linux does.  It doesn't really matter if the
> > user of the reservation is in a builtin, an asm block, or a function.
> > The compiler just doesn't know about those reservation rules and isn't
> > going to generate code that follows them.
>
> So, this routine maps to atomic_spin_nop () called in generic code in a
> bunch of places.
>
> The easiest case is nptl/pthread_spin_lock.c which looks something like this
>
> __pthread_spin_lock (pthread_spinlock_t *lock)
> ...
>    do
>     {
>       atomic_spin_nop ();
>       val = atomic_load_relaxed (lock);
>     }
>   while (val != 0);
>
> This works fine for a PAUSE based implementation which doesn't need a
> prior reservation, but WRS does and even if both the load and spin are
> inlined, reservation happens after WRS which is wrong.
>
> So I think before we can implement this optimization for riscv, we need
> a generic glibc change to replace of atomic_spin_nop () with a variant
> which also takes the lock/memory under consideration.
> The fallback implementation of atomic_load_and_spin_if_cond() will be
> what the above loop does. For arches that do implement the API, based on
> ISA semantics, they can choose to ignore the mem arg completely (RISC-V
> PAUSE based implementation) or implement a little loop which explicitly
> takes reservation on the mem/lock and calls WRS.
>
> Does that make sense ?

I did something similar in the Linux patch a while ago.
Here I wanted to avoid changes in arch-independent code.
But you are right, wrs.sto does not make sense without a
valid reservation.

>
> We just need to sort out the 2nd use of atomic_spin_nop in
> nptl/pthread_mutex_lock.c where this conversion might not be straight fwd.
> The recent back off back spin looping is getting in the way of obvious
> solution, however for discussion sake if we ignore it, the code looks
> like this:
>
> PTHREAD_MUTEX_LOCK (pthread_mutex_t *mutex)
> ...
>        int cnt=0, max_cnt = something;
>        do
>         {
>             if (cnt ++>= max_cnt)
>             {
>                  LLL_MUTEX_LOCK (mutex);
>                  break;
>             }
>             atomic_spin_nop ();
>        }
>        while (LLL_MUTEX_READ_LOCK (mutex) != 0 || LLL_MUTEX_TRYLOCK
> (mutex) != 0);
>
> And this seems like it can be converted to a
> atomic_load_and_spin_if_cond(mutex, !=0 ) kind of API.
>
> Let me know if you want me to spin this - pun intended ;-)
>
> -Vineet
>
>
> > That said, as far as I can tell the same grey area in the WRS spec that
> > I pointed out during one of the Linux reviews still exists.   So if
> > there's some hardware that has a more generous set of LR->WRS rules than
> > the ISA-required LR->SC rules that's fine, we just need to know what the
> > hardware actually is so we can write down the rules and then stick to
> > them -- it's just a performance thing for WRS, so having weaker rules
> > seems entirely plausible.
> >
> > I don't know of any hardware with WRS, but sorry if I missed one.  Do
> > you have something publicly availiable that behaves this way?
  
Vineet Gupta April 18, 2024, 8:36 p.m. UTC | #5
On 4/18/24 13:19, Christoph Müllner wrote:
>> This has the same forward progress/eventual success violation as the
>> code you sent for GCC and Linux does.  It doesn't really matter if the
>> user of the reservation is in a builtin, an asm block, or a function.
>> The compiler just doesn't know about those reservation rules and isn't
>> going to generate code that follows them.
> I see. The main issue is that we don't have a valid reservation when
> calling WRS,
> so the whole use of Zawrs instructions is pointless.
> So the only way to move Zawrs forward would be to adjust the locking routines
> (introducing new primitives that have to be implemented for all architectures).

Not explicitly anyways - the generic fallback will take care of every
arch, except SPARC/x86 which implement atomic_spin_nop with pause-like
semantics, but even they don't need to change at all if we implement new
API atomic_load_and_spin_if_cond_whatever ()  in terms of existing
atomic_spin_nop ()

-Vineet
  
Palmer Dabbelt April 18, 2024, 9:10 p.m. UTC | #6
On Thu, 18 Apr 2024 13:36:32 PDT (-0700), Vineet Gupta wrote:
> On 4/18/24 13:19, Christoph Müllner wrote:
>>> This has the same forward progress/eventual success violation as the
>>> code you sent for GCC and Linux does.  It doesn't really matter if the
>>> user of the reservation is in a builtin, an asm block, or a function.
>>> The compiler just doesn't know about those reservation rules and isn't
>>> going to generate code that follows them.
>> I see. The main issue is that we don't have a valid reservation when
>> calling WRS,
>> so the whole use of Zawrs instructions is pointless.
>> So the only way to move Zawrs forward would be to adjust the locking routines
>> (introducing new primitives that have to be implemented for all architectures).
>
> Not explicitly anyways - the generic fallback will take care of every
> arch, except SPARC/x86 which implement atomic_spin_nop wth pause like
> semantics, but even they don't need to change at all if we implement new
> API atomic_load_and_spin_if_cond_whatever ()  in terms of existing
> atomic_spin_nop ()

Ya, sounds about right.

IIRC I just hooked some of the LLL macros when doing the POC/estimates, 
but it was at least a year ago so I forget exactly how it all fit 
together.  Whatever it is, they end up with basically the same 
load->cond->lr->beq->{wrs,sc} patterns as a bunch of the Linux routines 
should have (sort of a test-and-test-and-set type pattern, or how arm64 
does the load-and-cmpxchg routines).

Adding Charlie and Drew, as we were talking about the Linux side of 
things recently.

> -Vineet
  
Andrew Jones April 19, 2024, 2:09 p.m. UTC | #7
On Thu, Apr 18, 2024 at 02:10:42PM -0700, Palmer Dabbelt wrote:
> On Thu, 18 Apr 2024 13:36:32 PDT (-0700), Vineet Gupta wrote:
> > On 4/18/24 13:19, Christoph Müllner wrote:
> > > > This has the same forward progress/eventual success violation as the
> > > > code you sent for GCC and Linux does.  It doesn't really matter if the
> > > > user of the reservation is in a builtin, an asm block, or a function.
> > > > The compiler just doesn't know about those reservation rules and isn't
> > > > going to generate code that follows them.
> > > I see. The main issue is that we don't have a valid reservation when
> > > calling WRS,
> > > so the whole use of Zawrs instructions is pointless.
> > > So the only way to move Zawrs forward would be to adjust the locking routines
> > > (introducing new primitives that have to be implemented for all architectures).
> > 
> > Not explicitly anyways - the generic fallback will take care of every
> > arch, except SPARC/x86 which implement atomic_spin_nop wth pause like
> > semantics, but even they don't need to change at all if we implement new
> > API atomic_load_and_spin_if_cond_whatever ()  in terms of existing
> > atomic_spin_nop ()
> 
> Ya, sounds about right.
> 
> IIRC I just hooked some of the LLL macros when doing the POC/estimates, but
> it was at least a year ago so I forget exactly how it all fit together.
> Whatever it is, they end up with basically the same
> load->cond->lr->beq->{wrs,sc} patterns as a bunch of the Linux routines
> should have (sort of a test-and-test-and-set type pattern, or how arm64 does
> the load-and-cmpxchg routines).
> 
> Adding Charlie and Drew, as we were talking about the Linux side of things
> recently.

I've just now posted another version [1].

[1] https://lore.kernel.org/all/20240419135321.70781-8-ajones@ventanamicro.com/

Thanks,
drew
  

Patch

diff --git a/sysdeps/riscv/multiarch/cpu-relax_generic.S b/sysdeps/riscv/multiarch/cpu-relax_generic.S
new file mode 100644
index 0000000000..d3ccfdce84
--- /dev/null
+++ b/sysdeps/riscv/multiarch/cpu-relax_generic.S
@@ -0,0 +1,31 @@ 
+/* CPU strand yielding for busy loops.  RISC-V version.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+.option push
+.option arch, +zihintpause
+ENTRY (__cpu_relax_generic)
+	/* The PAUSE instruction is encoded as a HINT, so it executes safely
+	   even on cores without Zihintpause support.  However, we still have
+	   to enable Zihintpause so the assembler accepts the mnemonic.  */
+	pause
+	ret
+END (__cpu_relax_generic)
+.option pop
diff --git a/sysdeps/riscv/multiarch/cpu-relax_zawrs.S b/sysdeps/riscv/multiarch/cpu-relax_zawrs.S
new file mode 100644
index 0000000000..6d27b354df
--- /dev/null
+++ b/sysdeps/riscv/multiarch/cpu-relax_zawrs.S
@@ -0,0 +1,28 @@ 
+/* CPU strand yielding for busy loops.  RISC-V version with Zawrs.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+.option push
+.option arch, +zawrs
+ENTRY (__cpu_relax_zawrs)
+	wrs.sto
+	ret
+END (__cpu_relax_zawrs)
+.option pop
diff --git a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
index c1c9d949a0..02b9b7a421 100644
--- a/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
+++ b/sysdeps/unix/sysv/linux/riscv/atomic-machine.h
@@ -178,4 +178,7 @@ 
 # error "ISAs that do not subsume the A extension are not supported"
 #endif /* !__riscv_atomic */
 
+extern void __cpu_relax (void);
+#define atomic_spin_nop() __cpu_relax()
+
 #endif /* bits/atomic.h */
diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
index fcef5659d4..0cdf37a38b 100644
--- a/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
+++ b/sysdeps/unix/sysv/linux/riscv/multiarch/Makefile
@@ -1,3 +1,11 @@ 
+# nscd uses atomic_spin_nop, which in turn requires cpu-relax.
+ifeq ($(subdir),nscd)
+sysdep_routines += \
+  cpu-relax \
+  cpu-relax_generic \
+  cpu-relax_zawrs
+endif
+
 ifeq ($(subdir),string)
 sysdep_routines += \
   memcpy \
diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c b/sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c
new file mode 100644
index 0000000000..5aeb120e21
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/riscv/multiarch/cpu-relax.c
@@ -0,0 +1,39 @@ 
+/* Multiple versions of cpu-relax.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+# include <ifunc-init.h>
+# include <riscv-ifunc.h>
+# include <sys/hwprobe.h>
+
+void __cpu_relax (void);
+extern __typeof (__cpu_relax) __cpu_relax_generic attribute_hidden;
+extern __typeof (__cpu_relax) __cpu_relax_zawrs attribute_hidden;
+
+static inline __typeof (__cpu_relax) *
+select_cpu_relax_ifunc (uint64_t dl_hwcap, __riscv_hwprobe_t hwprobe_func)
+{
+  unsigned long long int v;
+  if (__riscv_hwprobe_one (hwprobe_func, RISCV_HWPROBE_KEY_IMA_EXT_0, &v) == 0
+      && (v & RISCV_HWPROBE_EXT_ZAWRS))
+    return __cpu_relax_zawrs;
+
+  return __cpu_relax_generic;
+}
+
+riscv_libc_ifunc (__cpu_relax, select_cpu_relax_ifunc);
diff --git a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
index 9f806d7a9e..9c7a8c2e1f 100644
--- a/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
+++ b/sysdeps/unix/sysv/linux/riscv/multiarch/ifunc-impl-list.c
@@ -20,24 +20,48 @@ 
 #include <string.h>
 #include <sys/hwprobe.h>
 
+#define ARRAY_SIZE(A) (sizeof (A) / sizeof ((A)[0]))
+
+void __cpu_relax (void);
+
 size_t
 __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			size_t max)
 {
   size_t i = max;
+  struct riscv_hwprobe pairs[] = {
+    { .key = RISCV_HWPROBE_KEY_IMA_EXT_0 },
+    { .key = RISCV_HWPROBE_KEY_CPUPERF_0 },
+  };
 
   bool fast_unaligned = false;
+  bool has_zawrs = false;
+
+  if (__riscv_hwprobe (pairs, ARRAY_SIZE (pairs), 0, NULL, 0) == 0)
+    {
+      struct riscv_hwprobe *pair;
 
-  struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };
-  if (__riscv_hwprobe (&pair, 1, 0, NULL, 0) == 0
-      && (pair.value & RISCV_HWPROBE_MISALIGNED_MASK)
+      /* RISCV_HWPROBE_KEY_IMA_EXT_0  */
+      pair = &pairs[0];
+      if (pair->value & RISCV_HWPROBE_EXT_ZAWRS)
+	has_zawrs = true;
+
+      /* RISCV_HWPROBE_KEY_CPUPERF_0  */
+      pair = &pairs[1];
+      if ((pair->value & RISCV_HWPROBE_MISALIGNED_MASK)
           == RISCV_HWPROBE_MISALIGNED_FAST)
-    fast_unaligned = true;
+	fast_unaligned = true;
+    }
 
   IFUNC_IMPL (i, name, memcpy,
 	      IFUNC_IMPL_ADD (array, i, memcpy, fast_unaligned,
 			      __memcpy_noalignment)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
 
+  IFUNC_IMPL (i, name, __cpu_relax,
+	      IFUNC_IMPL_ADD (array, i, __cpu_relax, has_zawrs,
+			      __cpu_relax_zawrs)
+	      IFUNC_IMPL_ADD (array, i, __cpu_relax, 1, __cpu_relax_generic))
+
   return 0;
 }