[committed,nvptx] Handle pre-sm_7x shared atomic store using atomic exchange

Message ID 20220210091309.GA20692@delia.home
State Committed
Commit 3e7d4e82dc9fecb051e9ac422c312b26206d5ecd
Headers
Series [committed,nvptx] Handle pre-sm_7x shared atomic store using atomic exchange |

Commit Message

Tom de Vries Feb. 10, 2022, 9:13 a.m. UTC
  Hi,

The ptx isa specifies (for pre-sm_7x) that atomic operations on shared memory
locations do not guarantee atomicity with respect to normal store instructions
to the same address.

This can be fixed by:
- inserting barriers between normal stores and atomic operations to a common
  address
- using atom.exch to store to locations accessed by other atomic operations.

It's not clearly spelled out which barriers are needed, and a barrier seem more
expensive than atomic exchange.

Implement the pre-sm_7x shared atomic store using atomic exchange.

That includes stores using generic addressing, since those may also point to
shared memory.

Tested on x86-64 with nvptx accelerator.

Committed to trunk.

Thanks,
- Tom

[nvptx] Handle pre-sm_7x shared atomic store using atomic exchange

gcc/ChangeLog:

2022-02-02  Tom de Vries  <tdevries@suse.de>

	* config/nvptx/nvptx-protos.h (nvptx_mem_maybe_shared_p): Declare.
	* config/nvptx/nvptx.cc (nvptx_mem_data_area): New static function.
	(nvptx_mem_maybe_shared_p): New function.
	* config/nvptx/nvptx.md (define_expand "atomic_store<mode>"): New
	define_expand.

gcc/testsuite/ChangeLog:

2022-02-02  Tom de Vries  <tdevries@suse.de>

	* gcc.target/nvptx/atomic-store-1.c: New test.
	* gcc.target/nvptx/atomic-store-3.c: New test.
	* gcc.target/nvptx/stack-atomics-run.c: Update.

---
 gcc/config/nvptx/nvptx-protos.h                    |  1 +
 gcc/config/nvptx/nvptx.cc                          | 22 ++++++++++++++++
 gcc/config/nvptx/nvptx.md                          | 30 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/nvptx/atomic-store-1.c    | 26 +++++++++++++++++++
 gcc/testsuite/gcc.target/nvptx/atomic-store-3.c    | 25 ++++++++++++++++++
 gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c |  6 ++++-
 6 files changed, 109 insertions(+), 1 deletion(-)
  

Comments

Thomas Schwinge Feb. 15, 2022, 7:34 a.m. UTC | #1
Hi Tom!

For my understanding:

On 2022-02-10T10:13:10+0100, Tom de Vries via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> The ptx isa specifies (for pre-sm_7x) that atomic operations on shared memory
> locations do not guarantee atomicity with respect to normal store instructions
> to the same address.
>
> This can be fixed by:
> - inserting barriers between normal stores and atomic operations to a common
>   address
> - using atom.exch to store to locations accessed by other atomic operations.
>
> It's not clearly spelled out which barriers are needed, and a barrier seem more
> expensive than atomic exchange.
>
> Implement the pre-sm_7x shared atomic store using atomic exchange.
>
> That includes stores using generic addressing, since those may also point to
> shared memory.

It is expected that this changes, for example (similar elsewhere)
'nvptx-none/libatomic/store_4_.o', to use (a) 'atom.exch' (with a new
dummy register allocated) and yet (b) 'membar.sys' remains before as well
as after (a):

     .visible .func __atomic_store_4 (.param .u64 %in_ar0, .param .u32 %in_ar1, .param .u32 %in_ar2)
     {
     .reg .u64 %ar0;
     ld.param.u64 %ar0,[%in_ar0];
     .reg .u32 %ar1;
     ld.param.u32 %ar1,[%in_ar1];
     .reg .u32 %ar2;
     ld.param.u32 %ar2,[%in_ar2];
     .reg .u64 %r22;
     .reg .u32 %r23;
    +.reg .u32 %r25;
     mov.u64 %r22,%ar0;
     mov.u32 %r23,%ar1;
     .loc 2 39 5
     membar.sys;
    -st.u32 [%r22],%r23;
    +atom.exch.b32 %r25,[%r22],%r23;
     membar.sys;
     .loc 2 40 1
     ret;
     }

And, for example (similar elsewhere) 'nvptx-none/libgomp/single.o', a
'ld' that previously was issued after 'membar.sys' now moves before that
one:

     .visible .func (.param .u64 %value_out) GOMP_single_copy_start
     {
    [...]
     .reg .u64 %r33;
     .reg .u64 %r34;
    [...]
     .reg .pred %r51;
     .reg .u64 %r50;
     .reg .u64 %r52;
    [...]
     ld.shared.u64 %r50,[nvptx_thrs];
     add.u64 %r33,%r50,%r49;
     .loc 2 1317 32
     ld.u64 %r34,[%r33+32];
     .loc 2 1317 6
     setp.eq.u64 %r51,%r34,0;
     @ %r51 bra $L6;
     .loc 4 66 3
    -membar.sys;
     ld.u64 %r52,[%r33+24];
    -st.u64 [%r34+80],%r52;
    +membar.sys;
    +atom.exch.b64 %r53,[%r34+80],%r52;
     $L6:
    [...]

(But I see there is another 'ld.u64 %r34,[%r33+32]' that previously also
already was issued before the 'membar.sys'.)


Grüße
 Thomas


> [nvptx] Handle pre-sm_7x shared atomic store using atomic exchange
>
> gcc/ChangeLog:
>
> 2022-02-02  Tom de Vries  <tdevries@suse.de>
>
>       * config/nvptx/nvptx-protos.h (nvptx_mem_maybe_shared_p): Declare.
>       * config/nvptx/nvptx.cc (nvptx_mem_data_area): New static function.
>       (nvptx_mem_maybe_shared_p): New function.
>       * config/nvptx/nvptx.md (define_expand "atomic_store<mode>"): New
>       define_expand.
>
> gcc/testsuite/ChangeLog:
>
> 2022-02-02  Tom de Vries  <tdevries@suse.de>
>
>       * gcc.target/nvptx/atomic-store-1.c: New test.
>       * gcc.target/nvptx/atomic-store-3.c: New test.
>       * gcc.target/nvptx/stack-atomics-run.c: Update.
>
> ---
>  gcc/config/nvptx/nvptx-protos.h                    |  1 +
>  gcc/config/nvptx/nvptx.cc                          | 22 ++++++++++++++++
>  gcc/config/nvptx/nvptx.md                          | 30 ++++++++++++++++++++++
>  gcc/testsuite/gcc.target/nvptx/atomic-store-1.c    | 26 +++++++++++++++++++
>  gcc/testsuite/gcc.target/nvptx/atomic-store-3.c    | 25 ++++++++++++++++++
>  gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c |  6 ++++-
>  6 files changed, 109 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h
> index a846e341917..0bf9af406a2 100644
> --- a/gcc/config/nvptx/nvptx-protos.h
> +++ b/gcc/config/nvptx/nvptx-protos.h
> @@ -60,5 +60,6 @@ extern const char *nvptx_output_simt_exit (rtx);
>  extern const char *nvptx_output_red_partition (rtx, rtx);
>  extern const char *nvptx_output_atomic_insn (const char *, rtx *, int, int);
>  extern bool nvptx_mem_local_p (rtx);
> +extern bool nvptx_mem_maybe_shared_p (const_rtx);
>  #endif
>  #endif
> diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc
> index 1b0227a2c31..5b26c0f4c7d 100644
> --- a/gcc/config/nvptx/nvptx.cc
> +++ b/gcc/config/nvptx/nvptx.cc
> @@ -76,6 +76,7 @@
>  #include "intl.h"
>  #include "opts.h"
>  #include "tree-pretty-print.h"
> +#include "rtl-iter.h"
>
>  /* This file should be included last.  */
>  #include "target-def.h"
> @@ -2787,6 +2788,27 @@ nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
>    nvptx_print_address_operand (file, addr, mode);
>  }
>
> +static nvptx_data_area
> +nvptx_mem_data_area (const_rtx x)
> +{
> +  gcc_assert (GET_CODE (x) == MEM);
> +
> +  const_rtx addr = XEXP (x, 0);
> +  subrtx_iterator::array_type array;
> +  FOR_EACH_SUBRTX (iter, array, addr, ALL)
> +    if (SYMBOL_REF_P (*iter))
> +      return SYMBOL_DATA_AREA (*iter);
> +
> +  return DATA_AREA_GENERIC;
> +}
> +
> +bool
> +nvptx_mem_maybe_shared_p (const_rtx x)
> +{
> +  nvptx_data_area area = nvptx_mem_data_area (x);
> +  return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC;
> +}
> +
>  /* Print an operand, X, to FILE, with an optional modifier in CODE.
>
>     Meaning of CODE:
> diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
> index cced68e0d4a..1a283b41922 100644
> --- a/gcc/config/nvptx/nvptx.md
> +++ b/gcc/config/nvptx/nvptx.md
> @@ -2051,6 +2051,36 @@ (define_insn "atomic_exchange<mode>"
>    }
>    [(set_attr "atomic" "true")])
>
> +(define_expand "atomic_store<mode>"
> +  [(match_operand:SDIM 0 "memory_operand" "=m")                ;; memory
> +   (match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri")  ;; input
> +   (match_operand:SI 2 "const_int_operand")]           ;; model
> +  ""
> +{
> +  struct address_info info;
> +  decompose_mem_address (&info, operands[0]);
> +  if (info.base != NULL && REG_P (*info.base)
> +      && REGNO_PTR_FRAME_P (REGNO (*info.base)))
> +    {
> +      emit_insn (gen_mov<mode> (operands[0], operands[1]));
> +      DONE;
> +    }
> +
> +  if (TARGET_SM70)
> +    /* Fall back to expand_atomic_store.  */
> +    FAIL;
> +
> +  bool maybe_shared_p = nvptx_mem_maybe_shared_p (operands[0]);
> +  if (!maybe_shared_p)
> +    /* Fall back to expand_atomic_store.  */
> +    FAIL;
> +
> +  rtx tmpreg = gen_reg_rtx (<MODE>mode);
> +  emit_insn (gen_atomic_exchange<mode> (tmpreg, operands[0], operands[1],
> +                                     operands[2]));
> +  DONE;
> +})
> +
>  (define_insn "atomic_fetch_add<mode>"
>    [(set (match_operand:SDIM 1 "memory_operand" "+m")
>       (unspec_volatile:SDIM
> diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c
> new file mode 100644
> index 00000000000..cee3815eda5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c
> @@ -0,0 +1,26 @@
> +/* Test the atomic store expansion for sm <= sm_6x targets,
> +   shared state space.  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-misa=sm_53" } */
> +
> +enum memmodel
> +{
> +  MEMMODEL_SEQ_CST = 5
> +};
> +
> +unsigned int u32 __attribute__((shared));
> +unsigned long long int u64 __attribute__((shared));
> +
> +int
> +main()
> +{
> +  __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST);
> +  __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "atom.shared.exch.b32" 1 } } */
> +/* { dg-final { scan-assembler-times "atom.shared.exch.b64" 1 } } */
> +/* { dg-final { scan-assembler-times "membar.cta" 4 } } */
> diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c
> new file mode 100644
> index 00000000000..cc0264f2b06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c
> @@ -0,0 +1,25 @@
> +/* Test the atomic store expansion, global state space.  */
> +
> +/* { dg-do compile } */
> +/* { dg-additional-options "-Wno-long-long" } */
> +
> +enum memmodel
> +{
> +  MEMMODEL_SEQ_CST = 5
> +};
> +
> +unsigned int u32;
> +unsigned long long int u64;
> +
> +int
> +main()
> +{
> +  __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST);
> +  __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-assembler-times "st.global.u32" 1 } } */
> +/* { dg-final { scan-assembler-times "st.global.u64" 1 } } */
> +/* { dg-final { scan-assembler-times "membar.sys" 4 } } */
> diff --git a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
> index ad8e2f842fb..cd045964dfe 100644
> --- a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
> +++ b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
> @@ -39,6 +39,10 @@ main (void)
>    if (b != 1)
>      __builtin_abort ();
>
> -
> +  a = 1;
> +  __atomic_store_n (&a, 0, MEMMODEL_RELAXED);
> +  if (a != 0)
> +    __builtin_abort ();
> +
>    return 0;
>  }
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  
Tom de Vries Feb. 15, 2022, 10:52 a.m. UTC | #2
On 2/15/22 08:34, Thomas Schwinge wrote:
> Hi Tom!
> 
> For my understanding:
> 
> On 2022-02-10T10:13:10+0100, Tom de Vries via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>> The ptx isa specifies (for pre-sm_7x) that atomic operations on shared memory
>> locations do not guarantee atomicity with respect to normal store instructions
>> to the same address.
>>
>> This can be fixed by:
>> - inserting barriers between normal stores and atomic operations to a common
>>    address
>> - using atom.exch to store to locations accessed by other atomic operations.
>>
>> It's not clearly spelled out which barriers are needed, and a barrier seem more
>> expensive than atomic exchange.
>>
>> Implement the pre-sm_7x shared atomic store using atomic exchange.
>>
>> That includes stores using generic addressing, since those may also point to
>> shared memory.
> 
> It is expected that this changes, for example (similar elsewhere)
> 'nvptx-none/libatomic/store_4_.o', to use (a) 'atom.exch' (with a new
> dummy register allocated)

Yes.

We could do slightly better by emitting that as:
...
membar.sys;
{  .reg .u32 dummy;
    atom.exch.b32 dummy,[%r22],%r23;
}
membar.sys;
...
which could improve register pressure.

I just wrote a patch for that (attached, ftr), but using a scratch 
register, and it seems that this similar code:
...
void
foo (U_4 *mptr, U_4 newval)
{
   __atomic_exchange_n (mptr, newval, 5);
}
...
still maps to:
...
.reg .u32 %r24;
membar.sys;
atom.exch.b32 %r24,[%r22],%r23;
membar.sys;
...
so that may not be the right way to do it.

 > and yet (b) 'membar.sys' remains before as well
> as after (a):
> 
>       .visible .func __atomic_store_4 (.param .u64 %in_ar0, .param .u32 %in_ar1, .param .u32 %in_ar2)
>       {
>       .reg .u64 %ar0;
>       ld.param.u64 %ar0,[%in_ar0];
>       .reg .u32 %ar1;
>       ld.param.u32 %ar1,[%in_ar1];
>       .reg .u32 %ar2;
>       ld.param.u32 %ar2,[%in_ar2];
>       .reg .u64 %r22;
>       .reg .u32 %r23;
>      +.reg .u32 %r25;
>       mov.u64 %r22,%ar0;
>       mov.u32 %r23,%ar1;
>       .loc 2 39 5
>       membar.sys;
>      -st.u32 [%r22],%r23;
>      +atom.exch.b32 %r25,[%r22],%r23;
>       membar.sys;
>       .loc 2 40 1
>       ret;
>       }
> 

Yes.  Previously, the barriers where there to guarantee atomicity of the 
store, as well as to implement the memory model MEMMODEL_SEQ_CST.

Now the atomic exchange garantees atomicity of the store and the 
barriers implement the memory model.

In this particular instance (we're using the most severe barrier both 
before and after), we might be able to get away with using a store 
instead of atomic exchange.  But this is a fallback, and we don't care 
so much about performance (besides it's already pretty bad with two 
times membar.sys).

And I rather have atomicity guarantueed by using an atomic insn than by 
using barriers:  there's less room for error by the driver JIT.

> And, for example (similar elsewhere) 'nvptx-none/libgomp/single.o', a
> 'ld' that previously was issued after 'membar.sys' now moves before that
> one:
> 
>       .visible .func (.param .u64 %value_out) GOMP_single_copy_start
>       {
>      [...]
>       .reg .u64 %r33;
>       .reg .u64 %r34;
>      [...]
>       .reg .pred %r51;
>       .reg .u64 %r50;
>       .reg .u64 %r52;
>      [...]
>       ld.shared.u64 %r50,[nvptx_thrs];
>       add.u64 %r33,%r50,%r49;
>       .loc 2 1317 32
>       ld.u64 %r34,[%r33+32];
>       .loc 2 1317 6
>       setp.eq.u64 %r51,%r34,0;
>       @ %r51 bra $L6;
>       .loc 4 66 3
>      -membar.sys;
>       ld.u64 %r52,[%r33+24];
>      -st.u64 [%r34+80],%r52;
>      +membar.sys;
>      +atom.exch.b64 %r53,[%r34+80],%r52;
>       $L6:
>      [...]
> 
> (But I see there is another 'ld.u64 %r34,[%r33+32]' that previously also
> already was issued before the 'membar.sys'.)
> 

Before, an atomic store resulted in two seperate insns, a membar and a 
regular store, and so it could be that they became separated.  F.i., if 
alias analysis could prove that the load did not alias with the store, 
it could move the load inbetween.

Now the memmodel is an operand of the insn, and the membar is only 
implemented when printing the insn, so nothing can come inbetween anymore.

Note btw that we had here this pattern:
...
       membar.sys;
       st.u64 [%r34+80],%r52;
...
and there's no memory barrier after the store to guarantee atomicity 
with respect to atomic operations afterwards.

Thanks,
- Tom

> 
> Grüße
>   Thomas
> 
> 
>> [nvptx] Handle pre-sm_7x shared atomic store using atomic exchange
>>
>> gcc/ChangeLog:
>>
>> 2022-02-02  Tom de Vries  <tdevries@suse.de>
>>
>>        * config/nvptx/nvptx-protos.h (nvptx_mem_maybe_shared_p): Declare.
>>        * config/nvptx/nvptx.cc (nvptx_mem_data_area): New static function.
>>        (nvptx_mem_maybe_shared_p): New function.
>>        * config/nvptx/nvptx.md (define_expand "atomic_store<mode>"): New
>>        define_expand.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2022-02-02  Tom de Vries  <tdevries@suse.de>
>>
>>        * gcc.target/nvptx/atomic-store-1.c: New test.
>>        * gcc.target/nvptx/atomic-store-3.c: New test.
>>        * gcc.target/nvptx/stack-atomics-run.c: Update.
>>
>> ---
>>   gcc/config/nvptx/nvptx-protos.h                    |  1 +
>>   gcc/config/nvptx/nvptx.cc                          | 22 ++++++++++++++++
>>   gcc/config/nvptx/nvptx.md                          | 30 ++++++++++++++++++++++
>>   gcc/testsuite/gcc.target/nvptx/atomic-store-1.c    | 26 +++++++++++++++++++
>>   gcc/testsuite/gcc.target/nvptx/atomic-store-3.c    | 25 ++++++++++++++++++
>>   gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c |  6 ++++-
>>   6 files changed, 109 insertions(+), 1 deletion(-)
>>
>> diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h
>> index a846e341917..0bf9af406a2 100644
>> --- a/gcc/config/nvptx/nvptx-protos.h
>> +++ b/gcc/config/nvptx/nvptx-protos.h
>> @@ -60,5 +60,6 @@ extern const char *nvptx_output_simt_exit (rtx);
>>   extern const char *nvptx_output_red_partition (rtx, rtx);
>>   extern const char *nvptx_output_atomic_insn (const char *, rtx *, int, int);
>>   extern bool nvptx_mem_local_p (rtx);
>> +extern bool nvptx_mem_maybe_shared_p (const_rtx);
>>   #endif
>>   #endif
>> diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc
>> index 1b0227a2c31..5b26c0f4c7d 100644
>> --- a/gcc/config/nvptx/nvptx.cc
>> +++ b/gcc/config/nvptx/nvptx.cc
>> @@ -76,6 +76,7 @@
>>   #include "intl.h"
>>   #include "opts.h"
>>   #include "tree-pretty-print.h"
>> +#include "rtl-iter.h"
>>
>>   /* This file should be included last.  */
>>   #include "target-def.h"
>> @@ -2787,6 +2788,27 @@ nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
>>     nvptx_print_address_operand (file, addr, mode);
>>   }
>>
>> +static nvptx_data_area
>> +nvptx_mem_data_area (const_rtx x)
>> +{
>> +  gcc_assert (GET_CODE (x) == MEM);
>> +
>> +  const_rtx addr = XEXP (x, 0);
>> +  subrtx_iterator::array_type array;
>> +  FOR_EACH_SUBRTX (iter, array, addr, ALL)
>> +    if (SYMBOL_REF_P (*iter))
>> +      return SYMBOL_DATA_AREA (*iter);
>> +
>> +  return DATA_AREA_GENERIC;
>> +}
>> +
>> +bool
>> +nvptx_mem_maybe_shared_p (const_rtx x)
>> +{
>> +  nvptx_data_area area = nvptx_mem_data_area (x);
>> +  return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC;
>> +}
>> +
>>   /* Print an operand, X, to FILE, with an optional modifier in CODE.
>>
>>      Meaning of CODE:
>> diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
>> index cced68e0d4a..1a283b41922 100644
>> --- a/gcc/config/nvptx/nvptx.md
>> +++ b/gcc/config/nvptx/nvptx.md
>> @@ -2051,6 +2051,36 @@ (define_insn "atomic_exchange<mode>"
>>     }
>>     [(set_attr "atomic" "true")])
>>
>> +(define_expand "atomic_store<mode>"
>> +  [(match_operand:SDIM 0 "memory_operand" "=m")                ;; memory
>> +   (match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri")  ;; input
>> +   (match_operand:SI 2 "const_int_operand")]           ;; model
>> +  ""
>> +{
>> +  struct address_info info;
>> +  decompose_mem_address (&info, operands[0]);
>> +  if (info.base != NULL && REG_P (*info.base)
>> +      && REGNO_PTR_FRAME_P (REGNO (*info.base)))
>> +    {
>> +      emit_insn (gen_mov<mode> (operands[0], operands[1]));
>> +      DONE;
>> +    }
>> +
>> +  if (TARGET_SM70)
>> +    /* Fall back to expand_atomic_store.  */
>> +    FAIL;
>> +
>> +  bool maybe_shared_p = nvptx_mem_maybe_shared_p (operands[0]);
>> +  if (!maybe_shared_p)
>> +    /* Fall back to expand_atomic_store.  */
>> +    FAIL;
>> +
>> +  rtx tmpreg = gen_reg_rtx (<MODE>mode);
>> +  emit_insn (gen_atomic_exchange<mode> (tmpreg, operands[0], operands[1],
>> +                                     operands[2]));
>> +  DONE;
>> +})
>> +
>>   (define_insn "atomic_fetch_add<mode>"
>>     [(set (match_operand:SDIM 1 "memory_operand" "+m")
>>        (unspec_volatile:SDIM
>> diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c
>> new file mode 100644
>> index 00000000000..cee3815eda5
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c
>> @@ -0,0 +1,26 @@
>> +/* Test the atomic store expansion for sm <= sm_6x targets,
>> +   shared state space.  */
>> +
>> +/* { dg-do compile } */
>> +/* { dg-options "-misa=sm_53" } */
>> +
>> +enum memmodel
>> +{
>> +  MEMMODEL_SEQ_CST = 5
>> +};
>> +
>> +unsigned int u32 __attribute__((shared));
>> +unsigned long long int u64 __attribute__((shared));
>> +
>> +int
>> +main()
>> +{
>> +  __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST);
>> +  __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST);
>> +
>> +  return 0;
>> +}
>> +
>> +/* { dg-final { scan-assembler-times "atom.shared.exch.b32" 1 } } */
>> +/* { dg-final { scan-assembler-times "atom.shared.exch.b64" 1 } } */
>> +/* { dg-final { scan-assembler-times "membar.cta" 4 } } */
>> diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c
>> new file mode 100644
>> index 00000000000..cc0264f2b06
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c
>> @@ -0,0 +1,25 @@
>> +/* Test the atomic store expansion, global state space.  */
>> +
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-Wno-long-long" } */
>> +
>> +enum memmodel
>> +{
>> +  MEMMODEL_SEQ_CST = 5
>> +};
>> +
>> +unsigned int u32;
>> +unsigned long long int u64;
>> +
>> +int
>> +main()
>> +{
>> +  __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST);
>> +  __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST);
>> +
>> +  return 0;
>> +}
>> +
>> +/* { dg-final { scan-assembler-times "st.global.u32" 1 } } */
>> +/* { dg-final { scan-assembler-times "st.global.u64" 1 } } */
>> +/* { dg-final { scan-assembler-times "membar.sys" 4 } } */
>> diff --git a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
>> index ad8e2f842fb..cd045964dfe 100644
>> --- a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
>> +++ b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
>> @@ -39,6 +39,10 @@ main (void)
>>     if (b != 1)
>>       __builtin_abort ();
>>
>> -
>> +  a = 1;
>> +  __atomic_store_n (&a, 0, MEMMODEL_RELAXED);
>> +  if (a != 0)
>> +    __builtin_abort ();
>> +
>>     return 0;
>>   }
> -----------------
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  
Thomas Schwinge Feb. 15, 2022, 11:08 a.m. UTC | #3
Hi Tom!

On 2022-02-15T11:52:29+0100, Tom de Vries <tdevries@suse.de> wrote:
> On 2/15/22 08:34, Thomas Schwinge wrote:
>> For my understanding:

Thanks for your explanations!

>> It is expected that this changes, for example (similar elsewhere)
>> 'nvptx-none/libatomic/store_4_.o', to use (a) 'atom.exch' (with a new
>> dummy register allocated)
>
> Yes.
>
> We could do slightly better by emitting that as:
> ...
> membar.sys;
> {  .reg .u32 dummy;
>     atom.exch.b32 dummy,[%r22],%r23;
> }
> membar.sys;
> ...
> which could improve register pressure.

Or, use the "bit bucket" operand -- assuming that's applicable here?

    atom.exch.b32 _,[%r22],%r23;

For example, see PTX 3.1, 8.2 "PTX Instructions".


Grüße
 Thomas


> I just wrote a patch for that (attached, ftr), but using a scratch
> register, and it seems that this similar code:
> ...
> void
> foo (U_4 *mptr, U_4 newval)
> {
>    __atomic_exchange_n (mptr, newval, 5);
> }
> ...
> still maps to:
> ...
> .reg .u32 %r24;
> membar.sys;
> atom.exch.b32 %r24,[%r22],%r23;
> membar.sys;
> ...
> so that may not be the right way to do it.

> --- a/gcc/config/nvptx/nvptx.md
> +++ b/gcc/config/nvptx/nvptx.md
> @@ -89,9 +89,10 @@
>  ;; only literal constants, which differ from the generic ones, which
>  ;; permit subregs and symbolc constants (as appropriate)
>  (define_predicate "nvptx_register_operand"
> -  (match_code "reg")
> +  (match_code "reg,scratch")
>  {
> -  return register_operand (op, mode);
> +  return (register_operand (op, mode)
> +       || (GET_CODE (op) == SCRATCH && GET_MODE (op) == mode));
>  })
>
>  (define_predicate "nvptx_nonimmediate_operand"
> @@ -188,7 +189,7 @@
>
>  (define_constraint "R"
>    "A pseudo register."
> -  (match_code "reg"))
> +  (ior (match_code "reg") (match_code "scratch")))
>
>  (define_constraint "Ia"
>    "Any integer constant."
> @@ -2036,6 +2037,7 @@
>       (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input
>    ""
>    {
> +    bool scratch_dst_p = GET_CODE (operands[0]) == SCRATCH;
>      if (nvptx_mem_local_p (operands[1]))
>        {
>       output_asm_insn ("{", NULL);
> @@ -2047,7 +2049,9 @@
>       return "";
>        }
>      const char *t
> -      = "%.\tatom%A1.exch.b%T0\t%0, %1, %2;";
> +      = (scratch_dst_p
> +      ? "{ .reg.u%T0 dummy; %.\tatom%A1.exch.b%T0\t dummy,%1, %2; }"
> +      : "%.\tatom%A1.exch.b%T0\t%0, %1, %2;");
>      return nvptx_output_atomic_insn (t, operands, 1, 3);
>    }
>    [(set_attr "atomic" "true")])
> @@ -2079,7 +2083,7 @@
>      /* Fall back to expand_atomic_store.  */
>      FAIL;
>
> -  rtx tmpreg = gen_reg_rtx (<MODE>mode);
> +  rtx tmpreg = gen_rtx_SCRATCH (<MODE>mode);
>    emit_insn (gen_atomic_exchange<mode> (tmpreg, operands[0], operands[1],
>                                       operands[2]));
>    DONE;
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  
Tom de Vries Feb. 15, 2022, 12:08 p.m. UTC | #4
On 2/15/22 12:08, Thomas Schwinge wrote:
> Hi Tom!
> 
> On 2022-02-15T11:52:29+0100, Tom de Vries <tdevries@suse.de> wrote:
>> On 2/15/22 08:34, Thomas Schwinge wrote:
>>> For my understanding:
> 
> Thanks for your explanations!
> 
>>> It is expected that this changes, for example (similar elsewhere)
>>> 'nvptx-none/libatomic/store_4_.o', to use (a) 'atom.exch' (with a new
>>> dummy register allocated)
>>
>> Yes.
>>
>> We could do slightly better by emitting that as:
>> ...
>> membar.sys;
>> {  .reg .u32 dummy;
>>      atom.exch.b32 dummy,[%r22],%r23;
>> }
>> membar.sys;
>> ...
>> which could improve register pressure.
> 
> Or, use the "bit bucket" operand -- assuming that's applicable here?
> 
>      atom.exch.b32 _,[%r22],%r23;
> 

Ah, yes, that looks exactly what we need, thanks for pointing that out :)

I'll try to create a patch for this.

Thanks,
- Tom

> For example, see PTX 3.1, 8.2 "PTX Instructions".
> 
> 
> Grüße
>   Thomas
> 
> 
>> I just wrote a patch for that (attached, ftr), but using a scratch
>> register, and it seems that this similar code:
>> ...
>> void
>> foo (U_4 *mptr, U_4 newval)
>> {
>>     __atomic_exchange_n (mptr, newval, 5);
>> }
>> ...
>> still maps to:
>> ...
>> .reg .u32 %r24;
>> membar.sys;
>> atom.exch.b32 %r24,[%r22],%r23;
>> membar.sys;
>> ...
>> so that may not be the right way to do it.
> 
>> --- a/gcc/config/nvptx/nvptx.md
>> +++ b/gcc/config/nvptx/nvptx.md
>> @@ -89,9 +89,10 @@
>>   ;; only literal constants, which differ from the generic ones, which
>>   ;; permit subregs and symbolc constants (as appropriate)
>>   (define_predicate "nvptx_register_operand"
>> -  (match_code "reg")
>> +  (match_code "reg,scratch")
>>   {
>> -  return register_operand (op, mode);
>> +  return (register_operand (op, mode)
>> +       || (GET_CODE (op) == SCRATCH && GET_MODE (op) == mode));
>>   })
>>
>>   (define_predicate "nvptx_nonimmediate_operand"
>> @@ -188,7 +189,7 @@
>>
>>   (define_constraint "R"
>>     "A pseudo register."
>> -  (match_code "reg"))
>> +  (ior (match_code "reg") (match_code "scratch")))
>>
>>   (define_constraint "Ia"
>>     "Any integer constant."
>> @@ -2036,6 +2037,7 @@
>>        (match_operand:SDIM 2 "nvptx_nonmemory_operand" "Ri"))] ;; input
>>     ""
>>     {
>> +    bool scratch_dst_p = GET_CODE (operands[0]) == SCRATCH;
>>       if (nvptx_mem_local_p (operands[1]))
>>         {
>>        output_asm_insn ("{", NULL);
>> @@ -2047,7 +2049,9 @@
>>        return "";
>>         }
>>       const char *t
>> -      = "%.\tatom%A1.exch.b%T0\t%0, %1, %2;";
>> +      = (scratch_dst_p
>> +      ? "{ .reg.u%T0 dummy; %.\tatom%A1.exch.b%T0\t dummy,%1, %2; }"
>> +      : "%.\tatom%A1.exch.b%T0\t%0, %1, %2;");
>>       return nvptx_output_atomic_insn (t, operands, 1, 3);
>>     }
>>     [(set_attr "atomic" "true")])
>> @@ -2079,7 +2083,7 @@
>>       /* Fall back to expand_atomic_store.  */
>>       FAIL;
>>
>> -  rtx tmpreg = gen_reg_rtx (<MODE>mode);
>> +  rtx tmpreg = gen_rtx_SCRATCH (<MODE>mode);
>>     emit_insn (gen_atomic_exchange<mode> (tmpreg, operands[0], operands[1],
>>                                        operands[2]));
>>     DONE;
> -----------------
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  

Patch

diff --git a/gcc/config/nvptx/nvptx-protos.h b/gcc/config/nvptx/nvptx-protos.h
index a846e341917..0bf9af406a2 100644
--- a/gcc/config/nvptx/nvptx-protos.h
+++ b/gcc/config/nvptx/nvptx-protos.h
@@ -60,5 +60,6 @@  extern const char *nvptx_output_simt_exit (rtx);
 extern const char *nvptx_output_red_partition (rtx, rtx);
 extern const char *nvptx_output_atomic_insn (const char *, rtx *, int, int);
 extern bool nvptx_mem_local_p (rtx);
+extern bool nvptx_mem_maybe_shared_p (const_rtx);
 #endif
 #endif
diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc
index 1b0227a2c31..5b26c0f4c7d 100644
--- a/gcc/config/nvptx/nvptx.cc
+++ b/gcc/config/nvptx/nvptx.cc
@@ -76,6 +76,7 @@ 
 #include "intl.h"
 #include "opts.h"
 #include "tree-pretty-print.h"
+#include "rtl-iter.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -2787,6 +2788,27 @@  nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
   nvptx_print_address_operand (file, addr, mode);
 }
 
+static nvptx_data_area
+nvptx_mem_data_area (const_rtx x)
+{
+  gcc_assert (GET_CODE (x) == MEM);
+
+  const_rtx addr = XEXP (x, 0);
+  subrtx_iterator::array_type array;
+  FOR_EACH_SUBRTX (iter, array, addr, ALL)
+    if (SYMBOL_REF_P (*iter))
+      return SYMBOL_DATA_AREA (*iter);
+
+  return DATA_AREA_GENERIC;
+}
+
+bool
+nvptx_mem_maybe_shared_p (const_rtx x)
+{
+  nvptx_data_area area = nvptx_mem_data_area (x);
+  return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC;
+}
+
 /* Print an operand, X, to FILE, with an optional modifier in CODE.
 
    Meaning of CODE:
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index cced68e0d4a..1a283b41922 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -2051,6 +2051,36 @@  (define_insn "atomic_exchange<mode>"
   }
   [(set_attr "atomic" "true")])
 
+(define_expand "atomic_store<mode>"
+  [(match_operand:SDIM 0 "memory_operand" "=m")		  ;; memory
+   (match_operand:SDIM 1 "nvptx_nonmemory_operand" "Ri")  ;; input
+   (match_operand:SI 2 "const_int_operand")]		  ;; model
+  ""
+{
+  struct address_info info;
+  decompose_mem_address (&info, operands[0]);
+  if (info.base != NULL && REG_P (*info.base)
+      && REGNO_PTR_FRAME_P (REGNO (*info.base)))
+    {
+      emit_insn (gen_mov<mode> (operands[0], operands[1]));
+      DONE;
+    }
+
+  if (TARGET_SM70)
+    /* Fall back to expand_atomic_store.  */
+    FAIL;
+
+  bool maybe_shared_p = nvptx_mem_maybe_shared_p (operands[0]);
+  if (!maybe_shared_p)
+    /* Fall back to expand_atomic_store.  */
+    FAIL;
+
+  rtx tmpreg = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_atomic_exchange<mode> (tmpreg, operands[0], operands[1],
+					operands[2]));
+  DONE;
+})
+
 (define_insn "atomic_fetch_add<mode>"
   [(set (match_operand:SDIM 1 "memory_operand" "+m")
 	(unspec_volatile:SDIM
diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c
new file mode 100644
index 00000000000..cee3815eda5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-1.c
@@ -0,0 +1,26 @@ 
+/* Test the atomic store expansion for sm <= sm_6x targets,
+   shared state space.  */
+
+/* { dg-do compile } */
+/* { dg-options "-misa=sm_53" } */
+
+enum memmodel
+{
+  MEMMODEL_SEQ_CST = 5
+};
+
+unsigned int u32 __attribute__((shared));
+unsigned long long int u64 __attribute__((shared));
+
+int
+main()
+{
+  __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST);
+  __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST);
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "atom.shared.exch.b32" 1 } } */
+/* { dg-final { scan-assembler-times "atom.shared.exch.b64" 1 } } */
+/* { dg-final { scan-assembler-times "membar.cta" 4 } } */
diff --git a/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c
new file mode 100644
index 00000000000..cc0264f2b06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/atomic-store-3.c
@@ -0,0 +1,25 @@ 
+/* Test the atomic store expansion, global state space.  */
+
+/* { dg-do compile } */
+/* { dg-additional-options "-Wno-long-long" } */
+
+enum memmodel
+{
+  MEMMODEL_SEQ_CST = 5
+};
+
+unsigned int u32;
+unsigned long long int u64;
+
+int
+main()
+{
+  __atomic_store_n (&u32, 0, MEMMODEL_SEQ_CST);
+  __atomic_store_n (&u64, 0, MEMMODEL_SEQ_CST);
+
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "st.global.u32" 1 } } */
+/* { dg-final { scan-assembler-times "st.global.u64" 1 } } */
+/* { dg-final { scan-assembler-times "membar.sys" 4 } } */
diff --git a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
index ad8e2f842fb..cd045964dfe 100644
--- a/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
+++ b/gcc/testsuite/gcc.target/nvptx/stack-atomics-run.c
@@ -39,6 +39,10 @@  main (void)
   if (b != 1)
     __builtin_abort ();
 
-  
+  a = 1;
+  __atomic_store_n (&a, 0, MEMMODEL_RELAXED);
+  if (a != 0)
+    __builtin_abort ();
+
   return 0;
 }