libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation (was: [Patch] libgomp/nvptx: Prepare for reverse-offload callback handling)

Message ID 87r0ti9k3o.fsf@euler.schwinge.homeip.net
State Committed
Headers
Series libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation (was: [Patch] libgomp/nvptx: Prepare for reverse-offload callback handling) |

Commit Message

Thomas Schwinge March 21, 2023, 3:53 p.m. UTC
  Hi!

On 2022-08-26T11:07:28+0200, Tobias Burnus <tobias@codesourcery.com> wrote:
> This patch adds initial [OpenMP reverse offload] support for nvptx.

> CUDA does lockup when trying to copy data from the currently running
> stream; hence, a new stream is generated to do the memory copying.

As part of other work, where I had to touch those special code paths, I
found that we may reduce complexity a little bit "by using the existing
'goacc_asyncqueue' instead of re-coding parts of it".  OK to push
"libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation"
(still testing), see attached?


Grüße
 Thomas


-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  

Comments

Tobias Burnus April 28, 2023, 8:48 a.m. UTC | #1
Hi Thomas,

On 21.03.23 16:53, Thomas Schwinge wrote:
> On 2022-08-26T11:07:28+0200, Tobias Burnus <tobias@codesourcery.com>
> wrote:
>> This patch adds initial [OpenMP reverse offload] support for nvptx.
>> CUDA does lockup when trying to copy data from the currently running
>> stream; hence, a new stream is generated to do the memory copying.
> As part of other work, where I had to touch those special code paths, I
> found that we may reduce complexity a little bit "by using the existing
> 'goacc_asyncqueue' instead of re-coding parts of it".  OK to push
> "libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation"
> (still testing), see attached?

I don't think that just calling "exit (EXIT_FAILURE);" is the proper
way – I think that should be GOMP_PLUGIN_fatal in the plugin and
gomp_fatal in target.c.

Otherwise, it LGTM.

Tobias

> Subject: [PATCH] libgomp: Simplify OpenMP reverse offload host <-> device
>   memory copy implementation
>
> ... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.
>
> Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
> "libgomp/nvptx: Prepare for reverse-offload callback handling",
> and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
> "libgomp: Handle OpenMP's reverse offloads".
>
>       libgomp/
>       * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
>       'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
>       * libgomp.h (gomp_target_rev): Adjust.
>       * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
>       * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
>       * plugin/plugin-gcn.c (process_reverse_offload): Adjust.
>       * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
>       (rev_off_host_to_dev_cpy): Remove.
>       (GOMP_OFFLOAD_run): Adjust.
> ---
>   libgomp/libgomp-plugin.c      |   7 +--
>   libgomp/libgomp-plugin.h      |   6 +-
>   libgomp/libgomp.h             |   5 +-
>   libgomp/plugin/plugin-gcn.c   |   2 +-
>   libgomp/plugin/plugin-nvptx.c |  77 ++++++++++++++-----------
>   libgomp/target.c              | 102 +++++++++++++++-------------------
>   6 files changed, 96 insertions(+), 103 deletions(-)
>
> diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c
> index 27e7c94ba9b..d696515eeb6 100644
> --- a/libgomp/libgomp-plugin.c
> +++ b/libgomp/libgomp-plugin.c
> @@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
>   void
>   GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>                       uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
> -                     void (*dev_to_host_cpy) (void *, const void *, size_t,
> -                                              void *),
> -                     void (*host_to_dev_cpy) (void *, const void *, size_t,
> -                                              void *), void *token)
> +                     struct goacc_asyncqueue *aq)
>   {
>     gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
> -                dev_to_host_cpy, host_to_dev_cpy, token);
> +                aq);
>   }
> diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
> index 28267f75f7a..42ee3d6c7f9 100644
> --- a/libgomp/libgomp-plugin.h
> +++ b/libgomp/libgomp-plugin.h
> @@ -121,11 +121,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
>       __attribute__ ((noreturn, format (printf, 1, 2)));
>
>   extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
> -                                 uint64_t, int,
> -                                 void (*) (void *, const void *, size_t,
> -                                           void *),
> -                                 void (*) (void *, const void *, size_t,
> -                                           void *), void *);
> +                                 uint64_t, int, struct goacc_asyncqueue *);
>
>   /* Prototypes for functions implemented by libgomp plugins.  */
>   extern const char *GOMP_OFFLOAD_get_name (void);
> diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
> index ba8fe348aba..4d2bfab4b71 100644
> --- a/libgomp/libgomp.h
> +++ b/libgomp/libgomp.h
> @@ -1130,10 +1130,7 @@ extern void gomp_init_targets_once (void);
>   extern int gomp_get_num_devices (void);
>   extern bool gomp_target_task_fn (void *);
>   extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
> -                          int,
> -                          void (*) (void *, const void *, size_t, void *),
> -                          void (*) (void *, const void *, size_t, void *),
> -                          void *);
> +                          int, struct goacc_asyncqueue *);
>
>   /* Splay tree definitions.  */
>   typedef struct splay_tree_node_s *splay_tree_node;
> diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
> index 347803762eb..2181bf0235f 100644
> --- a/libgomp/plugin/plugin-gcn.c
> +++ b/libgomp/plugin/plugin-gcn.c
> @@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
>   {
>     int dev_num = dev_num64;
>     GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
> -                       NULL, NULL, NULL);
> +                       NULL);
>   }
>
>   /* Output any data written to console output from the kernel.  It is expected
> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
> index 5bd5a419e0e..4a710851ee5 100644
> --- a/libgomp/plugin/plugin-nvptx.c
> +++ b/libgomp/plugin/plugin-nvptx.c
> @@ -56,6 +56,7 @@
>   #include <unistd.h>
>   #include <assert.h>
>   #include <errno.h>
> +#include <stdlib.h>
>
>   /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
>      block to cache between kernel invocations.  For soft-stacks blocks bigger
> @@ -1739,11 +1740,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
>     return 1;
>   }
>
> -struct goacc_asyncqueue *
> -GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
> +static struct goacc_asyncqueue *
> +nvptx_goacc_asyncqueue_construct (unsigned int flags)
>   {
>     CUstream stream = NULL;
> -  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
> +  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
>
>     struct goacc_asyncqueue *aq
>       = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
> @@ -1751,14 +1752,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
>     return aq;
>   }
>
> -bool
> -GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
> +struct goacc_asyncqueue *
> +GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
> +{
> +  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
> +}
> +
> +static bool
> +nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
>   {
>     CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
>     free (aq);
>     return true;
>   }
>
> +bool
> +GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
> +{
> +  return nvptx_goacc_asyncqueue_destruct (aq);
> +}
> +
>   int
>   GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
>   {
> @@ -1772,13 +1785,19 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
>     return -1;
>   }
>
> -bool
> -GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
> +static bool
> +nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
>   {
>     CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
>     return true;
>   }
>
> +bool
> +GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
> +{
> +  return nvptx_goacc_asyncqueue_synchronize (aq);
> +}
> +
>   bool
>   GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
>                                     struct goacc_asyncqueue *aq2)
> @@ -2038,22 +2057,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
>   }
>
>
> -void
> -rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
> -                      CUstream stream)
> -{
> -  CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
> -  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
> -}
> -
> -void
> -rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
> -                      CUstream stream)
> -{
> -  CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
> -  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
> -}
> -
>   void
>   GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>   {
> @@ -2087,9 +2090,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>       }
>     nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
>
> -  size_t stack_size = nvptx_stacks_size ();
>     bool reverse_offload = ptx_dev->rev_data != NULL;
> -  CUstream copy_stream = NULL;
> +  struct goacc_asyncqueue *reverse_offload_aq = NULL;
> +  if (reverse_offload)
> +    {
> +      reverse_offload_aq
> +     = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
> +      if (!reverse_offload_aq)
> +     exit (EXIT_FAILURE);
> +    }
> +
> +  size_t stack_size = nvptx_stacks_size ();
>
>     pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
>     void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
> @@ -2103,8 +2114,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>     GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
>                    " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
>                    __FUNCTION__, fn_name, teams, threads);
> -  if (reverse_offload)
> -    CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
>     r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
>                        32, threads, 1, 0, NULL, NULL, config);
>     if (r != CUDA_SUCCESS)
> @@ -2127,17 +2136,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>           GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
>                                   rev_data->addrs, rev_data->sizes,
>                                   rev_data->kinds, rev_data->dev_num,
> -                                 rev_off_dev_to_host_cpy,
> -                                 rev_off_host_to_dev_cpy, copy_stream);
> -         CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
> +                                 reverse_offload_aq);
> +         if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
> +           exit (EXIT_FAILURE);
>           __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
>         }
>       usleep (1);
>         }
>     else
>       r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
> -  if (reverse_offload)
> -    CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
>     if (r == CUDA_ERROR_LAUNCH_FAILED)
>       GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
>                      maybe_abort_msg);
> @@ -2145,6 +2152,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>       GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
>
>     pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
> +
> +  if (reverse_offload)
> +    {
> +      if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
> +     exit (EXIT_FAILURE);
> +    }
>   }
>
>   /* TODO: Implement GOMP_OFFLOAD_async_run. */
> diff --git a/libgomp/target.c b/libgomp/target.c
> index 79ed64a5dc3..e02188cf7e1 100644
> --- a/libgomp/target.c
> +++ b/libgomp/target.c
> @@ -3312,9 +3312,7 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
>   void
>   gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>                uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
> -              void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
> -              void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
> -              void *token)
> +              struct goacc_asyncqueue *aq)
>   {
>     /* Return early if there is no offload code.  */
>     if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
> @@ -3356,26 +3354,17 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>         devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
>         sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
>         kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
> -      if (dev_to_host_cpy)
> -     {
> -       dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
> -                        mapnum * sizeof (uint64_t), token);
> -       dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
> -                        mapnum * sizeof (uint64_t), token);
> -       dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
> -                        mapnum * sizeof (unsigned short), token);
> -     }
> -      else
> -     {
> -       gomp_copy_dev2host (devicep, NULL, devaddrs,
> -                           (const void *) (uintptr_t) devaddrs_ptr,
> -                           mapnum * sizeof (uint64_t));
> -       gomp_copy_dev2host (devicep, NULL, sizes,
> -                           (const void *) (uintptr_t) sizes_ptr,
> -                           mapnum * sizeof (uint64_t));
> -       gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr,
> -                           mapnum * sizeof (unsigned short));
> -     }
> +      gomp_copy_dev2host (devicep, aq, devaddrs,
> +                       (const void *) (uintptr_t) devaddrs_ptr,
> +                       mapnum * sizeof (uint64_t));
> +      gomp_copy_dev2host (devicep, aq, sizes,
> +                       (const void *) (uintptr_t) sizes_ptr,
> +                       mapnum * sizeof (uint64_t));
> +      gomp_copy_dev2host (devicep, aq, kinds,
> +                       (const void *) (uintptr_t) kinds_ptr,
> +                       mapnum * sizeof (unsigned short));
> +      if (aq && !devicep->openacc.async.synchronize_func (aq))
> +     exit (EXIT_FAILURE);
>       }
>
>     size_t tgt_align = 0, tgt_size = 0;
> @@ -3402,13 +3391,14 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>           if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
>             memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
>                     (size_t) sizes[i]);
> -         else if (dev_to_host_cpy)
> -           dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
> -                            (size_t) sizes[i], token);
>           else
> -           gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
> -                               (void *) (uintptr_t) devaddrs[i],
> -                               (size_t) sizes[i]);
> +           {
> +             gomp_copy_dev2host (devicep, aq, tgt + tgt_size,
> +                                 (void *) (uintptr_t) devaddrs[i],
> +                                 (size_t) sizes[i]);
> +             if (aq && !devicep->openacc.async.synchronize_func (aq))
> +               exit (EXIT_FAILURE);
> +           }
>           devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
>           tgt_size = tgt_size + sizes[i];
>           if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
> @@ -3498,15 +3488,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>                   || kind == GOMP_MAP_ALWAYS_TO
>                   || kind == GOMP_MAP_ALWAYS_TOFROM)
>                 {
> -                 if (dev_to_host_cpy)
> -                   dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
> -                                    (void *) (uintptr_t) cdata[i].devaddr,
> -                                    sizes[i], token);
> -                 else
> -                   gomp_copy_dev2host (devicep, NULL,
> -                                       (void *) (uintptr_t) devaddrs[i],
> -                                       (void *) (uintptr_t) cdata[i].devaddr,
> -                                       sizes[i]);
> +                 gomp_copy_dev2host (devicep, aq,
> +                                     (void *) (uintptr_t) devaddrs[i],
> +                                     (void *) (uintptr_t) cdata[i].devaddr,
> +                                     sizes[i]);
> +                 if (aq && !devicep->openacc.async.synchronize_func (aq))
> +                   {
> +                     gomp_mutex_unlock (&devicep->lock);
> +                     exit (EXIT_FAILURE);
> +                   }
>                 }
>               if (struct_cpy)
>                 struct_cpy--;
> @@ -3573,15 +3563,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>                   devaddrs[i]
>                     = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
>                                                                  sizes[i]);
> -                 if (dev_to_host_cpy)
> -                   dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
> -                                    (void *) (uintptr_t) cdata[i].devaddr,
> -                                    sizes[i], token);
> -                 else
> -                   gomp_copy_dev2host (devicep, NULL,
> -                                       (void *) (uintptr_t) devaddrs[i],
> -                                       (void *) (uintptr_t) cdata[i].devaddr,
> -                                       sizes[i]);
> +                 gomp_copy_dev2host (devicep, aq,
> +                                     (void *) (uintptr_t) devaddrs[i],
> +                                     (void *) (uintptr_t) cdata[i].devaddr,
> +                                     sizes[i]);
> +                 if (aq && !devicep->openacc.async.synchronize_func (aq))
> +                   {
> +                     gomp_mutex_unlock (&devicep->lock);
> +                     exit (EXIT_FAILURE);
> +                   }
>                 }
>               for (j = i + 1; j < mapnum; j++)
>                 {
> @@ -3685,15 +3675,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>               /* FALLTHRU */
>             case GOMP_MAP_FROM:
>             case GOMP_MAP_TOFROM:
> -             if (copy && host_to_dev_cpy)
> -               host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
> -                                (void *) (uintptr_t) devaddrs[i],
> -                                sizes[i], token);
> -             else if (copy)
> -               gomp_copy_host2dev (devicep, NULL,
> -                                   (void *) (uintptr_t) cdata[i].devaddr,
> -                                   (void *) (uintptr_t) devaddrs[i],
> -                                   sizes[i], false, NULL);
> +             if (copy)
> +               {
> +                 gomp_copy_host2dev (devicep, aq,
> +                                     (void *) (uintptr_t) cdata[i].devaddr,
> +                                     (void *) (uintptr_t) devaddrs[i],
> +                                     sizes[i], false, NULL);
> +                 if (aq && !devicep->openacc.async.synchronize_func (aq))
> +                   exit (EXIT_FAILURE);
> +               }
>             default:
>               break;
>           }
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  
Thomas Schwinge April 28, 2023, 9:31 a.m. UTC | #2
Hi Tobias!

On 2023-04-28T10:48:31+0200, Tobias Burnus <tobias@codesourcery.com> wrote:
> On 21.03.23 16:53, Thomas Schwinge wrote:
>> On 2022-08-26T11:07:28+0200, Tobias Burnus <tobias@codesourcery.com>
>> wrote:
>>> This patch adds initial [OpenMP reverse offload] support for nvptx.
>>> CUDA does lockup when trying to copy data from the currently running
>>> stream; hence, a new stream is generated to do the memory copying.
>> As part of other work, where I had to touch those special code paths, I
>> found that we may reduce complexity a little bit "by using the existing
>> 'goacc_asyncqueue' instead of re-coding parts of it".  OK to push
>> "libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation"
>> (still testing), see attached?
>
> I don't think that just calling "exit (EXIT_FAILURE);" is the proper
> way

The point is, when we run into such an 'exit', we've already issued an
error (in the plugin, via 'GOMP_PLUGIN_error'), and then (to replicate
what 'GOMP_PLUGIN_fatal'/'gomp_fatal' do) we just need to 'exit' -- after
unlocking.  The latter is the reason why we can't just do this:

> – I think that should be GOMP_PLUGIN_fatal in the plugin and
> gomp_fatal in target.c.

..., because we'd dead-lock due to 'atexit' shutdown of devices etc.,
while still having devices etc. locked.

(Resolving all this differently/"properly" is for another day.)

> Otherwise, it LGTM.

Thanks.  OK to push then, given the rationale above?


Grüße
 Thomas


>> Subject: [PATCH] libgomp: Simplify OpenMP reverse offload host <-> device
>>   memory copy implementation
>>
>> ... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.
>>
>> Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
>> "libgomp/nvptx: Prepare for reverse-offload callback handling",
>> and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
>> "libgomp: Handle OpenMP's reverse offloads".
>>
>>       libgomp/
>>       * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
>>       'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
>>       * libgomp.h (gomp_target_rev): Adjust.
>>       * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
>>       * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
>>       * plugin/plugin-gcn.c (process_reverse_offload): Adjust.
>>       * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
>>       (rev_off_host_to_dev_cpy): Remove.
>>       (GOMP_OFFLOAD_run): Adjust.
>> ---
>>   libgomp/libgomp-plugin.c      |   7 +--
>>   libgomp/libgomp-plugin.h      |   6 +-
>>   libgomp/libgomp.h             |   5 +-
>>   libgomp/plugin/plugin-gcn.c   |   2 +-
>>   libgomp/plugin/plugin-nvptx.c |  77 ++++++++++++++-----------
>>   libgomp/target.c              | 102 +++++++++++++++-------------------
>>   6 files changed, 96 insertions(+), 103 deletions(-)
>>
>> diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c
>> index 27e7c94ba9b..d696515eeb6 100644
>> --- a/libgomp/libgomp-plugin.c
>> +++ b/libgomp/libgomp-plugin.c
>> @@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
>>   void
>>   GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>>                       uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
>> -                     void (*dev_to_host_cpy) (void *, const void *, size_t,
>> -                                              void *),
>> -                     void (*host_to_dev_cpy) (void *, const void *, size_t,
>> -                                              void *), void *token)
>> +                     struct goacc_asyncqueue *aq)
>>   {
>>     gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
>> -                dev_to_host_cpy, host_to_dev_cpy, token);
>> +                aq);
>>   }
>> diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
>> index 28267f75f7a..42ee3d6c7f9 100644
>> --- a/libgomp/libgomp-plugin.h
>> +++ b/libgomp/libgomp-plugin.h
>> @@ -121,11 +121,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
>>       __attribute__ ((noreturn, format (printf, 1, 2)));
>>
>>   extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
>> -                                 uint64_t, int,
>> -                                 void (*) (void *, const void *, size_t,
>> -                                           void *),
>> -                                 void (*) (void *, const void *, size_t,
>> -                                           void *), void *);
>> +                                 uint64_t, int, struct goacc_asyncqueue *);
>>
>>   /* Prototypes for functions implemented by libgomp plugins.  */
>>   extern const char *GOMP_OFFLOAD_get_name (void);
>> diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
>> index ba8fe348aba..4d2bfab4b71 100644
>> --- a/libgomp/libgomp.h
>> +++ b/libgomp/libgomp.h
>> @@ -1130,10 +1130,7 @@ extern void gomp_init_targets_once (void);
>>   extern int gomp_get_num_devices (void);
>>   extern bool gomp_target_task_fn (void *);
>>   extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
>> -                          int,
>> -                          void (*) (void *, const void *, size_t, void *),
>> -                          void (*) (void *, const void *, size_t, void *),
>> -                          void *);
>> +                          int, struct goacc_asyncqueue *);
>>
>>   /* Splay tree definitions.  */
>>   typedef struct splay_tree_node_s *splay_tree_node;
>> diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
>> index 347803762eb..2181bf0235f 100644
>> --- a/libgomp/plugin/plugin-gcn.c
>> +++ b/libgomp/plugin/plugin-gcn.c
>> @@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
>>   {
>>     int dev_num = dev_num64;
>>     GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
>> -                       NULL, NULL, NULL);
>> +                       NULL);
>>   }
>>
>>   /* Output any data written to console output from the kernel.  It is expected
>> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
>> index 5bd5a419e0e..4a710851ee5 100644
>> --- a/libgomp/plugin/plugin-nvptx.c
>> +++ b/libgomp/plugin/plugin-nvptx.c
>> @@ -56,6 +56,7 @@
>>   #include <unistd.h>
>>   #include <assert.h>
>>   #include <errno.h>
>> +#include <stdlib.h>
>>
>>   /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
>>      block to cache between kernel invocations.  For soft-stacks blocks bigger
>> @@ -1739,11 +1740,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
>>     return 1;
>>   }
>>
>> -struct goacc_asyncqueue *
>> -GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
>> +static struct goacc_asyncqueue *
>> +nvptx_goacc_asyncqueue_construct (unsigned int flags)
>>   {
>>     CUstream stream = NULL;
>> -  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
>> +  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
>>
>>     struct goacc_asyncqueue *aq
>>       = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
>> @@ -1751,14 +1752,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
>>     return aq;
>>   }
>>
>> -bool
>> -GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
>> +struct goacc_asyncqueue *
>> +GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
>> +{
>> +  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
>> +}
>> +
>> +static bool
>> +nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
>>   {
>>     CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
>>     free (aq);
>>     return true;
>>   }
>>
>> +bool
>> +GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
>> +{
>> +  return nvptx_goacc_asyncqueue_destruct (aq);
>> +}
>> +
>>   int
>>   GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
>>   {
>> @@ -1772,13 +1785,19 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
>>     return -1;
>>   }
>>
>> -bool
>> -GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
>> +static bool
>> +nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
>>   {
>>     CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
>>     return true;
>>   }
>>
>> +bool
>> +GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
>> +{
>> +  return nvptx_goacc_asyncqueue_synchronize (aq);
>> +}
>> +
>>   bool
>>   GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
>>                                     struct goacc_asyncqueue *aq2)
>> @@ -2038,22 +2057,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
>>   }
>>
>>
>> -void
>> -rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
>> -                      CUstream stream)
>> -{
>> -  CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
>> -  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
>> -}
>> -
>> -void
>> -rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
>> -                      CUstream stream)
>> -{
>> -  CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
>> -  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
>> -}
>> -
>>   void
>>   GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>>   {
>> @@ -2087,9 +2090,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>>       }
>>     nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
>>
>> -  size_t stack_size = nvptx_stacks_size ();
>>     bool reverse_offload = ptx_dev->rev_data != NULL;
>> -  CUstream copy_stream = NULL;
>> +  struct goacc_asyncqueue *reverse_offload_aq = NULL;
>> +  if (reverse_offload)
>> +    {
>> +      reverse_offload_aq
>> +     = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
>> +      if (!reverse_offload_aq)
>> +     exit (EXIT_FAILURE);
>> +    }
>> +
>> +  size_t stack_size = nvptx_stacks_size ();
>>
>>     pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
>>     void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
>> @@ -2103,8 +2114,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>>     GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
>>                    " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
>>                    __FUNCTION__, fn_name, teams, threads);
>> -  if (reverse_offload)
>> -    CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
>>     r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
>>                        32, threads, 1, 0, NULL, NULL, config);
>>     if (r != CUDA_SUCCESS)
>> @@ -2127,17 +2136,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>>           GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
>>                                   rev_data->addrs, rev_data->sizes,
>>                                   rev_data->kinds, rev_data->dev_num,
>> -                                 rev_off_dev_to_host_cpy,
>> -                                 rev_off_host_to_dev_cpy, copy_stream);
>> -         CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
>> +                                 reverse_offload_aq);
>> +         if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
>> +           exit (EXIT_FAILURE);
>>           __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
>>         }
>>       usleep (1);
>>         }
>>     else
>>       r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
>> -  if (reverse_offload)
>> -    CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
>>     if (r == CUDA_ERROR_LAUNCH_FAILED)
>>       GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
>>                      maybe_abort_msg);
>> @@ -2145,6 +2152,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>>       GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
>>
>>     pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
>> +
>> +  if (reverse_offload)
>> +    {
>> +      if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
>> +     exit (EXIT_FAILURE);
>> +    }
>>   }
>>
>>   /* TODO: Implement GOMP_OFFLOAD_async_run. */
>> diff --git a/libgomp/target.c b/libgomp/target.c
>> index 79ed64a5dc3..e02188cf7e1 100644
>> --- a/libgomp/target.c
>> +++ b/libgomp/target.c
>> @@ -3312,9 +3312,7 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
>>   void
>>   gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>>                uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
>> -              void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
>> -              void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
>> -              void *token)
>> +              struct goacc_asyncqueue *aq)
>>   {
>>     /* Return early if there is no offload code.  */
>>     if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
>> @@ -3356,26 +3354,17 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>>         devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
>>         sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
>>         kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
>> -      if (dev_to_host_cpy)
>> -     {
>> -       dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
>> -                        mapnum * sizeof (uint64_t), token);
>> -       dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
>> -                        mapnum * sizeof (uint64_t), token);
>> -       dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
>> -                        mapnum * sizeof (unsigned short), token);
>> -     }
>> -      else
>> -     {
>> -       gomp_copy_dev2host (devicep, NULL, devaddrs,
>> -                           (const void *) (uintptr_t) devaddrs_ptr,
>> -                           mapnum * sizeof (uint64_t));
>> -       gomp_copy_dev2host (devicep, NULL, sizes,
>> -                           (const void *) (uintptr_t) sizes_ptr,
>> -                           mapnum * sizeof (uint64_t));
>> -       gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr,
>> -                           mapnum * sizeof (unsigned short));
>> -     }
>> +      gomp_copy_dev2host (devicep, aq, devaddrs,
>> +                       (const void *) (uintptr_t) devaddrs_ptr,
>> +                       mapnum * sizeof (uint64_t));
>> +      gomp_copy_dev2host (devicep, aq, sizes,
>> +                       (const void *) (uintptr_t) sizes_ptr,
>> +                       mapnum * sizeof (uint64_t));
>> +      gomp_copy_dev2host (devicep, aq, kinds,
>> +                       (const void *) (uintptr_t) kinds_ptr,
>> +                       mapnum * sizeof (unsigned short));
>> +      if (aq && !devicep->openacc.async.synchronize_func (aq))
>> +     exit (EXIT_FAILURE);
>>       }
>>
>>     size_t tgt_align = 0, tgt_size = 0;
>> @@ -3402,13 +3391,14 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>>           if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
>>             memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
>>                     (size_t) sizes[i]);
>> -         else if (dev_to_host_cpy)
>> -           dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
>> -                            (size_t) sizes[i], token);
>>           else
>> -           gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
>> -                               (void *) (uintptr_t) devaddrs[i],
>> -                               (size_t) sizes[i]);
>> +           {
>> +             gomp_copy_dev2host (devicep, aq, tgt + tgt_size,
>> +                                 (void *) (uintptr_t) devaddrs[i],
>> +                                 (size_t) sizes[i]);
>> +             if (aq && !devicep->openacc.async.synchronize_func (aq))
>> +               exit (EXIT_FAILURE);
>> +           }
>>           devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
>>           tgt_size = tgt_size + sizes[i];
>>           if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
>> @@ -3498,15 +3488,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>>                   || kind == GOMP_MAP_ALWAYS_TO
>>                   || kind == GOMP_MAP_ALWAYS_TOFROM)
>>                 {
>> -                 if (dev_to_host_cpy)
>> -                   dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
>> -                                    (void *) (uintptr_t) cdata[i].devaddr,
>> -                                    sizes[i], token);
>> -                 else
>> -                   gomp_copy_dev2host (devicep, NULL,
>> -                                       (void *) (uintptr_t) devaddrs[i],
>> -                                       (void *) (uintptr_t) cdata[i].devaddr,
>> -                                       sizes[i]);
>> +                 gomp_copy_dev2host (devicep, aq,
>> +                                     (void *) (uintptr_t) devaddrs[i],
>> +                                     (void *) (uintptr_t) cdata[i].devaddr,
>> +                                     sizes[i]);
>> +                 if (aq && !devicep->openacc.async.synchronize_func (aq))
>> +                   {
>> +                     gomp_mutex_unlock (&devicep->lock);
>> +                     exit (EXIT_FAILURE);
>> +                   }
>>                 }
>>               if (struct_cpy)
>>                 struct_cpy--;
>> @@ -3573,15 +3563,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>>                   devaddrs[i]
>>                     = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
>>                                                                  sizes[i]);
>> -                 if (dev_to_host_cpy)
>> -                   dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
>> -                                    (void *) (uintptr_t) cdata[i].devaddr,
>> -                                    sizes[i], token);
>> -                 else
>> -                   gomp_copy_dev2host (devicep, NULL,
>> -                                       (void *) (uintptr_t) devaddrs[i],
>> -                                       (void *) (uintptr_t) cdata[i].devaddr,
>> -                                       sizes[i]);
>> +                 gomp_copy_dev2host (devicep, aq,
>> +                                     (void *) (uintptr_t) devaddrs[i],
>> +                                     (void *) (uintptr_t) cdata[i].devaddr,
>> +                                     sizes[i]);
>> +                 if (aq && !devicep->openacc.async.synchronize_func (aq))
>> +                   {
>> +                     gomp_mutex_unlock (&devicep->lock);
>> +                     exit (EXIT_FAILURE);
>> +                   }
>>                 }
>>               for (j = i + 1; j < mapnum; j++)
>>                 {
>> @@ -3685,15 +3675,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>>               /* FALLTHRU */
>>             case GOMP_MAP_FROM:
>>             case GOMP_MAP_TOFROM:
>> -             if (copy && host_to_dev_cpy)
>> -               host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
>> -                                (void *) (uintptr_t) devaddrs[i],
>> -                                sizes[i], token);
>> -             else if (copy)
>> -               gomp_copy_host2dev (devicep, NULL,
>> -                                   (void *) (uintptr_t) cdata[i].devaddr,
>> -                                   (void *) (uintptr_t) devaddrs[i],
>> -                                   sizes[i], false, NULL);
>> +             if (copy)
>> +               {
>> +                 gomp_copy_host2dev (devicep, aq,
>> +                                     (void *) (uintptr_t) cdata[i].devaddr,
>> +                                     (void *) (uintptr_t) devaddrs[i],
>> +                                     sizes[i], false, NULL);
>> +                 if (aq && !devicep->openacc.async.synchronize_func (aq))
>> +                   exit (EXIT_FAILURE);
>> +               }
>>             default:
>>               break;
>>           }
> -----------------
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  
Tobias Burnus April 28, 2023, 10:51 a.m. UTC | #3
On 28.04.23 11:31, Thomas Schwinge wrote:
> On 2023-04-28T10:48:31+0200, Tobias Burnus <tobias@codesourcery.com> wrote:
>> I don't think that just calling "exit (EXIT_FAILURE);" is the proper
>> way
> The point is, when we run into such an 'exit', we've already issued an
> error (in the plugin, via 'GOMP_PLUGIN_fatal'),
you meant: GOMP_PLUGIN_error.
> and then (to replicate
> what 'GOMP_PLUGIN_fatal'/'gomp_fatal' do) we just need to 'exit' -- after
> unlocking.  The latter is the reason why we can't just do this:
>
>> – I think that should be GOMP_PLUGIN_fatal in the plugin and
>> gomp_fatal in target.c.
> ..., because we'd dead-lock due to 'atexit' shutdown of devices etc.,
> while still having devices etc. locked.
>
> (Resolving all this differently/"properly" is for another day.)
→ https://gcc.gnu.org/PR109664
>> Otherwise, it LGTM.
> Thanks.  OK to push then, given the rationale above?

OK.

Tobias

-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  

Patch

From 65636e924f69a146e571e7a7009304803e24ca1a Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Tue, 21 Mar 2023 16:14:16 +0100
Subject: [PATCH] libgomp: Simplify OpenMP reverse offload host <-> device
 memory copy implementation

... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.

Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
"libgomp/nvptx: Prepare for reverse-offload callback handling",
and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
"libgomp: Handle OpenMP's reverse offloads".

	libgomp/
	* target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
	'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
	* libgomp.h (gomp_target_rev): Adjust.
	* libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
	* libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
	* plugin/plugin-gcn.c (process_reverse_offload): Adjust.
	* plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
	(rev_off_host_to_dev_cpy): Remove.
	(GOMP_OFFLOAD_run): Adjust.
---
 libgomp/libgomp-plugin.c      |   7 +--
 libgomp/libgomp-plugin.h      |   6 +-
 libgomp/libgomp.h             |   5 +-
 libgomp/plugin/plugin-gcn.c   |   2 +-
 libgomp/plugin/plugin-nvptx.c |  77 ++++++++++++++-----------
 libgomp/target.c              | 102 +++++++++++++++-------------------
 6 files changed, 96 insertions(+), 103 deletions(-)

diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c
index 27e7c94ba9b..d696515eeb6 100644
--- a/libgomp/libgomp-plugin.c
+++ b/libgomp/libgomp-plugin.c
@@ -82,11 +82,8 @@  GOMP_PLUGIN_fatal (const char *msg, ...)
 void
 GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
 			uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
-			void (*dev_to_host_cpy) (void *, const void *, size_t,
-						 void *),
-			void (*host_to_dev_cpy) (void *, const void *, size_t,
-						 void *), void *token)
+			struct goacc_asyncqueue *aq)
 {
   gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
-		   dev_to_host_cpy, host_to_dev_cpy, token);
+		   aq);
 }
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 28267f75f7a..42ee3d6c7f9 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -121,11 +121,7 @@  extern void GOMP_PLUGIN_fatal (const char *, ...)
 	__attribute__ ((noreturn, format (printf, 1, 2)));
 
 extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
-				    uint64_t, int,
-				    void (*) (void *, const void *, size_t,
-					      void *),
-				    void (*) (void *, const void *, size_t,
-					      void *), void *);
+				    uint64_t, int, struct goacc_asyncqueue *);
 
 /* Prototypes for functions implemented by libgomp plugins.  */
 extern const char *GOMP_OFFLOAD_get_name (void);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index ba8fe348aba..4d2bfab4b71 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1130,10 +1130,7 @@  extern void gomp_init_targets_once (void);
 extern int gomp_get_num_devices (void);
 extern bool gomp_target_task_fn (void *);
 extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
-			     int,
-			     void (*) (void *, const void *, size_t, void *),
-			     void (*) (void *, const void *, size_t, void *),
-			     void *);
+			     int, struct goacc_asyncqueue *);
 
 /* Splay tree definitions.  */
 typedef struct splay_tree_node_s *splay_tree_node;
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 347803762eb..2181bf0235f 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -1949,7 +1949,7 @@  process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
 {
   int dev_num = dev_num64;
   GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
-			  NULL, NULL, NULL);
+			  NULL);
 }
 
 /* Output any data written to console output from the kernel.  It is expected
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 5bd5a419e0e..4a710851ee5 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -56,6 +56,7 @@ 
 #include <unistd.h>
 #include <assert.h>
 #include <errno.h>
+#include <stdlib.h>
 
 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
    block to cache between kernel invocations.  For soft-stacks blocks bigger
@@ -1739,11 +1740,11 @@  GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
   return 1;
 }
 
-struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+static struct goacc_asyncqueue *
+nvptx_goacc_asyncqueue_construct (unsigned int flags)
 {
   CUstream stream = NULL;
-  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
 
   struct goacc_asyncqueue *aq
     = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
@@ -1751,14 +1752,26 @@  GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
   return aq;
 }
 
-bool
-GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+{
+  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+}
+
+static bool
+nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
 {
   CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
   free (aq);
   return true;
 }
 
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+  return nvptx_goacc_asyncqueue_destruct (aq);
+}
+
 int
 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
 {
@@ -1772,13 +1785,19 @@  GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
   return -1;
 }
 
-bool
-GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+static bool
+nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
 {
   CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
   return true;
 }
 
+bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+  return nvptx_goacc_asyncqueue_synchronize (aq);
+}
+
 bool
 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
 				      struct goacc_asyncqueue *aq2)
@@ -2038,22 +2057,6 @@  nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
 }
 
 
-void
-rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
-			 CUstream stream)
-{
-  CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
-  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
-void
-rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
-			 CUstream stream)
-{
-  CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
-  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
 void
 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 {
@@ -2087,9 +2090,17 @@  GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
     }
   nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
 
-  size_t stack_size = nvptx_stacks_size ();
   bool reverse_offload = ptx_dev->rev_data != NULL;
-  CUstream copy_stream = NULL;
+  struct goacc_asyncqueue *reverse_offload_aq = NULL;
+  if (reverse_offload)
+    {
+      reverse_offload_aq
+	= nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
+      if (!reverse_offload_aq)
+	exit (EXIT_FAILURE);
+    }
+
+  size_t stack_size = nvptx_stacks_size ();
 
   pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
   void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
@@ -2103,8 +2114,6 @@  GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
 		     __FUNCTION__, fn_name, teams, threads);
-  if (reverse_offload)
-    CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
   r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
 			 32, threads, 1, 0, NULL, NULL, config);
   if (r != CUDA_SUCCESS)
@@ -2127,17 +2136,15 @@  GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
 	    GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
 				    rev_data->addrs, rev_data->sizes,
 				    rev_data->kinds, rev_data->dev_num,
-				    rev_off_dev_to_host_cpy,
-				    rev_off_host_to_dev_cpy, copy_stream);
-	    CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
+				    reverse_offload_aq);
+	    if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
+	      exit (EXIT_FAILURE);
 	    __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
 	  }
 	usleep (1);
       }
   else
     r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
-  if (reverse_offload)
-    CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
   if (r == CUDA_ERROR_LAUNCH_FAILED)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
 		       maybe_abort_msg);
@@ -2145,6 +2152,12 @@  GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
 
   pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+
+  if (reverse_offload)
+    {
+      if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
+	exit (EXIT_FAILURE);
+    }
 }
 
 /* TODO: Implement GOMP_OFFLOAD_async_run. */
diff --git a/libgomp/target.c b/libgomp/target.c
index 79ed64a5dc3..e02188cf7e1 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -3312,9 +3312,7 @@  gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
 void
 gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
 		 uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
-		 void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
-		 void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
-		 void *token)
+		 struct goacc_asyncqueue *aq)
 {
   /* Return early if there is no offload code.  */
   if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
@@ -3356,26 +3354,17 @@  gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
       devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
       sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
       kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
-      if (dev_to_host_cpy)
-	{
-	  dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
-			   mapnum * sizeof (uint64_t), token);
-	  dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
-			   mapnum * sizeof (uint64_t), token);
-	  dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
-			   mapnum * sizeof (unsigned short), token);
-	}
-      else
-	{
-	  gomp_copy_dev2host (devicep, NULL, devaddrs,
-			      (const void *) (uintptr_t) devaddrs_ptr,
-			      mapnum * sizeof (uint64_t));
-	  gomp_copy_dev2host (devicep, NULL, sizes,
-			      (const void *) (uintptr_t) sizes_ptr,
-			      mapnum * sizeof (uint64_t));
-	  gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr,
-			      mapnum * sizeof (unsigned short));
-	}
+      gomp_copy_dev2host (devicep, aq, devaddrs,
+			  (const void *) (uintptr_t) devaddrs_ptr,
+			  mapnum * sizeof (uint64_t));
+      gomp_copy_dev2host (devicep, aq, sizes,
+			  (const void *) (uintptr_t) sizes_ptr,
+			  mapnum * sizeof (uint64_t));
+      gomp_copy_dev2host (devicep, aq, kinds,
+			  (const void *) (uintptr_t) kinds_ptr,
+			  mapnum * sizeof (unsigned short));
+      if (aq && !devicep->openacc.async.synchronize_func (aq))
+	exit (EXIT_FAILURE);
     }
 
   size_t tgt_align = 0, tgt_size = 0;
@@ -3402,13 +3391,14 @@  gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
 	    if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
 	      memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
 		      (size_t) sizes[i]);
-	    else if (dev_to_host_cpy)
-	      dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
-			       (size_t) sizes[i], token);
 	    else
-	      gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
-				  (void *) (uintptr_t) devaddrs[i],
-				  (size_t) sizes[i]);
+	      {
+		gomp_copy_dev2host (devicep, aq, tgt + tgt_size,
+				    (void *) (uintptr_t) devaddrs[i],
+				    (size_t) sizes[i]);
+		if (aq && !devicep->openacc.async.synchronize_func (aq))
+		  exit (EXIT_FAILURE);
+	      }
 	    devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
 	    tgt_size = tgt_size + sizes[i];
 	    if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
@@ -3498,15 +3488,15 @@  gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
 		    || kind == GOMP_MAP_ALWAYS_TO
 		    || kind == GOMP_MAP_ALWAYS_TOFROM)
 		  {
-		    if (dev_to_host_cpy)
-		      dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
-				       (void *) (uintptr_t) cdata[i].devaddr,
-				       sizes[i], token);
-		    else
-		      gomp_copy_dev2host (devicep, NULL,
-					  (void *) (uintptr_t) devaddrs[i],
-					  (void *) (uintptr_t) cdata[i].devaddr,
-					  sizes[i]);
+		    gomp_copy_dev2host (devicep, aq,
+					(void *) (uintptr_t) devaddrs[i],
+					(void *) (uintptr_t) cdata[i].devaddr,
+					sizes[i]);
+		    if (aq && !devicep->openacc.async.synchronize_func (aq))
+		      {
+			gomp_mutex_unlock (&devicep->lock);
+			exit (EXIT_FAILURE);
+		      }
 		  }
 		if (struct_cpy)
 		  struct_cpy--;
@@ -3573,15 +3563,15 @@  gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
 		    devaddrs[i]
 		      = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
 								   sizes[i]);
-		    if (dev_to_host_cpy)
-		      dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
-				       (void *) (uintptr_t) cdata[i].devaddr,
-				       sizes[i], token);
-		    else
-		      gomp_copy_dev2host (devicep, NULL,
-					  (void *) (uintptr_t) devaddrs[i],
-					  (void *) (uintptr_t) cdata[i].devaddr,
-					  sizes[i]);
+		    gomp_copy_dev2host (devicep, aq,
+					(void *) (uintptr_t) devaddrs[i],
+					(void *) (uintptr_t) cdata[i].devaddr,
+					sizes[i]);
+		    if (aq && !devicep->openacc.async.synchronize_func (aq))
+		      {
+			gomp_mutex_unlock (&devicep->lock);
+			exit (EXIT_FAILURE);
+		      }
 		  }
 		for (j = i + 1; j < mapnum; j++)
 		  {
@@ -3685,15 +3675,15 @@  gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
 		/* FALLTHRU */
 	      case GOMP_MAP_FROM:
 	      case GOMP_MAP_TOFROM:
-		if (copy && host_to_dev_cpy)
-		  host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
-				   (void *) (uintptr_t) devaddrs[i],
-				   sizes[i], token);
-		else if (copy)
-		  gomp_copy_host2dev (devicep, NULL,
-				      (void *) (uintptr_t) cdata[i].devaddr,
-				      (void *) (uintptr_t) devaddrs[i],
-				      sizes[i], false, NULL);
+		if (copy)
+		  {
+		    gomp_copy_host2dev (devicep, aq,
+					(void *) (uintptr_t) cdata[i].devaddr,
+					(void *) (uintptr_t) devaddrs[i],
+					sizes[i], false, NULL);
+		    if (aq && !devicep->openacc.async.synchronize_func (aq))
+		      exit (EXIT_FAILURE);
+		  }
 	      default:
 		break;
 	    }
-- 
2.25.1