Hi Andrew!
On 2022-03-08T11:30:57+0000, Hafiz Abid Qadeer <abidh@codesourcery.com> wrote:
> From: Andrew Stubbs <ams@codesourcery.com>
>
> This adds support for using Cuda Managed Memory with omp_alloc. It will be
> used as the underpinnings for "requires unified_shared_memory" in a later
> patch.
>
> There are two new predefined allocators, ompx_unified_shared_mem_alloc and
> ompx_host_mem_alloc, plus corresponding memory spaces, [...]
> --- a/libgomp/config/linux/allocator.c
> +++ b/libgomp/config/linux/allocator.c
> @@ -42,9 +42,11 @@
> static void *
> linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
> {
> - (void)memspace;
> -
> - if (pin)
> + if (memspace == ompx_unified_shared_mem_space)
> + {
> + return gomp_usm_alloc (size, GOMP_DEVICE_ICV);
> + }
> + else if (pin)
> {
> void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
> MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
This I understand conceptually, but then:
> @@ -67,7 +69,14 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
> static void *
> linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
> {
> - if (pin)
> + if (memspace == ompx_unified_shared_mem_space)
> + {
> + void *ret = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
> + memset (ret, 0, size);
> + return ret;
> + }
> + else if (memspace == ompx_unified_shared_mem_space
> + || pin)
> return linux_memspace_alloc (memspace, size, pin);
> else
> return calloc (1, size);
..., here, we've got a duplicated (and thus always-false) expression
'memspace == ompx_unified_shared_mem_space' (..., which
'-Wduplicated-cond' fails to report; <https://gcc.gnu.org/PR108753>
"'-Wduplicated-cond' doesn't diagnose duplicated subexpressions"...).
Is the correct fix the following (conceptually like
'linux_memspace_alloc' cited above), or is there something that I fail to
understand?
static void *
linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
{
if (memspace == ompx_unified_shared_mem_space)
{
void *ret = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
memset (ret, 0, size);
return ret;
}
- else if (memspace == ompx_unified_shared_mem_space
- || pin)
+ else if (pin)
return linux_memspace_alloc (memspace, size, pin);
else
return calloc (1, size);
The following ones then again are conceptually like
'linux_memspace_alloc' cited above:
> @@ -77,9 +86,9 @@ static void
> linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
> int pin)
> {
> - (void)memspace;
> -
> - if (pin)
> + if (memspace == ompx_unified_shared_mem_space)
> + gomp_usm_free (addr, GOMP_DEVICE_ICV);
> + else if (pin)
> munmap (addr, size);
> else
> free (addr);
> @@ -89,7 +98,9 @@ static void *
> linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
> size_t oldsize, size_t size, int oldpin, int pin)
> {
> - if (oldpin && pin)
> + if (memspace == ompx_unified_shared_mem_space)
> + goto manual_realloc;
> + else if (oldpin && pin)
> {
> void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
> if (newaddr == MAP_FAILED)
> @@ -98,18 +109,19 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
> [...]
..., and similar those here:
> --- a/libgomp/config/nvptx/allocator.c
> +++ b/libgomp/config/nvptx/allocator.c
> @@ -125,6 +125,8 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
> __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
> return result;
> }
> + else if (memspace == ompx_host_mem_space)
> + return NULL;
> else
> return malloc (size);
> }
> @@ -145,6 +147,8 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
>
> return result;
> }
> + else if (memspace == ompx_host_mem_space)
> + return NULL;
> else
> return calloc (1, size);
> }
> @@ -354,6 +358,8 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
> }
> return result;
> }
> + else if (memspace == ompx_host_mem_space)
> + return NULL;
> else
> return realloc (addr, size);
> }
(I'd have added an explicit no-op (or, 'abort'?) to
'nvptx_memspace_free', but that's maybe just me...) ;-\
> --- a/libgomp/libgomp.h
> +++ b/libgomp/libgomp.h
> +extern void * gomp_usm_alloc (size_t size, int device_num);
> +extern void gomp_usm_free (void *device_ptr, int device_num);
> +extern bool gomp_is_usm_ptr (void *ptr);
'gomp_is_usm_ptr' isn't defined/used anywhere; I'll remove it.
> --- a/libgomp/target.c
> +++ b/libgomp/target.c
> @@ -3740,6 +3807,9 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
> DLSYM (unload_image);
> DLSYM (alloc);
> DLSYM (free);
> + DLSYM_OPT (usm_alloc, usm_alloc);
> + DLSYM_OPT (usm_free, usm_free);
> + DLSYM_OPT (is_usm_ptr, is_usm_ptr);
> DLSYM (dev2host);
> DLSYM (host2dev);
As a sanity check, shouldn't we check that either none or all three of
those are defined, like in the 'if (cuda && cuda != 4) { [error] }' check
a bit further down?
Note that these remarks likewise apply to the current upstream
submission:
<https://inbox.sourceware.org/gcc-patches/ef374d055251b2bc65b97d7e54a0a72d811b869d.1657188329.git.ams@codesourcery.com>
"openmp, nvptx: ompx_unified_shared_mem_alloc".
Grüße
Thomas
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
On 10/02/2023 14:21, Thomas Schwinge wrote:
> Is the correct fix the following (conceptually like
> 'linux_memspace_alloc' cited above), or is there something that I fail to
> understand?
>
> static void *
> linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
> {
> if (memspace == ompx_unified_shared_mem_space)
> {
> void *ret = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
> memset (ret, 0, size);
> return ret;
> }
> - else if (memspace == ompx_unified_shared_mem_space
> - || pin)
> + else if (pin)
> return linux_memspace_alloc (memspace, size, pin);
> else
> return calloc (1, size);
Yes, I think that is what was intended (and what actually happens). You
can have your memory both unified and pinned (well, maybe it's possible,
but there's no one Cuda API for that), so the USM takes precedence.
> The following ones then again are conceptually like
> 'linux_memspace_alloc' cited above:
>
>> @@ -77,9 +86,9 @@ static void
>> linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
>> int pin)
>> {
>> - (void)memspace;
>> -
>> - if (pin)
>> + if (memspace == ompx_unified_shared_mem_space)
>> + gomp_usm_free (addr, GOMP_DEVICE_ICV);
>> + else if (pin)
>> munmap (addr, size);
>> else
>> free (addr);
>> @@ -89,7 +98,9 @@ static void *
>> linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
>> size_t oldsize, size_t size, int oldpin, int pin)
>> {
>> - if (oldpin && pin)
>> + if (memspace == ompx_unified_shared_mem_space)
>> + goto manual_realloc;
>> + else if (oldpin && pin)
>> {
>> void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
>> if (newaddr == MAP_FAILED)
>> @@ -98,18 +109,19 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
>> [...]
Yes.
> ..., and similar those here:
>
>> --- a/libgomp/config/nvptx/allocator.c
>> +++ b/libgomp/config/nvptx/allocator.c
>> @@ -125,6 +125,8 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
>> __atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
>> return result;
>> }
>> + else if (memspace == ompx_host_mem_space)
>> + return NULL;
>> else
>> return malloc (size);
>> }
>> @@ -145,6 +147,8 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
>>
>> return result;
>> }
>> + else if (memspace == ompx_host_mem_space)
>> + return NULL;
>> else
>> return calloc (1, size);
>> }
>> @@ -354,6 +358,8 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
>> }
>> return result;
>> }
>> + else if (memspace == ompx_host_mem_space)
>> + return NULL;
>> else
>> return realloc (addr, size);
>> }
>
> (I'd have added an explicit no-op (or, 'abort'?) to
> 'nvptx_memspace_free', but that's maybe just me...) ;-\
Why? The host memspace is just the regular heap, which can be a thing on
any device. It's an extension though so we can define it either way.
>> --- a/libgomp/libgomp.h
>> +++ b/libgomp/libgomp.h
>
>> +extern void * gomp_usm_alloc (size_t size, int device_num);
>> +extern void gomp_usm_free (void *device_ptr, int device_num);
>> +extern bool gomp_is_usm_ptr (void *ptr);
>
> 'gomp_is_usm_ptr' isn't defined/used anywhere; I'll remove it.
I think I started that and then decided against. Thanks.
>> --- a/libgomp/target.c
>> +++ b/libgomp/target.c
>
>> @@ -3740,6 +3807,9 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
>> DLSYM (unload_image);
>> DLSYM (alloc);
>> DLSYM (free);
>> + DLSYM_OPT (usm_alloc, usm_alloc);
>> + DLSYM_OPT (usm_free, usm_free);
>> + DLSYM_OPT (is_usm_ptr, is_usm_ptr);
>> DLSYM (dev2host);
>> DLSYM (host2dev);
>
> As a sanity check, shouldn't we check that either none or all three of
> those are defined, like in the 'if (cuda && cuda != 4) { [error] }' check
> a bit further down?
This is only going to happen when somebody writes a new plugin, and then
they'll discover very quickly that there are issues. I've wasted more
time writing this sentence than it's worth already. :)
> Note that these remarks likewise apply to the current upstream
> submission:
> <https://inbox.sourceware.org/gcc-patches/ef374d055251b2bc65b97d7e54a0a72d811b869d.1657188329.git.ams@codesourcery.com>> "openmp, nvptx: ompx_unified_shared_mem_alloc".
I have new patches to heap on top of this set already on OG12, and more
planned, plus these ones you're working on; the whole patchset is going
to have to get a rebase, squash, and tidy "soonish".
Andrew
@@ -32,7 +32,7 @@
#include <stdlib.h>
#include <string.h>
-#define omp_max_predefined_alloc ompx_pinned_mem_alloc
+#define omp_max_predefined_alloc ompx_host_mem_alloc
/* These macros may be overridden in config/<target>/allocator.c. */
#ifndef MEMSPACE_ALLOC
@@ -68,6 +68,8 @@ static const omp_memspace_handle_t predefined_alloc_mapping[] = {
omp_low_lat_mem_space, /* omp_pteam_mem_alloc. */
omp_low_lat_mem_space, /* omp_thread_mem_alloc. */
omp_default_mem_space, /* ompx_pinned_mem_alloc. */
+ ompx_unified_shared_mem_space, /* ompx_unified_shared_mem_alloc. */
+ ompx_host_mem_space, /* ompx_host_mem_alloc. */
};
struct omp_allocator_data
@@ -367,7 +369,8 @@ fail:
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_pinned_mem_alloc)
+ || allocator == ompx_pinned_mem_alloc
+ || allocator == ompx_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
@@ -597,7 +600,8 @@ fail:
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_pinned_mem_alloc)
+ || allocator == ompx_pinned_mem_alloc
+ || allocator == ompx_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
@@ -855,7 +859,8 @@ fail:
int fallback = (allocator_data
? allocator_data->fallback
: (allocator == omp_default_mem_alloc
- || allocator == ompx_pinned_mem_alloc)
+ || allocator == ompx_pinned_mem_alloc
+ || allocator == ompx_host_mem_alloc)
? omp_atv_null_fb
: omp_atv_default_mem_fb);
switch (fallback)
@@ -42,9 +42,11 @@
static void *
linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
{
- (void)memspace;
-
- if (pin)
+ if (memspace == ompx_unified_shared_mem_space)
+ {
+ return gomp_usm_alloc (size, GOMP_DEVICE_ICV);
+ }
+ else if (pin)
{
void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
@@ -67,7 +69,14 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
static void *
linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
{
- if (pin)
+ if (memspace == ompx_unified_shared_mem_space)
+ {
+ void *ret = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
+ memset (ret, 0, size);
+ return ret;
+ }
+ else if (memspace == ompx_unified_shared_mem_space
+ || pin)
return linux_memspace_alloc (memspace, size, pin);
else
return calloc (1, size);
@@ -77,9 +86,9 @@ static void
linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
int pin)
{
- (void)memspace;
-
- if (pin)
+ if (memspace == ompx_unified_shared_mem_space)
+ gomp_usm_free (addr, GOMP_DEVICE_ICV);
+ else if (pin)
munmap (addr, size);
else
free (addr);
@@ -89,7 +98,9 @@ static void *
linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
size_t oldsize, size_t size, int oldpin, int pin)
{
- if (oldpin && pin)
+ if (memspace == ompx_unified_shared_mem_space)
+ goto manual_realloc;
+ else if (oldpin && pin)
{
void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
if (newaddr == MAP_FAILED)
@@ -98,18 +109,19 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
return newaddr;
}
else if (oldpin || pin)
- {
- void *newaddr = linux_memspace_alloc (memspace, size, pin);
- if (newaddr)
- {
- memcpy (newaddr, addr, oldsize < size ? oldsize : size);
- linux_memspace_free (memspace, addr, oldsize, oldpin);
- }
-
- return newaddr;
- }
+ goto manual_realloc;
else
return realloc (addr, size);
+
+manual_realloc:
+ void *newaddr = linux_memspace_alloc (memspace, size, pin);
+ if (newaddr)
+ {
+ memcpy (newaddr, addr, oldsize < size ? oldsize : size);
+ linux_memspace_free (memspace, addr, oldsize, oldpin);
+ }
+
+ return newaddr;
}
#define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
@@ -125,6 +125,8 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
__atomic_store_n (&__nvptx_lowlat_heap_root, root.raw, MEMMODEL_RELEASE);
return result;
}
+ else if (memspace == ompx_host_mem_space)
+ return NULL;
else
return malloc (size);
}
@@ -145,6 +147,8 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
return result;
}
+ else if (memspace == ompx_host_mem_space)
+ return NULL;
else
return calloc (1, size);
}
@@ -354,6 +358,8 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
}
return result;
}
+ else if (memspace == ompx_host_mem_space)
+ return NULL;
else
return realloc (addr, size);
}
@@ -134,6 +134,9 @@ extern int GOMP_OFFLOAD_load_image (int, unsigned, const void *,
extern bool GOMP_OFFLOAD_unload_image (int, unsigned, const void *);
extern void *GOMP_OFFLOAD_alloc (int, size_t);
extern bool GOMP_OFFLOAD_free (int, void *);
+extern void *GOMP_OFFLOAD_usm_alloc (int, size_t);
+extern bool GOMP_OFFLOAD_usm_free (int, void *);
+extern bool GOMP_OFFLOAD_is_usm_ptr (void *);
extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
@@ -1013,6 +1013,9 @@ extern int gomp_pause_host (void);
extern void gomp_init_targets_once (void);
extern int gomp_get_num_devices (void);
extern bool gomp_target_task_fn (void *);
+extern void * gomp_usm_alloc (size_t size, int device_num);
+extern void gomp_usm_free (void *device_ptr, int device_num);
+extern bool gomp_is_usm_ptr (void *ptr);
/* Splay tree definitions. */
typedef struct splay_tree_node_s *splay_tree_node;
@@ -1238,6 +1241,9 @@ struct gomp_device_descr
__typeof (GOMP_OFFLOAD_unload_image) *unload_image_func;
__typeof (GOMP_OFFLOAD_alloc) *alloc_func;
__typeof (GOMP_OFFLOAD_free) *free_func;
+ __typeof (GOMP_OFFLOAD_usm_alloc) *usm_alloc_func;
+ __typeof (GOMP_OFFLOAD_usm_free) *usm_free_func;
+ __typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func;
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
__typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
__typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
@@ -120,6 +120,8 @@ typedef enum omp_memspace_handle_t __GOMP_UINTPTR_T_ENUM
omp_const_mem_space = 2,
omp_high_bw_mem_space = 3,
omp_low_lat_mem_space = 4,
+ ompx_unified_shared_mem_space = 5,
+ ompx_host_mem_space = 6,
__omp_memspace_handle_t_max__ = __UINTPTR_MAX__
} omp_memspace_handle_t;
@@ -135,6 +137,8 @@ typedef enum omp_allocator_handle_t __GOMP_UINTPTR_T_ENUM
omp_pteam_mem_alloc = 7,
omp_thread_mem_alloc = 8,
ompx_pinned_mem_alloc = 9,
+ ompx_unified_shared_mem_alloc = 10,
+ ompx_host_mem_alloc = 11,
__omp_allocator_handle_t_max__ = __UINTPTR_MAX__
} omp_allocator_handle_t;
@@ -160,6 +160,10 @@
parameter :: omp_thread_mem_alloc = 8
integer (kind=omp_allocator_handle_kind), &
parameter :: ompx_pinned_mem_alloc = 9
+ integer (kind=omp_allocator_handle_kind), &
+ parameter :: ompx_unified_shared_mem_alloc = 10
+ integer (kind=omp_allocator_handle_kind), &
+ parameter :: ompx_host_mem_alloc = 11
integer (omp_memspace_handle_kind), &
parameter :: omp_default_mem_space = 0
integer (omp_memspace_handle_kind), &
@@ -170,6 +174,10 @@
parameter :: omp_high_bw_mem_space = 3
integer (omp_memspace_handle_kind), &
parameter :: omp_low_lat_mem_space = 4
+ integer (omp_memspace_handle_kind), &
+ parameter :: omp_unified_shared_mem_space = 5
+ integer (omp_memspace_handle_kind), &
+ parameter :: omp_host_mem_space = 6
type omp_alloctrait
integer (kind=omp_alloctrait_key_kind) key
@@ -1042,11 +1042,13 @@ nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
}
static void *
-nvptx_alloc (size_t s, bool suppress_errors)
+nvptx_alloc (size_t s, bool suppress_errors, bool usm)
{
CUdeviceptr d;
- CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
+ CUresult r = (usm ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s,
+ CU_MEM_ATTACH_GLOBAL)
+ : CUDA_CALL_NOCHECK (cuMemAlloc, &d, s));
if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
return NULL;
else if (r != CUDA_SUCCESS)
@@ -1423,8 +1425,8 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
return ret;
}
-void *
-GOMP_OFFLOAD_alloc (int ord, size_t size)
+static void *
+GOMP_OFFLOAD_alloc_1 (int ord, size_t size, bool usm)
{
if (!nvptx_attach_host_thread_to_device (ord))
return NULL;
@@ -1447,7 +1449,7 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
blocks = tmp;
}
- void *d = nvptx_alloc (size, true);
+ void *d = nvptx_alloc (size, true, usm);
if (d)
return d;
else
@@ -1455,10 +1457,22 @@ GOMP_OFFLOAD_alloc (int ord, size_t size)
/* Memory allocation failed. Try freeing the stacks block, and
retrying. */
nvptx_stacks_free (ptx_dev, true);
- return nvptx_alloc (size, false);
+ return nvptx_alloc (size, false, usm);
}
}
+void *
+GOMP_OFFLOAD_alloc (int ord, size_t size)
+{
+ return GOMP_OFFLOAD_alloc_1 (ord, size, false);
+}
+
+void *
+GOMP_OFFLOAD_usm_alloc (int ord, size_t size)
+{
+ return GOMP_OFFLOAD_alloc_1 (ord, size, true);
+}
+
bool
GOMP_OFFLOAD_free (int ord, void *ptr)
{
@@ -1466,6 +1480,25 @@ GOMP_OFFLOAD_free (int ord, void *ptr)
&& nvptx_free (ptr, ptx_devices[ord]));
}
+bool
+GOMP_OFFLOAD_usm_free (int ord, void *ptr)
+{
+ return GOMP_OFFLOAD_free (ord, ptr);
+}
+
+bool
+GOMP_OFFLOAD_is_usm_ptr (void *ptr)
+{
+ bool managed = false;
+ /* This returns 3 outcomes ...
+ CUDA_ERROR_INVALID_VALUE - Not a Cuda allocated pointer.
+ CUDA_SUCCESS, managed:false - Cuda allocated, but not USM.
+ CUDA_SUCCESS, managed:true - USM. */
+ CUDA_CALL_NOCHECK (cuPointerGetAttribute, &managed,
+ CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr);
+ return managed;
+}
+
void
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
void **hostaddrs, void **devaddrs,
@@ -1030,6 +1030,15 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
tgt->list[i].offset = 0;
continue;
}
+ else if (devicep->is_usm_ptr_func
+ && devicep->is_usm_ptr_func (hostaddrs[i]))
+ {
+ /* The memory is visible from both host and target
+ so nothing needs to be moved. */
+ tgt->list[i].key = NULL;
+ tgt->list[i].offset = OFFSET_INLINED;
+ continue;
+ }
else if ((kind & typemask) == GOMP_MAP_STRUCT)
{
size_t first = i + 1;
@@ -1488,6 +1497,9 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
continue;
}
default:
+ if (tgt->list[i].offset == OFFSET_INLINED
+ && !array)
+ continue;
break;
}
splay_tree_key k = &array->key;
@@ -3323,6 +3335,61 @@ omp_target_free (void *device_ptr, int device_num)
gomp_mutex_unlock (&devicep->lock);
}
+void *
+gomp_usm_alloc (size_t size, int device_num)
+{
+ if (device_num == gomp_get_num_devices ())
+ return malloc (size);
+
+ struct gomp_device_descr *devicep = resolve_device (device_num);
+ if (devicep == NULL)
+ return NULL;
+
+ if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+ || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ return malloc (size);
+
+ void *ret = NULL;
+ gomp_mutex_lock (&devicep->lock);
+ if (devicep->usm_alloc_func)
+ ret = devicep->usm_alloc_func (devicep->target_id, size);
+ gomp_mutex_unlock (&devicep->lock);
+ return ret;
+}
+
+void
+gomp_usm_free (void *device_ptr, int device_num)
+{
+ if (device_ptr == NULL)
+ return;
+
+ if (device_num == gomp_get_num_devices ())
+ {
+ free (device_ptr);
+ return;
+ }
+
+ struct gomp_device_descr *devicep = resolve_device (device_num);
+ if (devicep == NULL)
+ return;
+
+ if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+ || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+ {
+ free (device_ptr);
+ return;
+ }
+
+ gomp_mutex_lock (&devicep->lock);
+ if (devicep->usm_free_func
+ && !devicep->usm_free_func (devicep->target_id, device_ptr))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ gomp_fatal ("error in freeing device memory block at %p", device_ptr);
+ }
+ gomp_mutex_unlock (&devicep->lock);
+}
+
int
omp_target_is_present (const void *ptr, int device_num)
{
@@ -3740,6 +3807,9 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
DLSYM (unload_image);
DLSYM (alloc);
DLSYM (free);
+ DLSYM_OPT (usm_alloc, usm_alloc);
+ DLSYM_OPT (usm_free, usm_free);
+ DLSYM_OPT (is_usm_ptr, is_usm_ptr);
DLSYM (dev2host);
DLSYM (host2dev);
device->capabilities = device->get_caps_func ();
new file mode 100644
@@ -0,0 +1,24 @@
+/* { dg-do run } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int), ompx_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ *a = 42;
+ uintptr_t a_p = (uintptr_t)a;
+
+ #pragma omp target is_device_ptr(a)
+ {
+ if (*a != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,32 @@
+/* { dg-do run } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+ #pragma omp target map(a[0])
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ #pragma omp target map(a[1])
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,35 @@
+/* { dg-do run } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target data map(a[0:2])
+ {
+#pragma omp target
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+ }
+
+ omp_free(a, ompx_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+
+#include <omp.h>
+#include <stdint.h>
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int)*2, ompx_unified_shared_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+ a[1] = 43;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target enter data map(to:a[0:2])
+
+#pragma omp target
+ {
+ if (a[0] != 42 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target
+ {
+ if (a[1] != 43 || a_p != (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+#pragma omp target exit data map(delete:a[0:2])
+
+ omp_free(a, ompx_unified_shared_mem_alloc);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-require-effective-target offload_device } */
+
+#include <omp.h>
+#include <stdint.h>
+
+#pragma omp requires unified_shared_memory
+
+int
+main ()
+{
+ int *a = (int *) omp_alloc(sizeof(int), ompx_host_mem_alloc);
+ if (!a)
+ __builtin_abort ();
+
+ a[0] = 42;
+
+ uintptr_t a_p = (uintptr_t)a;
+
+#pragma omp target map(a[0:1])
+ {
+ if (a[0] != 42 || a_p == (uintptr_t)a)
+ __builtin_abort ();
+ }
+
+ omp_free(a, ompx_host_mem_alloc);
+ return 0;
+}