[og12] Attempt to not just register but allocate OpenMP pinned memory using a device (was: [og12] Attempt to register OpenMP pinned memory using a device instead of 'mlock')

Message ID 87r0uktpds.fsf@euler.schwinge.homeip.net
State Deferred
Headers
Series [og12] Attempt to not just register but allocate OpenMP pinned memory using a device (was: [og12] Attempt to register OpenMP pinned memory using a device instead of 'mlock') |

Commit Message

Thomas Schwinge Feb. 20, 2023, 1:53 p.m. UTC
  Hi!

On 2023-02-20T09:48:53+0000, Andrew Stubbs <ams@codesourcery.com> wrote:
> On 17/02/2023 08:12, Thomas Schwinge wrote:
>> On 2023-02-16T23:06:44+0100, I wrote:
>>> On 2023-02-16T16:17:32+0000, "Stubbs, Andrew via Gcc-patches" <gcc-patches@gcc.gnu.org> wrote:
>>>> The mmap implementation was not optimized for a lot of small allocations, and I can't see that issue changing here
>>>
>>> That's correct, 'mmap' remains.  Under the hood, 'cuMemHostRegister' must
>>> surely also be doing some 'mlock'-like thing, so I figured it's best to
>>> feed page-boundary memory regions to it, which 'mmap' gets us.
>>>
>>>> so I don't know if this can be used for mlockall replacement.
>>>>
>>>> I had assumed that using the Cuda allocator would fix that limitation.
>>>
>>> From what I've read (but no first-hand experiments), there's non-trivial
>>> overhead with 'cuMemHostRegister' (just like with 'mlock'), so routing
>>> all small allocations individually through it probably isn't a good idea
>>> either.  Therefore, I suppose, we'll indeed want to use some local
>>> allocator if we wish this "optimized for a lot of small allocations".
>>
>> Eh, I suppose your point indirectly was that instead of 'mmap' plus
>> 'cuMemHostRegister' we ought to use 'cuMemAllocHost'/'cuMemHostAlloc', as
>> we assume those already do implement such a local allocator.  Let me
>> quickly change that indeed -- we don't currently have a need to use
>> 'cuMemHostRegister' instead of 'cuMemAllocHost'/'cuMemHostAlloc'.
>
> Yes, that's right. I suppose it makes sense to register memory we
> already have, but if we want new memory then trying to reinvent what
> happens inside cuMemAllocHost is pointless.

I've pushed to devel/omp/gcc-12 branch
commit 4bd844f3e0202b3d083f0784f4343570c88bb86c
"Attempt to not just register but allocate OpenMP pinned memory using a device",
see attached.


Grüße
 Thomas


-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  

Patch

From 4bd844f3e0202b3d083f0784f4343570c88bb86c Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Mon, 20 Feb 2023 14:44:43 +0100
Subject: [PATCH] Attempt to not just register but allocate OpenMP pinned
 memory using a device

... instead of 'mmap' plus attempting to register using a device.

Implemented for nvptx offloading via 'cuMemHostAlloc'.

This re-works og12 commit a5a4800e92773da7126c00a9c79b172494d58ab5
"Attempt to register OpenMP pinned memory using a device instead of 'mlock'".

	include/
	* cuda/cuda.h (cuMemHostRegister, cuMemHostUnregister): Remove.
	libgomp/
	* config/linux/allocator.c (linux_memspace_alloc): Add 'init0'
	formal parameter.  Adjust all users.
	(linux_memspace_alloc, linux_memspace_free): Attempt to allocate
	OpenMP pinned memory using a device instead of 'mmap' plus
	attempting to register using a device.
	* libgomp-plugin.h (GOMP_OFFLOAD_register_page_locked)
	(GOMP_OFFLOAD_unregister_page_locked): Remove.
	(GOMP_OFFLOAD_page_locked_host_alloc)
	(GOMP_OFFLOAD_page_locked_host_free): New.
	* libgomp.h (gomp_register_page_locked)
	(gomp_unregister_page_locked): Remove.
	(gomp_page_locked_host_alloc, gomp_page_locked_host_free): New.
	(struct gomp_device_descr): Remove 'register_page_locked_func',
	'unregister_page_locked_func'.  Add 'page_locked_host_alloc_func',
	'page_locked_host_free_func'.
	* plugin/cuda-lib.def (cuMemHostRegister_v2, cuMemHostRegister)
	(cuMemHostUnregister): Remove.
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_register_page_locked)
	(GOMP_OFFLOAD_unregister_page_locked): Remove.
	(GOMP_OFFLOAD_page_locked_host_alloc)
	(GOMP_OFFLOAD_page_locked_host_free): New.
	* target.c (gomp_register_page_locked)
	(gomp_unregister_page_locked): Remove.
	(gomp_page_locked_host_alloc, gomp_page_locked_host_free): Add.
	(gomp_load_plugin_for_device): Don't handle
	'register_page_locked', 'unregister_page_locked'.  Handle
	'page_locked_host_alloc', 'page_locked_host_free'.

Suggested-by: Andrew Stubbs <ams@codesourcery.com>
---
 include/cuda/cuda.h              |  3 --
 libgomp/config/linux/allocator.c | 85 ++++++++++++++++++--------------
 libgomp/libgomp-plugin.h         |  4 +-
 libgomp/libgomp.h                |  8 +--
 libgomp/plugin/cuda-lib.def      |  3 --
 libgomp/plugin/plugin-nvptx.c    | 33 +++++++------
 libgomp/target.c                 | 49 +++++++++---------
 7 files changed, 98 insertions(+), 87 deletions(-)

diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h
index b0c7636d318..062d394b95f 100644
--- a/include/cuda/cuda.h
+++ b/include/cuda/cuda.h
@@ -183,9 +183,6 @@  CUresult cuMemAlloc (CUdeviceptr *, size_t);
 CUresult cuMemAllocHost (void **, size_t);
 CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
 CUresult cuMemHostAlloc (void **, size_t, unsigned int);
-#define cuMemHostRegister cuMemHostRegister_v2
-CUresult cuMemHostRegister(void *, size_t, unsigned int);
-CUresult cuMemHostUnregister(void *);
 CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
 #define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
 CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c
index 81e64b268e9..3e1bd5a1285 100644
--- a/libgomp/config/linux/allocator.c
+++ b/libgomp/config/linux/allocator.c
@@ -25,8 +25,9 @@ 
 /* Implement malloc routines that can handle pinned memory on Linux.
 
    Given that pinned memory is typically used to help host <-> device memory
-   transfers, we attempt to register such using a device (really: libgomp
-   plugin), but fall back to mlock if no suitable device is available.
+   transfers, we attempt to allocate such memory using a device (really:
+   libgomp plugin), but fall back to mmap plus mlock if no suitable device is
+   available.
 
    It's possible to use mlock on any heap memory, but using munlock is
    problematic if there are multiple pinned allocations on the same page.
@@ -58,40 +59,36 @@  GOMP_enable_pinned_mode ()
     always_pinned_mode = true;
 }
 
-static int using_device_for_register_page_locked
+static int using_device_for_page_locked
   = /* uninitialized */ -1;
 
 static void *
-linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
+linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
+		      bool init0)
 {
-  gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d\n",
-	      __FUNCTION__, (unsigned long long) memspace, (unsigned long long) size, pin);
+  gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d, init0=%d\n",
+	      __FUNCTION__, (unsigned long long) memspace,
+	      (unsigned long long) size, pin, init0);
 
   /* Explicit pinning may not be required.  */
   pin = pin && !always_pinned_mode;
 
+  void *addr;
+
   if (memspace == ompx_unified_shared_mem_space)
-    {
-      return gomp_usm_alloc (size, GOMP_DEVICE_ICV);
-    }
+    addr = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
   else if (pin)
     {
-      /* 'mmap' zero-initializes, which 'linux_memspace_calloc' relies on.  */
-      void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
-			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-      if (addr == MAP_FAILED)
-	return NULL;
-
       int using_device
-	= __atomic_load_n (&using_device_for_register_page_locked,
+	= __atomic_load_n (&using_device_for_page_locked,
 			   MEMMODEL_RELAXED);
       gomp_debug (0, "  using_device=%d\n",
 		  using_device);
       if (using_device != 0)
 	{
-	  using_device = gomp_register_page_locked (addr, size);
+	  using_device = gomp_page_locked_host_alloc (&addr, size);
 	  int using_device_old
-	    = __atomic_exchange_n (&using_device_for_register_page_locked,
+	    = __atomic_exchange_n (&using_device_for_page_locked,
 				   using_device, MEMMODEL_RELAXED);
 	  gomp_debug (0, "  using_device=%d, using_device_old=%d\n",
 		      using_device, using_device_old);
@@ -101,19 +98,37 @@  linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
 	}
       if (using_device == 0)
 	{
-	  gomp_debug (0, "  mlock\n");
-	  if (mlock (addr, size))
+	  gomp_debug (0, "  mmap\n");
+	  addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	  if (addr == MAP_FAILED)
+	    addr = NULL;
+	  else
 	    {
-	      gomp_debug (0, "libgomp: failed to pin memory (ulimit too low?)\n");
-	      munmap (addr, size);
-	      return NULL;
+	      /* 'mmap' zero-initializes.  */
+	      init0 = false;
+
+	      gomp_debug (0, "  mlock\n");
+	      if (mlock (addr, size))
+		{
+		  gomp_debug (0, "libgomp: failed to pin memory"
+			      " (ulimit too low?)\n");
+		  munmap (addr, size);
+		  addr = NULL;
+		}
 	    }
 	}
-
-      return addr;
     }
   else
-    return malloc (size);
+    addr = malloc (size);
+
+  if (addr && init0)
+    {
+      gomp_debug (0, "  init0\n");
+      memset (addr, 0, size);
+    }
+
+  return addr;
 }
 
 static void *
@@ -132,8 +147,7 @@  linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
       return ret;
     }
   else if (pin)
-    /* If PINned, 'linux_memspace_alloc' 'mmap's, which zero-initializes.  */
-    return linux_memspace_alloc (memspace, size, pin);
+    return linux_memspace_alloc (memspace, size, pin, true);
   else
     return calloc (1, size);
 }
@@ -153,16 +167,15 @@  linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
   else if (pin)
     {
       int using_device
-	= __atomic_load_n (&using_device_for_register_page_locked,
+	= __atomic_load_n (&using_device_for_page_locked,
 			   MEMMODEL_RELAXED);
       gomp_debug (0, "  using_device=%d\n",
 		  using_device);
       if (using_device == 1)
-	gomp_unregister_page_locked (addr, size);
+	gomp_page_locked_host_free (addr);
       else
 	/* 'munlock'ing is implicit with following 'munmap'.  */
-	;
-      munmap (addr, size);
+	munmap (addr, size);
     }
   else
     free (addr);
@@ -183,9 +196,9 @@  linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
   else if (oldpin && pin)
     {
       /* We can only expect to be able to just 'mremap' if not using a device
-	 for registering page-locked memory.  */
+	 for page-locked memory.  */
       int using_device
-	= __atomic_load_n (&using_device_for_register_page_locked,
+	= __atomic_load_n (&using_device_for_page_locked,
 		       MEMMODEL_RELAXED);
       gomp_debug (0, "  using_device=%d\n",
 		  using_device);
@@ -205,7 +218,7 @@  linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
     return realloc (addr, size);
 
 manual_realloc:
-  void *newaddr = linux_memspace_alloc (memspace, size, pin);
+  void *newaddr = linux_memspace_alloc (memspace, size, pin, false);
   if (newaddr)
     {
       memcpy (newaddr, addr, oldsize < size ? oldsize : size);
@@ -216,7 +229,7 @@  manual_realloc:
 }
 
 #define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
-  linux_memspace_alloc (MEMSPACE, SIZE, PIN)
+  linux_memspace_alloc (MEMSPACE, SIZE, PIN, false)
 #define MEMSPACE_CALLOC(MEMSPACE, SIZE, PIN) \
   linux_memspace_calloc (MEMSPACE, SIZE, PIN)
 #define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN) \
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 345fc62d4f5..66d995f33e8 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -144,8 +144,8 @@  extern bool GOMP_OFFLOAD_free (int, void *);
 extern void *GOMP_OFFLOAD_usm_alloc (int, size_t);
 extern bool GOMP_OFFLOAD_usm_free (int, void *);
 extern bool GOMP_OFFLOAD_is_usm_ptr (void *);
-extern bool GOMP_OFFLOAD_register_page_locked (void *, size_t);
-extern bool GOMP_OFFLOAD_unregister_page_locked (void *, size_t);
+extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t);
+extern bool GOMP_OFFLOAD_page_locked_host_free (void *);
 extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index a5fa3f9daab..ba12d558465 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1133,8 +1133,8 @@  extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
 			     void *);
 extern void * gomp_usm_alloc (size_t size, int device_num);
 extern void gomp_usm_free (void *device_ptr, int device_num);
-extern bool gomp_register_page_locked (void *, size_t);
-extern void gomp_unregister_page_locked (void *, size_t);
+extern bool gomp_page_locked_host_alloc (void **, size_t);
+extern void gomp_page_locked_host_free (void *);
 
 /* Splay tree definitions.  */
 typedef struct splay_tree_node_s *splay_tree_node;
@@ -1394,8 +1394,8 @@  struct gomp_device_descr
   __typeof (GOMP_OFFLOAD_usm_alloc) *usm_alloc_func;
   __typeof (GOMP_OFFLOAD_usm_free) *usm_free_func;
   __typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func;
-  __typeof (GOMP_OFFLOAD_register_page_locked) *register_page_locked_func;
-  __typeof (GOMP_OFFLOAD_unregister_page_locked) *unregister_page_locked_func;
+  __typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func;
+  __typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
   __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
   __typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
   __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 8dbaadf848e..9b786c9f2f6 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -31,9 +31,6 @@  CUDA_ONE_CALL (cuMemAlloc)
 CUDA_ONE_CALL (cuMemAllocHost)
 CUDA_ONE_CALL (cuMemAllocManaged)
 CUDA_ONE_CALL (cuMemHostAlloc)
-CUDA_ONE_CALL_MAYBE_NULL (cuMemHostRegister_v2)
-CUDA_ONE_CALL (cuMemHostRegister)
-CUDA_ONE_CALL (cuMemHostUnregister)
 CUDA_ONE_CALL (cuMemcpy)
 CUDA_ONE_CALL (cuMemcpyDtoDAsync)
 CUDA_ONE_CALL (cuMemcpyDtoH)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 698317f37ac..a7896e4dabe 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -77,14 +77,11 @@  extern CUresult cuGetErrorString (CUresult, const char **);
 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
 			const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
-#undef cuMemHostRegister
-CUresult cuMemHostRegister (void *, size_t, unsigned int);
 #else
 typedef size_t (*CUoccupancyB2DSize)(int);
 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
 			   const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
-CUresult cuMemHostRegister_v2 (void *, size_t, unsigned int);
 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
 					  CUoccupancyB2DSize, size_t, int);
 #endif
@@ -1709,30 +1706,36 @@  GOMP_OFFLOAD_is_usm_ptr (void *ptr)
 
 
 bool
-GOMP_OFFLOAD_register_page_locked (void *ptr, size_t size)
+GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
 {
   GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
 		     __FUNCTION__, ptr, (unsigned long long) size);
 
+  CUresult r;
+
   unsigned int flags = 0;
   /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
-     'flags |= CU_MEMHOSTREGISTER_PORTABLE;' here.  */
-  if (CUDA_CALL_EXISTS (cuMemHostRegister_v2))
-    CUDA_CALL (cuMemHostRegister_v2, ptr, size, flags);
-  else
-    CUDA_CALL (cuMemHostRegister, ptr, size, flags);
-
+     'flags |= CU_MEMHOSTALLOC_PORTABLE;' here.  */
+  r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
+  if (r == CUDA_ERROR_OUT_OF_MEMORY)
+    *ptr = NULL;
+  else if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
+      return false;
+    }
+  GOMP_PLUGIN_debug (0, "  -> *ptr=%p\n",
+		     *ptr);
   return true;
 }
 
 bool
-GOMP_OFFLOAD_unregister_page_locked (void *ptr, size_t size)
+GOMP_OFFLOAD_page_locked_host_free (void *ptr)
 {
-  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
-		     __FUNCTION__, ptr, (unsigned long long) size);
-
-  CUDA_CALL (cuMemHostUnregister, ptr);
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p\n",
+		     __FUNCTION__, ptr);
 
+  CUDA_CALL (cuMemFreeHost, ptr);
   return true;
 }
 
diff --git a/libgomp/target.c b/libgomp/target.c
index e7285188d1e..24109f28ddc 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -4585,15 +4585,15 @@  gomp_usm_free (void *device_ptr, int device_num)
 }
 
 
-/* Device (really: libgomp plugin) for registering paged-locked memory.  We
+/* Device (really: libgomp plugin) to use for page-locked memory.  We
    assume there is either none or exactly one such device for the lifetime of
    the process.  */
 
-static struct gomp_device_descr *device_for_register_page_locked
+static struct gomp_device_descr *device_for_page_locked
   = /* uninitialized */ (void *) -1;
 
 static struct gomp_device_descr *
-get_device_for_register_page_locked (void)
+get_device_for_page_locked (void)
 {
   gomp_debug (0, "%s\n",
 	      __FUNCTION__);
@@ -4601,7 +4601,7 @@  get_device_for_register_page_locked (void)
   struct gomp_device_descr *device;
 #ifdef HAVE_SYNC_BUILTINS
   device
-    = __atomic_load_n (&device_for_register_page_locked, MEMMODEL_RELAXED);
+    = __atomic_load_n (&device_for_page_locked, MEMMODEL_RELAXED);
   if (device == (void *) -1)
     {
       gomp_debug (0, "  init\n");
@@ -4621,7 +4621,7 @@  get_device_for_register_page_locked (void)
 	  if (devices[i].target_id != 0)
 	    continue;
 
-	  if (!devices[i].register_page_locked_func)
+	  if (!devices[i].page_locked_host_alloc_func)
 	    continue;
 
 	  gomp_debug (0, "  found device: %p (%s)\n",
@@ -4629,16 +4629,16 @@  get_device_for_register_page_locked (void)
 	  if (device)
 	    gomp_fatal ("Unclear how %s and %s libgomp plugins may"
 			" simultaneously provide functionality"
-			" to register page-locked memory",
+			" for page-locked memory",
 			device->name, devices[i].name);
 	  else
 	    device = &devices[i];
 	}
 
       struct gomp_device_descr *device_old
-	= __atomic_exchange_n (&device_for_register_page_locked, device,
+	= __atomic_exchange_n (&device_for_page_locked, device,
 			       MEMMODEL_RELAXED);
-      gomp_debug (0, "  old device_for_register_page_locked: %p\n",
+      gomp_debug (0, "  old device_for_page_locked: %p\n",
 		  device_old);
       assert (device_old == (void *) -1
 	      /* We shouldn't have concurrently found a different or no
@@ -4647,7 +4647,7 @@  get_device_for_register_page_locked (void)
     }
 #else /* !HAVE_SYNC_BUILTINS */
   gomp_debug (0, "  not implemented for '!HAVE_SYNC_BUILTINS'\n");
-  (void) &device_for_register_page_locked;
+  (void) &device_for_page_locked;
   device = NULL;
 #endif /* HAVE_SYNC_BUILTINS */
 
@@ -4656,16 +4656,16 @@  get_device_for_register_page_locked (void)
   return device;
 }
 
-/* Register page-locked memory region.
+/* Allocate page-locked host memory.
    Returns whether we have a device capable of that.  */
 
 attribute_hidden bool
-gomp_register_page_locked (void *ptr, size_t size)
+gomp_page_locked_host_alloc (void **ptr, size_t size)
 {
   gomp_debug (0, "%s: ptr=%p, size=%llu\n",
 	      __FUNCTION__, ptr, (unsigned long long) size);
 
-  struct gomp_device_descr *device = get_device_for_register_page_locked ();
+  struct gomp_device_descr *device = get_device_for_page_locked ();
   gomp_debug (0, "  device=%p (%s)\n",
 	      device, device ? device->name : "[none]");
   if (device)
@@ -4676,29 +4676,30 @@  gomp_register_page_locked (void *ptr, size_t size)
       else if (device->state == GOMP_DEVICE_FINALIZED)
 	{
 	  gomp_mutex_unlock (&device->lock);
-	  gomp_fatal ("Device %s for registering page-locked memory"
-		      " is finalized", device->name);
+	  gomp_fatal ("Device %s used for for page-locked memory is finalized",
+		      device->name);
 	}
       gomp_mutex_unlock (&device->lock);
 
-      if (!device->register_page_locked_func (ptr, size))
-	gomp_fatal ("Failed to register page-locked memory"
+      if (!device->page_locked_host_alloc_func (ptr, size))
+	gomp_fatal ("Failed to allocate page-locked host memory"
 		    " via %s libgomp plugin",
 		    device->name);
     }
   return device != NULL;
 }
 
-/* Unregister page-locked memory region.
-   This must only be called if 'gomp_register_page_locked' returned 'true'.  */
+/* Free page-locked host memory.
+   This must only be called if 'gomp_page_locked_host_alloc' returned
+   'true'.  */
 
 attribute_hidden void
-gomp_unregister_page_locked (void *ptr, size_t size)
+gomp_page_locked_host_free (void *ptr)
 {
   gomp_debug (0, "%s: ptr=%p\n",
 	      __FUNCTION__, ptr);
 
-  struct gomp_device_descr *device = get_device_for_register_page_locked ();
+  struct gomp_device_descr *device = get_device_for_page_locked ();
   gomp_debug (0, "  device=%p (%s)\n",
 	      device, device ? device->name : "[none]");
   assert (device);
@@ -4712,8 +4713,8 @@  gomp_unregister_page_locked (void *ptr, size_t size)
     }
   gomp_mutex_unlock (&device->lock);
 
-  if (!device->unregister_page_locked_func (ptr, size))
-    gomp_fatal ("Failed to unregister page-locked memory"
+  if (!device->page_locked_host_free_func (ptr))
+    gomp_fatal ("Failed to free page-locked host memory"
 		" via %s libgomp plugin",
 		device->name);
 }
@@ -5403,8 +5404,8 @@  gomp_load_plugin_for_device (struct gomp_device_descr *device,
   DLSYM_OPT (usm_alloc, usm_alloc);
   DLSYM_OPT (usm_free, usm_free);
   DLSYM_OPT (is_usm_ptr, is_usm_ptr);
-  DLSYM_OPT (register_page_locked, register_page_locked);
-  DLSYM_OPT (unregister_page_locked, unregister_page_locked);
+  DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
+  DLSYM_OPT (page_locked_host_free, page_locked_host_free);
   DLSYM (dev2host);
   DLSYM (host2dev);
   DLSYM (evaluate_device);
-- 
2.25.1