[v3,3/3] amdgcn, libgomp: low-latency allocator

Message ID 20231203003224.1638841-4-ams@codesourcery.com
State Superseded
Headers
Series libgomp: OpenMP low-latency omp_alloc |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm fail Testing failed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 fail Testing failed

Commit Message

Andrew Stubbs Dec. 3, 2023, 12:32 a.m. UTC
  This implements the OpenMP low-latency memory allocator for AMD GCN using the
small per-team LDS memory (Local Data Store).

Since addresses can now refer to LDS space, the "Global" address space is
no-longer compatible.  This patch therefore switches the backend to use
entirely "Flat" addressing (which supports both memories).  A future patch
will re-enable "global" instructions for cases where it is known to be safe
to do so.

gcc/ChangeLog:

	* config/gcn/gcn-builtins.def (DISPATCH_PTR): New built-in.
	* config/gcn/gcn.cc (gcn_init_machine_status): Disable global
	addressing.
	(gcn_expand_builtin_1): Implement GCN_BUILTIN_DISPATCH_PTR.

libgomp/ChangeLog:

	* config/gcn/libgomp-gcn.h (TEAM_ARENA_START): Move to here.
	(TEAM_ARENA_FREE): Likewise.
	(TEAM_ARENA_END): Likewise.
	(GCN_LOWLAT_HEAP): New.
	* config/gcn/team.c (LITTLEENDIAN_CPU): New, and import hsa.h.
	(__gcn_lowlat_init): New prototype.
	(gomp_gcn_enter_kernel): Initialize the low-latency heap.
	* libgomp.h (TEAM_ARENA_START): Move to libgomp.h.
	(TEAM_ARENA_FREE): Likewise.
	(TEAM_ARENA_END): Likewise.
	* plugin/plugin-gcn.c (lowlat_size): New variable.
	(print_kernel_dispatch): Label the group_segment_size purpose.
	(init_environment_variables): Read GOMP_GCN_LOWLAT_POOL.
	(create_kernel_dispatch): Pass low-latency head allocation to kernel.
	(run_kernel): Use shadow; don't assume values.
	* testsuite/libgomp.c/omp_alloc-traits.c: Enable for amdgcn.
	* config/gcn/allocator.c: New file.
	* libgomp.texi: Document low-latency implementation details.
---
 gcc/config/gcn/gcn-builtins.def               |   2 +
 gcc/config/gcn/gcn.cc                         |  16 ++-
 libgomp/config/gcn/allocator.c                | 127 ++++++++++++++++++
 libgomp/config/gcn/libgomp-gcn.h              |   6 +
 libgomp/config/gcn/team.c                     |  12 ++
 libgomp/libgomp.h                             |   3 -
 libgomp/libgomp.texi                          |  13 ++
 libgomp/plugin/plugin-gcn.c                   |  35 ++++-
 .../testsuite/libgomp.c/omp_alloc-traits.c    |   2 +-
 9 files changed, 205 insertions(+), 11 deletions(-)
 create mode 100644 libgomp/config/gcn/allocator.c
  

Patch

diff --git a/gcc/config/gcn/gcn-builtins.def b/gcc/config/gcn/gcn-builtins.def
index 636a8e7a1a9..471457d7c23 100644
--- a/gcc/config/gcn/gcn-builtins.def
+++ b/gcc/config/gcn/gcn-builtins.def
@@ -164,6 +164,8 @@  DEF_BUILTIN (FIRST_CALL_THIS_THREAD_P, -1, "first_call_this_thread_p", B_INSN,
 	     _A1 (GCN_BTI_BOOL), gcn_expand_builtin_1)
 DEF_BUILTIN (KERNARG_PTR, -1, "kernarg_ptr", B_INSN, _A1 (GCN_BTI_VOIDPTR),
 	     gcn_expand_builtin_1)
+DEF_BUILTIN (DISPATCH_PTR, -1, "dispatch_ptr", B_INSN, _A1 (GCN_BTI_VOIDPTR),
+	     gcn_expand_builtin_1)
 DEF_BUILTIN (GET_STACK_LIMIT, -1, "get_stack_limit", B_INSN,
 	     _A1 (GCN_BTI_VOIDPTR), gcn_expand_builtin_1)
 
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 22d2b6ebf6d..d70238820dd 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -110,7 +110,8 @@  gcn_init_machine_status (void)
 
   f = ggc_cleared_alloc<machine_function> ();
 
-  if (TARGET_GCN3)
+  // FIXME: re-enable global addressing with safety for LDS-flat addresses
+  //if (TARGET_GCN3)
     f->use_flat_addressing = true;
 
   return f;
@@ -4881,6 +4882,19 @@  gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
 	  }
 	return ptr;
       }
+    case GCN_BUILTIN_DISPATCH_PTR:
+      {
+	rtx ptr;
+	if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0)
+	   ptr = gen_rtx_REG (DImode,
+			      cfun->machine->args.reg[DISPATCH_PTR_ARG]);
+	else
+	  {
+	    ptr = gen_reg_rtx (DImode);
+	    emit_move_insn (ptr, const0_rtx);
+	  }
+	return ptr;
+      }
     case GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P:
       {
 	/* Stash a marker in the unused upper 16 bits of s[0:1] to indicate
diff --git a/libgomp/config/gcn/allocator.c b/libgomp/config/gcn/allocator.c
new file mode 100644
index 00000000000..e9a95d683f9
--- /dev/null
+++ b/libgomp/config/gcn/allocator.c
@@ -0,0 +1,127 @@ 
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The low-latency allocators use space reserved in LDS memory when the
+   kernel is launched.  The heap is initialized in gomp_gcn_enter_kernel and
+   all allocations are forgotten when the kernel exits.  Allocations to other
+   memory spaces all use the system malloc syscall.
+
+   The pointers returned are 64-bit "Flat" addresses indistinguishable from
+   regular pointers, but only compatible with the "flat_load/store"
+   instructions.  The compiler has been coded to assign default address
+   spaces accordingly.
+
+   LDS memory is not visible to other teams, and therefore may only be used
+   when the memspace access trait is set accordingly.  */
+
+#include "libgomp.h"
+#include <stdlib.h>
+
+#define BASIC_ALLOC_PREFIX __gcn_lowlat
+#define BASIC_ALLOC_YIELD asm ("s_sleep 1" ::: "memory")
+#include "../../basic-allocator.c"
+
+/* The low-latency heap is located in LDS memory, but we need the __flat
+   address space for compatibility reasons.  */
+#define FLAT_HEAP_PTR \
+  ((void *) (uintptr_t) (void __flat *) (void __lds *) GCN_LOWLAT_HEAP)
+
+static void *
+gcn_memspace_alloc (omp_memspace_handle_t memspace, size_t size)
+{
+  if (memspace == omp_low_lat_mem_space)
+    {
+      char *shared_pool = FLAT_HEAP_PTR;
+
+      return __gcn_lowlat_alloc (shared_pool, size);
+    }
+  else
+    return malloc (size);
+}
+
+static void *
+gcn_memspace_calloc (omp_memspace_handle_t memspace, size_t size)
+{
+  if (memspace == omp_low_lat_mem_space)
+    {
+      char *shared_pool = FLAT_HEAP_PTR;
+
+      return __gcn_lowlat_calloc (shared_pool, size);
+    }
+  else
+    return calloc (1, size);
+}
+
+static void
+gcn_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size)
+{
+  if (memspace == omp_low_lat_mem_space)
+    {
+      char *shared_pool = FLAT_HEAP_PTR;
+
+      __gcn_lowlat_free (shared_pool, addr, size);
+    }
+  else
+    free (addr);
+}
+
+static void *
+gcn_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
+			size_t oldsize, size_t size)
+{
+  if (memspace == omp_low_lat_mem_space)
+    {
+      char *shared_pool = FLAT_HEAP_PTR;
+
+      return __gcn_lowlat_realloc (shared_pool, addr, oldsize, size);
+    }
+  else
+    return realloc (addr, size);
+}
+
+static inline int
+gcn_memspace_validate (omp_memspace_handle_t memspace, unsigned access)
+{
+  /* Disallow use of low-latency memory when it must be accessible by
+     all threads.  */
+  return (memspace != omp_low_lat_mem_space
+	  || access != omp_atv_all);
+}
+
+#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
+  gcn_memspace_alloc (MEMSPACE, SIZE)
+#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
+  gcn_memspace_calloc (MEMSPACE, SIZE)
+#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE) \
+  gcn_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE)
+#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
+  gcn_memspace_free (MEMSPACE, ADDR, SIZE)
+#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \
+  gcn_memspace_validate (MEMSPACE, ACCESS)
+
+/* The default low-latency memspace implies omp_atv_all, which is incompatible
+   with the LDS memory space.  */
+#define OMP_LOW_LAT_MEM_ALLOC_INVALID 1
+
+#include "../../allocator.c"
diff --git a/libgomp/config/gcn/libgomp-gcn.h b/libgomp/config/gcn/libgomp-gcn.h
index f62b7dde0e7..05b6fb60cc9 100644
--- a/libgomp/config/gcn/libgomp-gcn.h
+++ b/libgomp/config/gcn/libgomp-gcn.h
@@ -33,6 +33,12 @@ 
 #define DEFAULT_GCN_STACK_SIZE (32*1024)
 #define DEFAULT_TEAM_ARENA_SIZE (64*1024)
 
+/* These define the LDS location of data needed by OpenMP.  */
+#define TEAM_ARENA_START 16  /* LDS offset of free pointer.  */
+#define TEAM_ARENA_FREE  24  /* LDS offset of free pointer.  */
+#define TEAM_ARENA_END   32  /* LDS offset of end pointer.  */
+#define GCN_LOWLAT_HEAP  40  /* LDS offset of the OpenMP low-latency heap.  */
+
 struct heap
 {
   int64_t size;
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index fb20cbbcf9f..7ee6115b666 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -29,6 +29,12 @@ 
 #include <stdlib.h>
 #include <string.h>
 
+#define LITTLEENDIAN_CPU
+#include "hsa.h"
+
+/* Defined in basic-allocator.c via config/amdgcn/allocator.c.  */
+void __gcn_lowlat_init (void *heap, size_t size);
+
 static void gomp_thread_start (struct gomp_thread_pool *);
 extern void build_indirect_map (void);
 
@@ -75,6 +81,12 @@  gomp_gcn_enter_kernel (void)
       *arena_free = team_arena;
       *arena_end = team_arena + kernargs->arena_size_per_team;
 
+      /* Initialize the low-latency heap.  The header is the size.  */
+      void __lds *lowlat = (void __lds *)GCN_LOWLAT_HEAP;
+      hsa_kernel_dispatch_packet_t *queue_ptr = __builtin_gcn_dispatch_ptr ();
+      __gcn_lowlat_init ((void*)(uintptr_t)(void __flat*)lowlat,
+			 queue_ptr->group_segment_size - GCN_LOWLAT_HEAP);
+
       /* Allocate and initialize the team-local-storage data.  */
       struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs)
 						      * numthreads);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 15a767cf317..fa29f428976 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -114,9 +114,6 @@  extern void gomp_aligned_free (void *);
 #ifdef __AMDGCN__
 #include "libgomp-gcn.h"
 /* The arena is initialized in config/gcn/team.c.  */
-#define TEAM_ARENA_START 16  /* LDS offset of free pointer.  */
-#define TEAM_ARENA_FREE  24  /* LDS offset of free pointer.  */
-#define TEAM_ARENA_END   32  /* LDS offset of end pointer.  */
 
 static inline void * __attribute__((malloc))
 team_malloc (size_t size)
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 7fdd6fe9410..9d0aee72b33 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -5838,6 +5838,19 @@  The implementation remark:
       available devices (``host fallback'').
 @item The available stack size can be changed using the @code{GCN_STACK_SIZE}
       environment variable; the default is 32 kiB per thread.
+@item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when the
+      the @code{access} trait is set to @code{cgroup}.  The default pool size
+      is automatically scaled to share the 64 kiB LDS memory between the number
+      of teams configured to run on each compute-unit, but may be adjusted at
+      runtime by setting environment variable
+      @code{GOMP_GCN_LOWLAT_POOL=@var{bytes}}.
+@item @code{omp_low_lat_mem_alloc} cannot be used with true low-latency memory
+      because the definition implies the @code{omp_atv_all} trait; main
+      graphics memory is used instead.
+@item @code{omp_cgroup_mem_alloc}, @code{omp_pteam_mem_alloc}, and
+      @code{omp_thread_mem_alloc}, all use low-latency memory as first
+      preference, and fall back to main graphics memory when the low-latency
+      pool is exhausted.
 @end itemize
 
 
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 8aabbd99881..7f8178c78b7 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -550,6 +550,7 @@  static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
 
 static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE;
 static int stack_size = DEFAULT_GCN_STACK_SIZE;
+static int lowlat_size = -1;
 
 /* Flag to decide whether print to stderr information about what is going on.
    Set in init_debug depending on environment variables.  */
@@ -1016,8 +1017,8 @@  print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
   fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
   fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
 	   dispatch->private_segment_size);
-  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
-	   dispatch->group_segment_size);
+  fprintf (stderr, "%*sgroup_segment_size: %u (low-latency pool)\n", indent,
+	   "", dispatch->group_segment_size);
   fprintf (stderr, "\n");
 }
 
@@ -1088,6 +1089,10 @@  init_environment_variables (void)
       if (tmp)
 	stack_size = tmp;;
     }
+
+  const char *lowlat = secure_getenv ("GOMP_GCN_LOWLAT_POOL");
+  if (lowlat)
+    lowlat_size = atoi (lowlat);
 }
 
 /* Return malloc'd string with name of SYMBOL.  */
@@ -1930,7 +1935,25 @@  create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
 
   shadow->signal = sync_signal.handle;
   shadow->private_segment_size = kernel->private_segment_size;
-  shadow->group_segment_size = kernel->group_segment_size;
+
+  if (lowlat_size < 0)
+    {
+      /* Divide the LDS between the number of running teams.
+	 Allocate not less than is defined in the kernel metadata.  */
+      int teams_per_cu = num_teams / get_cu_count (agent);
+      int LDS_per_team = (teams_per_cu ? 65536 / teams_per_cu : 65536);
+      shadow->group_segment_size
+	= (kernel->group_segment_size > LDS_per_team
+	   ? kernel->group_segment_size
+	   : LDS_per_team);;
+    }
+  else if (lowlat_size < GCN_LOWLAT_HEAP+8)
+    /* Ensure that there's space for the OpenMP libgomp data.  */
+    shadow->group_segment_size = GCN_LOWLAT_HEAP+8;
+  else
+    shadow->group_segment_size = (lowlat_size > 65536
+				  ? 65536
+				  : lowlat_size);
 
   /* We expect kernels to request a single pointer, explicitly, and the
      rest of struct kernargs, implicitly.  If they request anything else
@@ -2290,9 +2313,9 @@  run_kernel (struct kernel_info *kernel, void *vars,
       print_kernel_dispatch (shadow, 2);
     }
 
-  packet->private_segment_size = kernel->private_segment_size;
-  packet->group_segment_size = kernel->group_segment_size;
-  packet->kernel_object = kernel->object;
+  packet->private_segment_size = shadow->private_segment_size;
+  packet->group_segment_size = shadow->group_segment_size;
+  packet->kernel_object = shadow->object;
   packet->kernarg_address = shadow->kernarg_address;
   hsa_signal_t s;
   s.handle = shadow->signal;
diff --git a/libgomp/testsuite/libgomp.c/omp_alloc-traits.c b/libgomp/testsuite/libgomp.c/omp_alloc-traits.c
index 4ff0fca4986..e9acc8673a3 100644
--- a/libgomp/testsuite/libgomp.c/omp_alloc-traits.c
+++ b/libgomp/testsuite/libgomp.c/omp_alloc-traits.c
@@ -1,7 +1,7 @@ 
 /* { dg-do run } */
 
 /* { dg-require-effective-target offload_device } */
-/* { dg-xfail-if "not implemented" { ! offload_target_nvptx } } */
+/* { dg-xfail-if "not implemented" { ! { offload_target_nvptx || offload_target_amdgcn } } } */
 
 /* Test that GPU low-latency allocation is limited to team access.  */