[committed] amdgcn: Add gfx90a support

Message ID 4447d081-0c3f-8397-1b8e-7963895f400a@codesourcery.com
State Committed
Headers
Series [committed] amdgcn: Add gfx90a support |

Commit Message

Andrew Stubbs May 24, 2022, 3:31 p.m. UTC
  I've committed this patch to add support for gfx90a AMD GPU devices.

The patch updates all the places that have architecture/ISA specific 
code, tidies up the ISA naming and handling in the backend, and adds a 
new multilib.

This is just lightly tested at this point, but there are no known issues 
and it shouldn't break anything for other architectures.

Andrew
amdgcn: Add gfx90a support

This adds architecture options and multilibs for the AMD GFX90a GPUs.
It also tidies up some of the ISA selection code, and corrects a few small
mistakes in the gfx908 naming.

gcc/ChangeLog:

	* config.gcc (amdgcn): Accept --with-arch=gfx908 and gfx90a.
	* config/gcn/gcn-opts.h (enum gcn_isa): New.
	(TARGET_GCN3): Use enum gcn_isa.
	(TARGET_GCN3_PLUS): Likewise.
	(TARGET_GCN5): Likewise.
	(TARGET_GCN5_PLUS): Likewise.
	(TARGET_CDNA1): New.
	(TARGET_CDNA1_PLUS): New.
	(TARGET_CDNA2): New.
	(TARGET_CDNA2_PLUS): New.
	(TARGET_M0_LDS_LIMIT): New.
	(TARGET_PACKED_WORK_ITEMS): New.
	* config/gcn/gcn.cc (gcn_isa): Change to enum gcn_isa.
	(gcn_option_override): Recognise CDNA ISA variants.
	(gcn_omp_device_kind_arch_isa): Support gfx90a.
	(gcn_expand_prologue): Make m0 init optional.
	Add support for packed work items.
	(output_file_start): Support gfx90a.
	(gcn_hsa_declare_function_name): Support gfx90a metadata.
	* config/gcn/gcn.h (TARGET_CPU_CPP_BUILTINS): Add __CDNA1__ and
	__CDNA2__.
	* config/gcn/gcn.md (<su>mulsi3_highpart): Use TARGET_GCN5_PLUS.
	(<su>mulsi3_highpart_imm): Likewise.
	(<su>mulsidi3): Likewise.
	(<su>mulsidi3_imm): Likewise.
	* config/gcn/gcn.opt (gpu_type): Add gfx90a.
	* config/gcn/mkoffload.cc (EF_AMDGPU_MACH_AMDGCN_GFX90a): New.
	(main): Support gfx90a.
	* config/gcn/t-gcn-hsa: Add gfx90a multilib.
	* config/gcn/t-omp-device: Add gfx90a isa.

libgomp/ChangeLog:

	* plugin/plugin-gcn.c (EF_AMDGPU_MACH): Add
	EF_AMDGPU_MACH_AMDGCN_GFX90a.
	(gcn_gfx90a_s): New.
	(isa_hsa_name): Support gfx90a.
	(isa_code): Likewise.
  

Patch

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 600ac357366..cdbefb5b4f5 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -4522,7 +4522,7 @@  case "${target}" in
 		for which in arch tune; do
 			eval "val=\$with_$which"
 			case ${val} in
-			"" | fiji | gfx900 | gfx906 )
+			"" | fiji | gfx900 | gfx906 | gfx908 | gfx90a)
 				# OK
 				;;
 			*)
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h
index c0805241bc5..b62dfb45f59 100644
--- a/gcc/config/gcn/gcn-opts.h
+++ b/gcc/config/gcn/gcn-opts.h
@@ -23,16 +23,30 @@  enum processor_type
   PROCESSOR_FIJI,    // gfx803
   PROCESSOR_VEGA10,  // gfx900
   PROCESSOR_VEGA20,  // gfx906
-  PROCESSOR_GFX908   // as yet unnamed
+  PROCESSOR_GFX908,
+  PROCESSOR_GFX90a
 };
 
 /* Set in gcn_option_override.  */
-extern int gcn_isa;
-
-#define TARGET_GCN3 (gcn_isa == 3)
-#define TARGET_GCN3_PLUS (gcn_isa >= 3)
-#define TARGET_GCN5 (gcn_isa == 5)
-#define TARGET_GCN5_PLUS (gcn_isa >= 5)
+extern enum gcn_isa {
+  ISA_UNKNOWN,
+  ISA_GCN3,
+  ISA_GCN5,
+  ISA_CDNA1,
+  ISA_CDNA2
+} gcn_isa;
+
+#define TARGET_GCN3 (gcn_isa == ISA_GCN3)
+#define TARGET_GCN3_PLUS (gcn_isa >= ISA_GCN3)
+#define TARGET_GCN5 (gcn_isa == ISA_GCN5)
+#define TARGET_GCN5_PLUS (gcn_isa >= ISA_GCN5)
+#define TARGET_CDNA1 (gcn_isa == ISA_CDNA1)
+#define TARGET_CDNA1_PLUS (gcn_isa >= ISA_CDNA1)
+#define TARGET_CDNA2 (gcn_isa == ISA_CDNA2)
+#define TARGET_CDNA2_PLUS (gcn_isa >= ISA_CDNA2)
+
+#define TARGET_M0_LDS_LIMIT (TARGET_GCN3)
+#define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS)
 
 enum sram_ecc_type
 {
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 39a7a966502..5e75a1b63aa 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -66,7 +66,7 @@  static bool ext_gcn_constants_init = 0;
 
 /* Holds the ISA variant, derived from the command line parameters.  */
 
-int gcn_isa = 3;		/* Default to GCN3.  */
+enum gcn_isa gcn_isa = ISA_GCN3;	/* Default to GCN3.  */
 
 /* Reserve this much space for LDS (for propagating variables from
    worker-single mode to worker-partitioned mode), per workgroup.  Global
@@ -129,7 +129,13 @@  gcn_option_override (void)
   if (!flag_pic)
     flag_pic = flag_pie;
 
-  gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5;
+  gcn_isa = (gcn_arch == PROCESSOR_FIJI ? ISA_GCN3
+      : gcn_arch == PROCESSOR_VEGA10 ? ISA_GCN5
+      : gcn_arch == PROCESSOR_VEGA20 ? ISA_GCN5
+      : gcn_arch == PROCESSOR_GFX908 ? ISA_CDNA1
+      : gcn_arch == PROCESSOR_GFX90a ? ISA_CDNA2
+      : ISA_UNKNOWN);
+  gcc_assert (gcn_isa != ISA_UNKNOWN);
 
   /* The default stack size needs to be small for offload kernels because
      there may be many, many threads.  Also, a smaller stack gives a
@@ -2642,6 +2648,8 @@  gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
 	return gcn_arch == PROCESSOR_VEGA20;
       if (strcmp (name, "gfx908") == 0)
 	return gcn_arch == PROCESSOR_GFX908;
+      if (strcmp (name, "gfx90a") == 0)
+	return gcn_arch == PROCESSOR_GFX90a;
       return 0;
     default:
       gcc_unreachable ();
@@ -3081,13 +3089,35 @@  gcn_expand_prologue ()
   /* Ensure that the scheduler doesn't do anything unexpected.  */
   emit_insn (gen_blockage ());
 
-  /* m0 is initialized for the usual LDS DS and FLAT memory case.
-     The low-part is the address of the topmost addressable byte, which is
-     size-1.  The high-part is an offset and should be zero.  */
-  emit_move_insn (gen_rtx_REG (SImode, M0_REG),
-		  gen_int_mode (LDS_SIZE, SImode));
+  if (TARGET_M0_LDS_LIMIT)
+  {
+    /* m0 is initialized for the usual LDS DS and FLAT memory case.
+       The low-part is the address of the topmost addressable byte, which is
+       size-1.  The high-part is an offset and should be zero.  */
+    emit_move_insn (gen_rtx_REG (SImode, M0_REG),
+	gen_int_mode (LDS_SIZE, SImode));
+
+    emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
+  }
 
-  emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
+  if (TARGET_PACKED_WORK_ITEMS
+      && cfun && cfun->machine && !cfun->machine->normal_function)
+  {
+    /* v0 contains the X, Y and Z dimensions all in one.
+       Expand them out for ABI compatibility.  */
+    /* TODO: implement and use zero_extract.  */
+    rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
+    emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
+	       gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10)));
+    emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10)));
+    emit_insn (gen_prologue_use (v1));
+
+    rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2));
+    emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)),
+	       gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20)));
+    emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20)));
+    emit_insn (gen_prologue_use (v2));
+  }
 
   if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp)
     {
@@ -5243,6 +5273,9 @@  output_file_start (void)
     case PROCESSOR_GFX908:
       cpu = "gfx908";
       break;
+    case PROCESSOR_GFX90a:
+      cpu = "gfx90a";
+      break;
     default: gcc_unreachable ();
     }
 
@@ -5296,6 +5329,10 @@  gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
 	sgpr = MAX_NORMAL_SGPR_COUNT;
     }
 
+  /* The gfx90a accum_offset field can't represent 0 registers.  */
+  if (gcn_arch == PROCESSOR_GFX90a && vgpr < 4)
+    vgpr = 4;
+
   fputs ("\t.rodata\n"
 	 "\t.p2align\t6\n"
 	 "\t.amdhsa_kernel\t", file);
@@ -5364,6 +5401,11 @@  gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
 	      one 64th the wave-front stack size.  */
 	   stack_size_opt / 64,
 	   LDS_SIZE);
+  if (gcn_arch == PROCESSOR_GFX90a)
+    fprintf (file,
+	     "\t  .amdhsa_accum_offset\t%i\n"
+	     "\t  .amdhsa_tg_split\t0\n",
+	     (vgpr+3)&~3); // I think this means the AGPRs come after the VGPRs
   fputs ("\t.end_amdhsa_kernel\n", file);
 
 #if 1
@@ -5392,6 +5434,8 @@  gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
 	   LDS_SIZE,
 	   stack_size_opt / 64,
 	   sgpr, vgpr);
+  if (gcn_arch == PROCESSOR_GFX90a)
+    fprintf (file, "            .agpr_count: 0\n"); // AGPRs are not used, yet
   fputs ("        .end_amdgpu_metadata\n", file);
 #endif
 
diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h
index 9ae8919f5f8..a1297605047 100644
--- a/gcc/config/gcn/gcn.h
+++ b/gcc/config/gcn/gcn.h
@@ -24,6 +24,10 @@ 
 	builtin_define ("__GCN3__");	\
       else if (TARGET_GCN5)		\
 	builtin_define ("__GCN5__");	\
+      else if (TARGET_CDNA1)		\
+	builtin_define ("__CDNA1__");	\
+      else if (TARGET_CDNA2)		\
+	builtin_define ("__CDNA2__");	\
     }					\
   while(0)
 
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 21a74764a27..53e846e15d1 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -1410,7 +1410,7 @@  (define_expand "<su>mulsi3_highpart"
   ""
 {
   if (can_create_pseudo_p ()
-      && !TARGET_GCN5
+      && !TARGET_GCN5_PLUS
       && !gcn_inline_immediate_operand (operands[2], SImode))
     operands[2] = force_reg (SImode, operands[2]);
 
@@ -1451,7 +1451,7 @@  (define_insn "<su>mulsi3_highpart_imm"
 		(match_operand:SI 1 "register_operand"         "Sg,Sg,v"))
 	      (match_operand:DI 2 "gcn_32bit_immediate_operand" "A, B,A"))
 	    (const_int 32))))]
-  "TARGET_GCN5 || gcn_inline_immediate_operand (operands[2], SImode)"
+  "TARGET_GCN5_PLUS || gcn_inline_immediate_operand (operands[2], SImode)"
   "@
   s_mul_hi<sgnsuffix>0\t%0, %1, %2
   s_mul_hi<sgnsuffix>0\t%0, %1, %2
@@ -1469,7 +1469,7 @@  (define_expand "<su>mulsidi3"
   ""
 {
   if (can_create_pseudo_p ()
-      && !TARGET_GCN5
+      && !TARGET_GCN5_PLUS
       && !gcn_inline_immediate_operand (operands[2], SImode))
     operands[2] = force_reg (SImode, operands[2]);
 
@@ -1506,7 +1506,7 @@  (define_insn_and_split "<su>mulsidi3_imm"
 		   (match_operand:SI 1 "register_operand"       "Sg, Sg, v"))
 		 (match_operand:DI 2 "gcn_32bit_immediate_operand"
 								 "A,  B, A")))]
-  "TARGET_GCN5 || gcn_inline_immediate_operand (operands[2], SImode)"
+  "TARGET_GCN5_PLUS || gcn_inline_immediate_operand (operands[2], SImode)"
   "#"
   "&& reload_completed"
   [(const_int 0)]
diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt
index 54da11f4bd9..9606aaf0b1a 100644
--- a/gcc/config/gcn/gcn.opt
+++ b/gcc/config/gcn/gcn.opt
@@ -37,6 +37,9 @@  Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA20)
 EnumValue
 Enum(gpu_type) String(gfx908) Value(PROCESSOR_GFX908)
 
+EnumValue
+Enum(gpu_type) String(gfx90a) Value(PROCESSOR_GFX90a)
+
 march=
 Target RejectNegative Joined ToLower Enum(gpu_type) Var(gcn_arch) Init(PROCESSOR_FIJI)
 Specify the name of the target GPU.
diff --git a/gcc/config/gcn/mkoffload.cc b/gcc/config/gcn/mkoffload.cc
index e98277c412b..ed93ae844e4 100644
--- a/gcc/config/gcn/mkoffload.cc
+++ b/gcc/config/gcn/mkoffload.cc
@@ -55,6 +55,8 @@ 
 #define EF_AMDGPU_MACH_AMDGCN_GFX906 0x2f
 #undef  EF_AMDGPU_MACH_AMDGCN_GFX908
 #define EF_AMDGPU_MACH_AMDGCN_GFX908 0x30
+#undef  EF_AMDGPU_MACH_AMDGCN_GFX90a
+#define EF_AMDGPU_MACH_AMDGCN_GFX90a 0x3f
 
 #define EF_AMDGPU_FEATURE_XNACK_V4	0x300  /* Mask.  */
 #define EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4	0x000
@@ -904,6 +906,8 @@  main (int argc, char **argv)
 	elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX906;
       else if (strcmp (argv[i], "-march=gfx908") == 0)
 	elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX908;
+      else if (strcmp (argv[i], "-march=gfx90a") == 0)
+	elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX90a;
     }
 
   if (!(fopenacc ^ fopenmp))
diff --git a/gcc/config/gcn/t-gcn-hsa b/gcc/config/gcn/t-gcn-hsa
index 10e31f3d266..9e03ec89ec2 100644
--- a/gcc/config/gcn/t-gcn-hsa
+++ b/gcc/config/gcn/t-gcn-hsa
@@ -42,8 +42,8 @@  ALL_HOST_OBJS += gcn-run.o
 gcn-run$(exeext): gcn-run.o
 	+$(LINKER) $(ALL_LINKERFLAGS) $(LDFLAGS) -o $@ $< -ldl
 
-MULTILIB_OPTIONS = march=gfx900/march=gfx906/march=gfx908
-MULTILIB_DIRNAMES = gfx900 gfx906 gfx908
+MULTILIB_OPTIONS = march=gfx900/march=gfx906/march=gfx908/march=gfx90a
+MULTILIB_DIRNAMES = gfx900 gfx906 gfx908 gfx90a
 
 gcn-tree.o: $(srcdir)/config/gcn/gcn-tree.cc
 	$(COMPILE) $<
diff --git a/gcc/config/gcn/t-omp-device b/gcc/config/gcn/t-omp-device
index e1d9e0d2a1e..27d36db894b 100644
--- a/gcc/config/gcn/t-omp-device
+++ b/gcc/config/gcn/t-omp-device
@@ -1,4 +1,4 @@ 
 omp-device-properties-gcn: $(srcdir)/config/gcn/gcn.cc
 	echo kind: gpu > $@
 	echo arch: amdgcn gcn >> $@
-	echo isa: fiji gfx900 gfx906 gfx908 >> $@
+	echo isa: fiji gfx900 gfx906 gfx908 gfx90a >> $@
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 2b32f5352c8..1c0436842da 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -402,7 +402,8 @@  typedef enum {
   EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
   EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
   EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
-  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030
+  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
+  EF_AMDGPU_MACH_AMDGCN_GFX90a = 0x03f
 } EF_AMDGPU_MACH;
 
 const static int EF_AMDGPU_MACH_MASK = 0x000000ff;
@@ -1628,6 +1629,7 @@  const static char *gcn_gfx803_s = "gfx803";
 const static char *gcn_gfx900_s = "gfx900";
 const static char *gcn_gfx906_s = "gfx906";
 const static char *gcn_gfx908_s = "gfx908";
+const static char *gcn_gfx90a_s = "gfx90a";
 const static int gcn_isa_name_len = 6;
 
 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
@@ -1645,6 +1647,8 @@  isa_hsa_name (int isa) {
       return gcn_gfx906_s;
     case EF_AMDGPU_MACH_AMDGCN_GFX908:
       return gcn_gfx908_s;
+    case EF_AMDGPU_MACH_AMDGCN_GFX90a:
+      return gcn_gfx90a_s;
     }
   return NULL;
 }
@@ -1681,6 +1685,9 @@  isa_code(const char *isa) {
   if (!strncmp (isa, gcn_gfx908_s, gcn_isa_name_len))
     return EF_AMDGPU_MACH_AMDGCN_GFX908;
 
+  if (!strncmp (isa, gcn_gfx90a_s, gcn_isa_name_len))
+    return EF_AMDGPU_MACH_AMDGCN_GFX90a;
+
   return -1;
 }