[3/8,RFC] Tie the new atomic builtins to the backend

Message ID 20240919131204.3865854-4-mmalcomson@nvidia.com
State New
Series Introduce floating point fetch_add builtins

Commit Message

Matthew Malcomson Sept. 19, 2024, 1:11 p.m. UTC
  From: Matthew Malcomson <mmalcomson@nvidia.com>

Need to implement something in the middle-end to tie the new floating
point atomic builtins to the backend.

Things implemented in this patch:
1) Update the optabs definitions to include floating point versions of
   atomic fetch_add variants.
2) When expanding into a CAS loop in RTL because the floating point
   optab is not implemented, there are now two different modes.  One is
   the integral mode in which the atomic CAS (and load) should be
   performed, and one is the floating point mode in which the operation
   should be performed.
   - Extra handling of modes etc. in `expand_atomic_fetch_op` (see the
     sketch below).
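
   As a rough illustration, for a 4-byte float the fallback expansion
   behaves like the C sketch below.  This is only a sketch of the
   behaviour of the generated RTL, not code from this patch, and the
   function and variable names are made up.  The atomic load and
   compare-and-swap happen in the integral mode while the addition
   happens in the floating point mode.

     #include <stdatomic.h>
     #include <stdint.h>
     #include <string.h>

     /* Behaves like __atomic_fetch_add on a 4-byte float when no
        floating point optab is available.  */
     static float
     fallback_fetch_add_float (_Atomic uint32_t *mem, float val)
     {
       uint32_t expected = atomic_load (mem);  /* integral atomic load */
       for (;;)
         {
           float old_f, new_f;
           uint32_t desired;
           memcpy (&old_f, &expected, sizeof old_f);   /* bits -> float */
           new_f = old_f + val;                        /* FP addition */
           memcpy (&desired, &new_f, sizeof desired);  /* float -> bits */
           if (atomic_compare_exchange_weak (mem, &expected, desired))
             return old_f;  /* fetch_add returns the old value */
         }
     }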

Things to highlight to any reviewer:
1) Needed another mapping from builtin to mode.
   This is *almost* shared code between this and the frontend.  Looks
   like this could be shared if I put some effort into it.
2) I do not always expand into the "modify before" (fetch_<op>) version,
   but also use the "modify after" (<op>_fetch) version when unable to
   inline (see the sketch below).
   - From looking at the dates on different parts of the code, it seems
     that this translation used to be needed before libatomic was added
     as a target library.  Since libatomic currently implements both the
     fetch_<op> and <op>_fetch versions I don't believe it's needed any
     more.
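
   For reference, the translation that the integral versions make relies
   on being able to redo the operation on the returned value.  Roughly
   (illustrative C only, not code from this patch; in the compiler this
   corresponds to the ext_call handling in
   expand_builtin_atomic_fetch_op):

     /* An integral __atomic_add_fetch can be lowered onto the
        __atomic_fetch_add library call by re-applying the addition to
        the returned (old) value.  */
     static int
     add_fetch_via_fetch_add (int *p, int v, int memorder)
     {
       int old = __atomic_fetch_add (p, v, memorder);
       return old + v;  /* the value the atomic operation stored */
     }
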
3) I use `extract_bit_field` to convert between representations when
   expanding as a fallback (because the fallback CAS loop loads into an
   integral register and we want to reinterpret those bits as a floating
   point value before the intermediate operation -- see the sketch
   below).
   - Not sure if there's a better way I don't know about.
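
   The conversion wanted there is a bitwise reinterpretation rather than
   a value conversion.  At the source level it corresponds to the
   memcpy-style bit cast in the sketch below (illustrative C only, names
   made up), not to a (float) cast of the integer value:

     #include <stdint.h>
     #include <string.h>

     static float
     bits_as_float (uint32_t bits)
     {
       /* Reinterpret the bits; a (float) cast of BITS would instead be
          a numeric int->float conversion, which is not what the CAS
          loop needs.  */
       float f;
       memcpy (&f, &bits, sizeof f);
       return f;
     }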

Other than that, everything mostly follows straightforwardly from what
is already done.

Signed-off-by: Matthew Malcomson <mmalcomson@nvidia.com>
---
 gcc/builtins.cc | 153 +++++++++++++++++++++++++++++++++++++++++++++---
 gcc/optabs.cc   | 101 ++++++++++++++++++++++++++++----
 gcc/optabs.def  |   6 +-
 gcc/optabs.h    |   2 +-
 4 files changed, 241 insertions(+), 21 deletions(-)
  

Patch

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 0b902896ddd..0ffd7d0b211 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -6394,6 +6394,46 @@  get_builtin_sync_mode (int fcode_diff)
   return int_mode_for_size (BITS_PER_UNIT << fcode_diff, 0).require ();
 }
 
+/* Reconstitute the machine modes relevant for this builtin operation from the
+   builtin difference from the _N version of a fetch_add atomic.
+
+   Only works for floating point atomic builtins.
+   FCODE_DIFF should be fcode - base, where base is the FOO_N code for the
+   group of builtins.  N.b. this is a different base to that used by
+   `get_builtin_sync_mode` because that matches the builtin enum offset used in
+   c-common.cc to find the builtin enum from a given MODE.
+
+   TODO Really do need to figure out a bit neater code here.  Should not be
+   inlining the mapping from type to offset in two different places.  */
+static inline machine_mode
+get_builtin_fp_sync_mode (int fcode_diff, machine_mode *mode)
+{
+  struct type_to_offset { tree type; size_t offset; };
+  static const struct type_to_offset fp_type_mappings[] = {
+    { float_type_node, 6 },
+    { double_type_node, 7 },
+    { long_double_type_node, 8 },
+    { bfloat16_type_node ? bfloat16_type_node : error_mark_node, 9 },
+    { float16_type_node ? float16_type_node : error_mark_node, 10 },
+    { float32_type_node ? float32_type_node : error_mark_node, 11 },
+    { float64_type_node ? float64_type_node : error_mark_node, 12 },
+    { float128_type_node ? float128_type_node : error_mark_node, 13 },
+    { float32x_type_node ? float32x_type_node : error_mark_node, 14 },
+    { float64x_type_node ? float64x_type_node : error_mark_node, 15 }
+  };
+  gcc_assert (fcode_diff <= 15 && fcode_diff >= 6);
+  for (size_t i = 0; i < sizeof (fp_type_mappings) / sizeof (fp_type_mappings[0]); i++)
+    {
+      if ((size_t) fcode_diff == fp_type_mappings[i].offset)
+	{
+	  *mode = TYPE_MODE (fp_type_mappings[i].type);
+	  return int_mode_for_size (GET_MODE_SIZE (*mode) * BITS_PER_UNIT, 0)
+	    .require ();
+	}
+    }
+  gcc_unreachable ();
+}
+
 /* Expand the memory expression LOC and return the appropriate memory operand
    for the builtin_sync operations.  */
 
@@ -6886,9 +6926,10 @@  expand_builtin_atomic_store (machine_mode mode, tree exp)
    resolved to an instruction sequence.  */
 
 static rtx
-expand_builtin_atomic_fetch_op (machine_mode mode, tree exp, rtx target,
+expand_builtin_atomic_fetch_op (machine_mode expand_mode, tree exp, rtx target,
 				enum rtx_code code, bool fetch_after,
-				bool ignore, enum built_in_function ext_call)
+				bool ignore, enum built_in_function ext_call,
+				machine_mode load_mode = VOIDmode)
 {
   rtx val, mem, ret;
   enum memmodel model;
@@ -6898,13 +6939,13 @@  expand_builtin_atomic_fetch_op (machine_mode mode, tree exp, rtx target,
   model = get_memmodel (CALL_EXPR_ARG (exp, 2));
 
   /* Expand the operands.  */
-  mem = get_builtin_sync_mem (CALL_EXPR_ARG (exp, 0), mode);
-  val = expand_expr_force_mode (CALL_EXPR_ARG (exp, 1), mode);
+  mem = get_builtin_sync_mem (CALL_EXPR_ARG (exp, 0), expand_mode);
+  val = expand_expr_force_mode (CALL_EXPR_ARG (exp, 1), expand_mode);
 
   /* Only try generating instructions if inlining is turned on.  */
   if (flag_inline_atomics)
     {
-      ret = expand_atomic_fetch_op (target, mem, val, code, model, fetch_after);
+      ret = expand_atomic_fetch_op (target, mem, val, code, model, fetch_after, load_mode);
       if (ret)
 	return ret;
     }
@@ -6938,12 +6979,12 @@  expand_builtin_atomic_fetch_op (machine_mode mode, tree exp, rtx target,
     {
       if (code == NOT)
 	{
-	  ret = expand_simple_binop (mode, AND, ret, val, NULL_RTX, true,
+	  ret = expand_simple_binop (expand_mode, AND, ret, val, NULL_RTX, true,
 				     OPTAB_LIB_WIDEN);
-	  ret = expand_simple_unop (mode, NOT, ret, target, true);
+	  ret = expand_simple_unop (expand_mode, NOT, ret, target, true);
 	}
       else
-	ret = expand_simple_binop (mode, code, ret, val, target, true,
+	ret = expand_simple_binop (expand_mode, code, ret, val, target, true,
 				   OPTAB_LIB_WIDEN);
     }
   return ret;
@@ -8779,7 +8820,7 @@  expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       if (target)
 	return target;
       break;
- 
+
     case BUILT_IN_ATOMIC_FETCH_SUB_1:
     case BUILT_IN_ATOMIC_FETCH_SUB_2:
     case BUILT_IN_ATOMIC_FETCH_SUB_4:
@@ -8840,6 +8881,100 @@  expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
 	return target;
       break;
 
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FP:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPL:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF16B:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF16:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF32:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF64:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF128:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF32X:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF64X:
+      {
+      machine_mode int_mode
+	= get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_FETCH_ADD_N, &mode);
+      target = expand_builtin_atomic_fetch_op (mode, exp, target, PLUS, false,
+					       ignore, BUILT_IN_NONE, int_mode);
+      if (target)
+	return target;
+      break;
+      }
+
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FP:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPL:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF16B:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF16:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF32:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF64:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF128:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF32X:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF64X:
+      {
+      /* TODO I don't translate to the FETCH_ADD library call if this fails
+	 to inline.  The integral ADD_FETCH versions of atomic functions do.
+	 I don't understand why they make that transformation, could *guess*
+	 that it's the more likely function to be implemented except that
+	 libatomic seems to implement everything if it implements anything.
+	 -- Any explanation why the integral versions make this translation
+	 (and hence help with whether these floating point versions should make
+	 that translation) would be welcomed.
+
+	 A comment in gcc.dg/atomic-noinline.c seems to imply that such a
+	 translation was necessary at one point.  That comment was added to the
+	 testsuite file before the introduction of libatomic to the GCC target
+	 library.  I guess this was something needed in an earlier state of the
+	 ecosystem.  */
+      machine_mode int_mode
+	= get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_ADD_FETCH_N, &mode);
+      target = expand_builtin_atomic_fetch_op (mode, exp, target, PLUS, true,
+					       ignore, BUILT_IN_NONE, int_mode);
+      if (target)
+	return target;
+      break;
+      }
+
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FP:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPL:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF16B:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF16:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF32:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF64:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF128:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF32X:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF64X:
+      {
+      machine_mode int_mode
+	= get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_FETCH_SUB_N, &mode);
+      target = expand_builtin_atomic_fetch_op (mode, exp, target, MINUS, false,
+					       ignore, BUILT_IN_NONE, int_mode);
+      if (target)
+	return target;
+      break;
+      }
+
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FP:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPL:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF16B:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF16:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF32:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF64:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF128:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF32X:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF64X:
+      {
+      machine_mode int_mode
+	= get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_SUB_FETCH_N, &mode);
+      target = expand_builtin_atomic_fetch_op (mode, exp, target, MINUS, true,
+					       ignore, BUILT_IN_NONE, int_mode);
+      if (target)
+	return target;
+      break;
+      }
+
     case BUILT_IN_ATOMIC_TEST_AND_SET:
       target = expand_builtin_atomic_test_and_set (exp, target);
       if (target)
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 185c5b1a705..ca395dde89b 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -7745,6 +7745,10 @@  expand_atomic_fetch_op_no_fallback (rtx target, rtx mem, rtx val,
   if (result)
     return result;
 
+  /* TODO For floating point, is there anything extra to worry about
+     w.r.t. rounding (i.e. is X + <some constant> guaranteed to be equal
+     to X - (-1 * <some constant>))?
+     I doubt there is, but wouldn't want to avoid the operation on a hunch.  */
   /* If the fetch value can be calculated from the other variation of fetch,
      try that operation.  */
   if (after || unused_result || optab.reverse_code != UNKNOWN)
@@ -7793,7 +7797,8 @@  expand_atomic_fetch_op_no_fallback (rtx target, rtx mem, rtx val,
    AFTER is false to return the value before the operation (fetch_OP).  */
 rtx
 expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
-			enum memmodel model, bool after)
+			enum memmodel model, bool after,
+			machine_mode load_mode)
 {
   machine_mode mode = GET_MODE (mem);
   rtx result;
@@ -7802,7 +7807,7 @@  expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
   /* If loads are not atomic for the required size and we are not called to
      provide a __sync builtin, do not do anything so that we stay consistent
      with atomic loads of the same size.  */
-  if (!can_atomic_load_p (mode) && !is_mm_sync (model))
+  if (!can_atomic_load_p (load_mode) && !is_mm_sync (model))
     return NULL_RTX;
 
   result = expand_atomic_fetch_op_no_fallback (target, mem, val, code, model,
@@ -7817,6 +7822,13 @@  expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
       rtx tmp;
       enum rtx_code reverse = (code == PLUS ? MINUS : PLUS);
 
+      /* TODO Need to double-check whether there are any floating point
+	 problems with doing the reverse operation on a negated value.
+	 (Don't know of any particular problem -- just have this feeling that
+	 floating point transformations are tricky).
+
+	 FWIW I have the impression this is fine because GCC optimizes x + (-y)
+	 to x - y for floating point values.  */
       start_sequence ();
       tmp = expand_simple_unop (mode, NEG, val, NULL_RTX, true);
       result = expand_atomic_fetch_op_no_fallback (target, mem, tmp, reverse,
@@ -7835,7 +7847,7 @@  expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
     }
 
   /* Try the __sync libcalls only if we can't do compare-and-swap inline.  */
-  if (!can_compare_and_swap_p (mode, false))
+  if (!can_compare_and_swap_p (load_mode, false))
     {
       rtx libfunc;
       bool fixup = false;
@@ -7870,11 +7882,41 @@  expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
       code = orig_code;
     }
 
-  /* If nothing else has succeeded, default to a compare and swap loop.  */
-  if (can_compare_and_swap_p (mode, true))
+  /* If nothing else has succeeded, default to a compare and swap loop.
+
+     N.b. for modes where the compare and swap has to happen in a different
+     mode to the operation, we have to do a conversion between the integral
+     value that the CAS loop is going to be using and the other mode that our
+     operations are performed in.  This happens for modes where
+     load_mode != mode, e.g. where `mode` is a floating point mode and
+     `load_mode` is an integral one.  */
+  if (can_compare_and_swap_p (load_mode, true))
     {
+      /* Should have been ensured by the caller, but nice to make sure.  */
+      gcc_assert (known_eq (GET_MODE_SIZE (mode), GET_MODE_SIZE (load_mode)));
+      poly_uint64 loadmode_bitsize = GET_MODE_SIZE (load_mode) * BITS_PER_UNIT;
       rtx_insn *insn;
-      rtx t0 = gen_reg_rtx (mode), t1;
+      rtx t0 = gen_reg_rtx (load_mode), t1;
+      rtx tmp = gen_reg_rtx (mode);
+      /* TODO Is there a better way than this to convert between
+	 interpretations?   We need bitwise interpretation because the atomic
+	 memory operations are being performed on an integral register.  */
+      auto interpret_as_float =
+	[loadmode_bitsize, mode] (rtx target, rtx irtx) -> rtx {
+	rtx tmp = extract_bit_field (irtx, loadmode_bitsize, 0, true, target,
+				     mode, mode, false, NULL);
+	if (tmp != target)
+	  emit_move_insn (target, tmp);
+	return target;
+      };
+      auto interpret_as_int
+	= [loadmode_bitsize, load_mode] (rtx target, rtx frtx) -> rtx {
+	rtx tmp = extract_bit_field (frtx, loadmode_bitsize, 0, true, target,
+				     load_mode, load_mode, false, NULL);
+	if (tmp != target)
+	  emit_move_insn (target, tmp);
+	return target;
+      };
 
       start_sequence ();
 
@@ -7885,7 +7927,12 @@  expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
 	    target = gen_reg_rtx (mode);
 	  /* If fetch_before, copy the value now.  */
 	  if (!after)
-	    emit_move_insn (target, t0);
+	    {
+	      if (load_mode == mode)
+		emit_move_insn (target, t0);
+	      else
+		interpret_as_float (target, t0);
+	    }
 	}
       else
         target = const0_rtx;
@@ -7897,18 +7944,52 @@  expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
 				    true, OPTAB_LIB_WIDEN);
 	  t1 = expand_simple_unop (mode, code, t1, NULL_RTX, true);
 	}
-      else
+      else if (load_mode == mode)
 	t1 = expand_simple_binop (mode, code, t1, val, NULL_RTX, true, 
 				  OPTAB_LIB_WIDEN);
+      else
+	{
+	  interpret_as_float (tmp, t1);
+	  tmp = expand_simple_binop (mode, code, tmp, val, NULL_RTX, true,
+				     OPTAB_LIB_WIDEN);
+	  t1 = gen_reg_rtx (load_mode);
+	  interpret_as_int (t1, tmp);
+	}
 
       /* For after, copy the value now.  */
       if (!unused_result && after)
-        emit_move_insn (target, t1);
+        emit_move_insn (target, load_mode == mode ? t1 : tmp);
       insn = get_insns ();
       end_sequence ();
 
+      /* Outside `expand_compare_and_swap_loop` (i.e. inside the `seq`) I've
+	 done the following:
+	     tmp (floating) = old_reg (integral)
+	     tmp += 1
+	     new_reg (integral) = tmp (floating)
+	 `expand_compare_and_swap_loop` wraps the sequence it's given as
+	 described at the top of its implementation, giving:
+	   cmp_reg = mem
+	 label:
+	   old_reg = cmp_reg;
+	   tmp (floating) = old_reg (integral)
+	   tmp += 1;
+	   new_reg (integral) = tmp (floating)
+	   (success, cmp_reg) = CAS(mem, old_reg, new_reg)
+	   if (!success)
+	     goto label;
+
+	 In order to implement this more cleanly, what we would want is to
+	 expand the MEM as an integral value before passing it into this
+	 function.  Then this function would not have to know anything
+	 about the fact that the inner operation is a floating point one.
+	   - N.b. there is the question of whether we'd like the
+	     conversion inside or outside the loop.  I don't think it
+	     matters TBH, though I could easily be missing something
+	     here.  */
+      mem = adjust_address (mem, load_mode, 0);
       if (t1 != NULL && expand_compare_and_swap_loop (mem, t0, t1, insn))
-        return target;
+	return target;
     }
 
   return NULL_RTX;
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 45e117a7f50..a450c4bba81 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -503,6 +503,7 @@  OPTAB_D (sync_sub_optab, "sync_sub$I$a")
 OPTAB_D (sync_xor_optab, "sync_xor$I$a")
 
 OPTAB_D (atomic_add_fetch_optab, "atomic_add_fetch$I$a")
+OPTAB_NX (atomic_add_fetch_optab, "atomic_add_fetch$F$a")
 OPTAB_D (atomic_add_optab, "atomic_add$I$a")
 OPTAB_D (atomic_and_fetch_optab, "atomic_and_fetch$I$a")
 OPTAB_D (atomic_and_optab, "atomic_and$I$a")
@@ -511,11 +512,13 @@  OPTAB_D (atomic_bit_test_and_complement_optab, "atomic_bit_test_and_complement$I
 OPTAB_D (atomic_bit_test_and_reset_optab, "atomic_bit_test_and_reset$I$a")
 OPTAB_D (atomic_compare_and_swap_optab, "atomic_compare_and_swap$I$a")
 OPTAB_D (atomic_exchange_optab,	 "atomic_exchange$I$a")
-OPTAB_D (atomic_fetch_add_optab, "atomic_fetch_add$I$a")
+OPTAB_D (atomic_fetch_add_optab, "atomic_fetch_add$F$a")
+OPTAB_NX (atomic_fetch_add_optab, "atomic_fetch_add$I$a")
 OPTAB_D (atomic_fetch_and_optab, "atomic_fetch_and$I$a")
 OPTAB_D (atomic_fetch_nand_optab, "atomic_fetch_nand$I$a")
 OPTAB_D (atomic_fetch_or_optab, "atomic_fetch_or$I$a")
 OPTAB_D (atomic_fetch_sub_optab, "atomic_fetch_sub$I$a")
+OPTAB_NX (atomic_fetch_sub_optab, "atomic_fetch_sub$F$a")
 OPTAB_D (atomic_fetch_xor_optab, "atomic_fetch_xor$I$a")
 OPTAB_D (atomic_load_optab, "atomic_load$I$a")
 OPTAB_D (atomic_nand_fetch_optab, "atomic_nand_fetch$I$a")
@@ -524,6 +527,7 @@  OPTAB_D (atomic_or_fetch_optab, "atomic_or_fetch$I$a")
 OPTAB_D (atomic_or_optab, "atomic_or$I$a")
 OPTAB_D (atomic_store_optab, "atomic_store$I$a")
 OPTAB_D (atomic_sub_fetch_optab, "atomic_sub_fetch$I$a")
+OPTAB_NX (atomic_sub_fetch_optab, "atomic_sub_fetch$F$a")
 OPTAB_D (atomic_sub_optab, "atomic_sub$I$a")
 OPTAB_D (atomic_xor_fetch_optab, "atomic_xor_fetch$I$a")
 OPTAB_D (atomic_xor_optab, "atomic_xor$I$a")
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 301847e2186..8da637e87b6 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -366,7 +366,7 @@  extern void expand_mem_signal_fence (enum memmodel);
 rtx expand_atomic_load (rtx, rtx, enum memmodel);
 rtx expand_atomic_store (rtx, rtx, enum memmodel, bool);
 rtx expand_atomic_fetch_op (rtx, rtx, rtx, enum rtx_code, enum memmodel, 
-			      bool);
+			      bool, machine_mode load_mode = VOIDmode);
 
 extern void expand_asm_reg_clobber_mem_blockage (HARD_REG_SET);