diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 0b902896ddd..0ffd7d0b211 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -6394,6 +6394,46 @@ get_builtin_sync_mode (int fcode_diff)
   return int_mode_for_size (BITS_PER_UNIT << fcode_diff, 0).require ();
 }
 
+/* Reconstitute the machine modes relevant for this builtin operation from
+   FCODE_DIFF, the builtin's offset from the _N version of its group (e.g.
+   fetch_add).
+
+   Only works for floating point atomic builtins.
+   FCODE_DIFF should be fcode - base, where base is the FOO_N code for the
+   group of builtins.  N.b. this is a different base from the one used by
+   `get_builtin_sync_mode`; the offsets here match the builtin enum offsets
+   used in c-common.cc to find the builtin enum from a given MODE.
+
+   Sets *MODE to the floating point mode of the operation and returns the
+   same-sized integer mode to use for the underlying atomic memory access.
+
+   TODO This needs tidying up a little: the mapping from type to offset
+   should not be open-coded in two different places.  */
+static inline machine_mode
+get_builtin_fp_sync_mode (int fcode_diff, machine_mode *mode)
+{
+  struct type_to_offset { tree type; size_t offset; };
+  static const struct type_to_offset fp_type_mappings[] = {
+    { float_type_node, 6 },
+    { double_type_node, 7 },
+    { long_double_type_node, 8 },
+    { bfloat16_type_node ? bfloat16_type_node : error_mark_node, 9 },
+    { float16_type_node ? float16_type_node : error_mark_node, 10 },
+    { float32_type_node ? float32_type_node : error_mark_node, 11 },
+    { float64_type_node ? float64_type_node : error_mark_node, 12 },
+    { float128_type_node ? float128_type_node : error_mark_node, 13 },
+    { float32x_type_node ? float32x_type_node : error_mark_node, 14 },
+    { float64x_type_node ? float64x_type_node : error_mark_node, 15 }
+  };
+  gcc_assert (fcode_diff >= 6 && fcode_diff <= 15);
+  for (size_t i = 0; i < ARRAY_SIZE (fp_type_mappings); i++)
+    {
+      if ((size_t) fcode_diff == fp_type_mappings[i].offset)
+	{
+	  *mode = TYPE_MODE (fp_type_mappings[i].type);
+	  return int_mode_for_size (GET_MODE_SIZE (*mode) * BITS_PER_UNIT, 0)
+	    .require ();
+	}
+    }
+  gcc_unreachable ();
+}
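+
+/* A sketch of the user-level code these builtins serve (assuming the
+   front-end changes elsewhere in this series accept floating point operands
+   for the __atomic_fetch_* builtins and map them to the new _FP* codes; the
+   function name below is purely illustrative):
+
+     float
+     fetch_add_one (float *p)
+     {
+       return __atomic_fetch_add (p, 1.0f, __ATOMIC_SEQ_CST);
+     }
+
+   Such a call reaches expand_builtin_atomic_fetch_op with EXPAND_MODE set to
+   the floating point mode and LOAD_MODE set to the same-sized integer mode
+   returned by get_builtin_fp_sync_mode above.  */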
+
 /* Expand the memory expression LOC and return the appropriate memory operand
    for the builtin_sync operations.  */
 
@@ -6886,9 +6926,10 @@ expand_builtin_atomic_store (machine_mode mode, tree exp)
    resolved to an instruction sequence.  */
 
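+/* EXPAND_MODE is the mode in which the arithmetic itself is performed.
+   LOAD_MODE, if not VOIDmode, is the same-sized integral mode in which the
+   memory is known to be atomically accessible; it is passed down for the
+   capability checks and the compare-and-swap fallback used by the floating
+   point atomic builtins.  */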
 static rtx
-expand_builtin_atomic_fetch_op (machine_mode mode, tree exp, rtx target,
+expand_builtin_atomic_fetch_op (machine_mode expand_mode, tree exp, rtx target,
 				enum rtx_code code, bool fetch_after,
-				bool ignore, enum built_in_function ext_call)
+				bool ignore, enum built_in_function ext_call,
+				machine_mode load_mode = VOIDmode)
 {
   rtx val, mem, ret;
   enum memmodel model;
@@ -6898,13 +6939,13 @@ expand_builtin_atomic_fetch_op (machine_mode mode, tree exp, rtx target,
   model = get_memmodel (CALL_EXPR_ARG (exp, 2));
 
   /* Expand the operands.  */
-  mem = get_builtin_sync_mem (CALL_EXPR_ARG (exp, 0), mode);
-  val = expand_expr_force_mode (CALL_EXPR_ARG (exp, 1), mode);
+  mem = get_builtin_sync_mem (CALL_EXPR_ARG (exp, 0), expand_mode);
+  val = expand_expr_force_mode (CALL_EXPR_ARG (exp, 1), expand_mode);
 
   /* Only try generating instructions if inlining is turned on.  */
   if (flag_inline_atomics)
     {
-      ret = expand_atomic_fetch_op (target, mem, val, code, model, fetch_after);
+      ret = expand_atomic_fetch_op (target, mem, val, code, model, fetch_after, load_mode);
       if (ret)
 	return ret;
     }
@@ -6938,12 +6979,12 @@ expand_builtin_atomic_fetch_op (machine_mode mode, tree exp, rtx target,
     {
       if (code == NOT)
 	{
-	  ret = expand_simple_binop (mode, AND, ret, val, NULL_RTX, true,
+	  ret = expand_simple_binop (expand_mode, AND, ret, val, NULL_RTX, true,
 				     OPTAB_LIB_WIDEN);
-	  ret = expand_simple_unop (mode, NOT, ret, target, true);
+	  ret = expand_simple_unop (expand_mode, NOT, ret, target, true);
 	}
       else
-	ret = expand_simple_binop (mode, code, ret, val, target, true,
+	ret = expand_simple_binop (expand_mode, code, ret, val, target, true,
 				   OPTAB_LIB_WIDEN);
     }
   return ret;
@@ -8779,7 +8820,7 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       if (target)
 	return target;
       break;
- 
+
     case BUILT_IN_ATOMIC_FETCH_SUB_1:
     case BUILT_IN_ATOMIC_FETCH_SUB_2:
     case BUILT_IN_ATOMIC_FETCH_SUB_4:
@@ -8840,6 +8881,100 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
 	return target;
       break;
 
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FP:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPL:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF16B:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF16:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF32:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF64:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF128:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF32X:
+    case BUILT_IN_ATOMIC_FETCH_ADD_FPF64X:
+      {
+	machine_mode int_mode
+	  = get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_FETCH_ADD_N,
+				      &mode);
+	target = expand_builtin_atomic_fetch_op (mode, exp, target, PLUS,
+						 false, ignore,
+						 BUILT_IN_NONE, int_mode);
+	if (target)
+	  return target;
+	break;
+      }
+
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FP:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPL:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF16B:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF16:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF32:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF64:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF128:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF32X:
+    case BUILT_IN_ATOMIC_ADD_FETCH_FPF64X:
+      {
+	/* TODO I don't translate this to the FETCH_ADD library call if it
+	   fails to inline, although the integral ADD_FETCH builtins do.  I
+	   don't understand why they make that transformation; my *guess* is
+	   that the fetch-first entry point is the more likely one to be
+	   implemented, except that libatomic seems to implement everything
+	   if it implements anything.  An explanation of why the integral
+	   versions make this translation (and hence whether these floating
+	   point versions should too) would be welcome.
+
+	   A comment in gcc.dg/atomic-noinline.c seems to imply that such a
+	   translation was necessary at one point.  That comment was added
+	   before the introduction of libatomic as a GCC target library, so I
+	   guess it was only needed in an earlier state of the ecosystem.  */
+	machine_mode int_mode
+	  = get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_ADD_FETCH_N,
+				      &mode);
+	target = expand_builtin_atomic_fetch_op (mode, exp, target, PLUS,
+						 true, ignore,
+						 BUILT_IN_NONE, int_mode);
+	if (target)
+	  return target;
+	break;
+      }
+
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FP:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPL:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF16B:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF16:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF32:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF64:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF128:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF32X:
+    case BUILT_IN_ATOMIC_FETCH_SUB_FPF64X:
+      {
+	machine_mode int_mode
+	  = get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_FETCH_SUB_N,
+				      &mode);
+	target = expand_builtin_atomic_fetch_op (mode, exp, target, MINUS,
+						 false, ignore,
+						 BUILT_IN_NONE, int_mode);
+	if (target)
+	  return target;
+	break;
+      }
+
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FP:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPL:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF16B:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF16:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF32:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF64:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF128:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF32X:
+    case BUILT_IN_ATOMIC_SUB_FETCH_FPF64X:
+      {
+	machine_mode int_mode
+	  = get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_SUB_FETCH_N,
+				      &mode);
+	target = expand_builtin_atomic_fetch_op (mode, exp, target, MINUS,
+						 true, ignore,
+						 BUILT_IN_NONE, int_mode);
+	if (target)
+	  return target;
+	break;
+      }
+
     case BUILT_IN_ATOMIC_TEST_AND_SET:
       target = expand_builtin_atomic_test_and_set (exp, target);
       if (target)
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 185c5b1a705..ca395dde89b 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -7745,6 +7745,10 @@ expand_atomic_fetch_op_no_fallback (rtx target, rtx mem, rtx val,
   if (result)
     return result;
 
+  /* TODO For floating point, is there anything extra to worry about
+     w.r.t. rounding (i.e. is X + C guaranteed to be equal to X - (-C))?
+     I doubt there is, but I wouldn't want to skip the transformation on just
+     a hunch.  */
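+  /* (Observation, not something relied on here: for IEEE formats negation
+     is exact and x - y is defined as x + (-y), so X + C and X - (-C) should
+     round identically, NaN sign/payload details aside.)  */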
   /* If the fetch value can be calculated from the other variation of fetch,
      try that operation.  */
   if (after || unused_result || optab.reverse_code != UNKNOWN)
@@ -7793,7 +7797,8 @@ expand_atomic_fetch_op_no_fallback (rtx target, rtx mem, rtx val,
    AFTER is false to return the value before the operation (fetch_OP).  */
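+/* LOAD_MODE, if not VOIDmode, is the integral mode in which the memory is
+   known to be atomically accessible; it may differ from the mode of MEM when
+   the arithmetic itself is done in a same-sized floating point mode.  */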
 rtx
 expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
-			enum memmodel model, bool after)
+			enum memmodel model, bool after,
+			machine_mode load_mode)
 {
   machine_mode mode = GET_MODE (mem);
   rtx result;
+
+  /* Existing (integral) callers do not pass LOAD_MODE; treat VOIDmode as
+     "the memory is accessed in the same mode as MEM".  */
+  if (load_mode == VOIDmode)
+    load_mode = mode;
@@ -7802,7 +7807,7 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
   /* If loads are not atomic for the required size and we are not called to
      provide a __sync builtin, do not do anything so that we stay consistent
      with atomic loads of the same size.  */
-  if (!can_atomic_load_p (mode) && !is_mm_sync (model))
+  if (!can_atomic_load_p (load_mode) && !is_mm_sync (model))
     return NULL_RTX;
 
   result = expand_atomic_fetch_op_no_fallback (target, mem, val, code, model,
@@ -7817,6 +7822,13 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
       rtx tmp;
       enum rtx_code reverse = (code == PLUS ? MINUS : PLUS);
 
+      /* TODO Double-check whether there are any floating point problems with
+	 doing the reverse operation on a negated value.  (I don't know of a
+	 particular problem -- just a feeling that floating point
+	 transformations are tricky.)
+
+	 FWIW I have the impression this is fine because GCC optimizes
+	 x + (-y) to x - y for floating point values.  */
       start_sequence ();
       tmp = expand_simple_unop (mode, NEG, val, NULL_RTX, true);
       result = expand_atomic_fetch_op_no_fallback (target, mem, tmp, reverse,
@@ -7835,7 +7847,7 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
     }
 
   /* Try the __sync libcalls only if we can't do compare-and-swap inline.  */
-  if (!can_compare_and_swap_p (mode, false))
+  if (!can_compare_and_swap_p (load_mode, false))
     {
       rtx libfunc;
       bool fixup = false;
@@ -7870,11 +7882,41 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
       code = orig_code;
     }
 
-  /* If nothing else has succeeded, default to a compare and swap loop.  */
-  if (can_compare_and_swap_p (mode, true))
+  /* If nothing else has succeeded, default to a compare and swap loop.
+
+     N.b. when the compare and swap has to happen in a different mode from
+     the operation, we have to convert between the integral value that the
+     CAS loop operates on and the mode in which the arithmetic is performed.
+     That is the case when load_mode != mode, e.g. when `mode` is a floating
+     point mode and `load_mode` is an integral one.  */
+  if (can_compare_and_swap_p (load_mode, true))
     {
+      /* Should have been ensured by the caller, but nice to make sure.  */
+      gcc_assert (known_eq (GET_MODE_SIZE (mode), GET_MODE_SIZE (load_mode)));
+      poly_uint64 loadmode_bitsize = GET_MODE_SIZE (load_mode) * BITS_PER_UNIT;
       rtx_insn *insn;
-      rtx t0 = gen_reg_rtx (mode), t1;
+      rtx t0 = gen_reg_rtx (load_mode), t1;
+      rtx tmp = gen_reg_rtx (mode);
+      /* TODO Is there a better way than this to convert between
+	 interpretations?  We need a bitwise reinterpretation because the
+	 atomic memory operations are performed on an integral register.  */
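+      /* (One alternative, not tried here, might be punning through a
+	 same-sized lowpart SUBREG via lowpart_subreg / gen_lowpart when the
+	 target allows it; extract_bit_field below handles the general
+	 case.)  */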
+      auto interpret_as_float =
+	[loadmode_bitsize, mode] (rtx target, rtx irtx) -> rtx {
+	rtx tmp = extract_bit_field (irtx, loadmode_bitsize, 0, true, target,
+				     mode, mode, false, NULL);
+	if (tmp != target)
+	  emit_move_insn (target, tmp);
+	return target;
+      };
+      auto interpret_as_int
+	= [loadmode_bitsize, load_mode] (rtx target, rtx frtx) -> rtx {
+	rtx tmp = extract_bit_field (frtx, loadmode_bitsize, 0, true, target,
+				     load_mode, load_mode, false, NULL);
+	if (tmp != target)
+	  emit_move_insn (target, tmp);
+	return target;
+      };
 
       start_sequence ();
 
@@ -7885,7 +7927,12 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
 	    target = gen_reg_rtx (mode);
 	  /* If fetch_before, copy the value now.  */
 	  if (!after)
-	    emit_move_insn (target, t0);
+	    {
+	      if (load_mode == mode)
+		emit_move_insn (target, t0);
+	      else
+		interpret_as_float (target, t0);
+	    }
 	}
       else
         target = const0_rtx;
@@ -7897,18 +7944,52 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
 				    true, OPTAB_LIB_WIDEN);
 	  t1 = expand_simple_unop (mode, code, t1, NULL_RTX, true);
 	}
-      else
+      else if (load_mode == mode)
 	t1 = expand_simple_binop (mode, code, t1, val, NULL_RTX, true, 
 				  OPTAB_LIB_WIDEN);
+      else
+	{
+	  interpret_as_float (tmp, t1);
+	  tmp = expand_simple_binop (mode, code, tmp, val, NULL_RTX, true,
+				     OPTAB_LIB_WIDEN);
+	  t1 = gen_reg_rtx (load_mode);
+	  interpret_as_int (t1, tmp);
+	}
 
       /* For after, copy the value now.  */
       if (!unused_result && after)
-        emit_move_insn (target, t1);
+        emit_move_insn (target, load_mode == mode ? t1 : tmp);
       insn = get_insns ();
       end_sequence ();
 
+      /* Inside the `seq` passed to `expand_compare_and_swap_loop` (i.e. the
+	 sequence built above) we have emitted:
+	     tmp (floating) = old_reg (integral)
+	     tmp = tmp OP val
+	     new_reg (integral) = tmp (floating)
+	 `expand_compare_and_swap_loop` wraps that sequence as described at
+	 the top of its implementation:
+	     cmp_reg = mem
+	   label:
+	     old_reg = cmp_reg;
+	     tmp (floating) = old_reg (integral)
+	     tmp = tmp OP val;
+	     new_reg (integral) = tmp (floating)
+	     (success, cmp_reg) = CAS(mem, old_reg, new_reg)
+	     if (!success)
+	       goto label;
+
+	 For that to work, MEM needs to be expanded as an integral value
+	 before being handed to `expand_compare_and_swap_loop` (done just
+	 below), so that the loop itself does not need to know that the inner
+	 operation is a floating point one.
+	   - N.b. there is the question of whether the conversion should live
+	     inside or outside the loop.  I don't think it matters TBH, though
+	     I could easily be missing something here.  */
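+      /* A sketch (illustration only, not what is literally emitted; it
+	 assumes SFmode with a 32-bit unsigned int, uses memcpy to stand in
+	 for the register-level reinterpretation, and all names below are
+	 hypothetical) of what the fallback is roughly equivalent to at the
+	 source level:
+
+	     float
+	     fetch_add_sf (float *p, float val)
+	     {
+	       unsigned int expected
+		 = __atomic_load_n ((unsigned int *) p, __ATOMIC_RELAXED);
+	       unsigned int desired;
+	       float old, tmp;
+	       do
+		 {
+		   memcpy (&old, &expected, sizeof old);
+		   tmp = old + val;
+		   memcpy (&desired, &tmp, sizeof desired);
+		 }
+	       while (!__atomic_compare_exchange_n ((unsigned int *) p,
+						    &expected, desired, false,
+						    __ATOMIC_SEQ_CST,
+						    __ATOMIC_RELAXED));
+	       return old;
+	     }  */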
+      mem = adjust_address (mem, load_mode, 0);
       if (t1 != NULL && expand_compare_and_swap_loop (mem, t0, t1, insn))
-        return target;
+	return target;
     }
 
   return NULL_RTX;
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 45e117a7f50..a450c4bba81 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -503,6 +503,7 @@ OPTAB_D (sync_sub_optab, "sync_sub$I$a")
 OPTAB_D (sync_xor_optab, "sync_xor$I$a")
 
 OPTAB_D (atomic_add_fetch_optab, "atomic_add_fetch$I$a")
+OPTAB_NX (atomic_add_fetch_optab, "atomic_add_fetch$F$a")
 OPTAB_D (atomic_add_optab, "atomic_add$I$a")
 OPTAB_D (atomic_and_fetch_optab, "atomic_and_fetch$I$a")
 OPTAB_D (atomic_and_optab, "atomic_and$I$a")
@@ -511,11 +512,13 @@ OPTAB_D (atomic_bit_test_and_complement_optab, "atomic_bit_test_and_complement$I
 OPTAB_D (atomic_bit_test_and_reset_optab, "atomic_bit_test_and_reset$I$a")
 OPTAB_D (atomic_compare_and_swap_optab, "atomic_compare_and_swap$I$a")
 OPTAB_D (atomic_exchange_optab,	 "atomic_exchange$I$a")
-OPTAB_D (atomic_fetch_add_optab, "atomic_fetch_add$I$a")
+OPTAB_D (atomic_fetch_add_optab, "atomic_fetch_add$F$a")
+OPTAB_NX (atomic_fetch_add_optab, "atomic_fetch_add$I$a")
 OPTAB_D (atomic_fetch_and_optab, "atomic_fetch_and$I$a")
 OPTAB_D (atomic_fetch_nand_optab, "atomic_fetch_nand$I$a")
 OPTAB_D (atomic_fetch_or_optab, "atomic_fetch_or$I$a")
 OPTAB_D (atomic_fetch_sub_optab, "atomic_fetch_sub$I$a")
+OPTAB_NX (atomic_fetch_sub_optab, "atomic_fetch_sub$F$a")
 OPTAB_D (atomic_fetch_xor_optab, "atomic_fetch_xor$I$a")
 OPTAB_D (atomic_load_optab, "atomic_load$I$a")
 OPTAB_D (atomic_nand_fetch_optab, "atomic_nand_fetch$I$a")
@@ -524,6 +527,7 @@ OPTAB_D (atomic_or_fetch_optab, "atomic_or_fetch$I$a")
 OPTAB_D (atomic_or_optab, "atomic_or$I$a")
 OPTAB_D (atomic_store_optab, "atomic_store$I$a")
 OPTAB_D (atomic_sub_fetch_optab, "atomic_sub_fetch$I$a")
+OPTAB_NX (atomic_sub_fetch_optab, "atomic_sub_fetch$F$a")
 OPTAB_D (atomic_sub_optab, "atomic_sub$I$a")
 OPTAB_D (atomic_xor_fetch_optab, "atomic_xor_fetch$I$a")
 OPTAB_D (atomic_xor_optab, "atomic_xor$I$a")
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 301847e2186..8da637e87b6 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -366,7 +366,7 @@ extern void expand_mem_signal_fence (enum memmodel);
 rtx expand_atomic_load (rtx, rtx, enum memmodel);
 rtx expand_atomic_store (rtx, rtx, enum memmodel, bool);
 rtx expand_atomic_fetch_op (rtx, rtx, rtx, enum rtx_code, enum memmodel, 
-			      bool);
+			      bool, machine_mode load_mode = VOIDmode);
 
 extern void expand_asm_reg_clobber_mem_blockage (HARD_REG_SET);
 
