@@ -6394,6 +6394,46 @@ get_builtin_sync_mode (int fcode_diff)
return int_mode_for_size (BITS_PER_UNIT << fcode_diff, 0).require ();
}
+/* Reconstitute the machine modes relevant for this builtin operation from
+ FCODE_DIFF, the offset of the builtin from the FOO_N entry of its group
+ (e.g. fcode - BUILT_IN_ATOMIC_FETCH_ADD_N for the fetch_add group).
+
+ Only works for the floating point atomic builtins. Sets *MODE to the
+ floating point mode of the builtin's type and returns the integer mode of
+ the same size; e.g. an offset of 6 selects float, so *MODE is usually
+ SFmode and the returned mode is usually SImode.
+
+ N.b. this is a different base to that used by `get_builtin_sync_mode`,
+ because that matches the builtin enum offset used in c-common.cc to find
+ the builtin enum from a given MODE.
+
+ TODO The mapping from type to offset should not be open-coded in two
+ different places; this wants tidying up. */
+static inline machine_mode
+get_builtin_fp_sync_mode (int fcode_diff, machine_mode *mode)
+{
+ struct type_to_offset { tree type; size_t offset; };
+ static const struct type_to_offset fp_type_mappings[] = {
+ { float_type_node, 6 },
+ { double_type_node, 7 },
+ { long_double_type_node, 8 },
+ { bfloat16_type_node ? bfloat16_type_node : error_mark_node, 9 },
+ { float16_type_node ? float16_type_node : error_mark_node, 10 },
+ { float32_type_node ? float32_type_node : error_mark_node, 11 },
+ { float64_type_node ? float64_type_node : error_mark_node, 12 },
+ { float128_type_node ? float128_type_node : error_mark_node, 13 },
+ { float32x_type_node ? float32x_type_node : error_mark_node, 14 },
+ { float64x_type_node ? float64x_type_node : error_mark_node, 15 }
+ };
+ gcc_assert (fcode_diff <= 15 && fcode_diff >= 6);
+ for (size_t i = 0; i < ARRAY_SIZE (fp_type_mappings); i++)
+ {
+ if ((size_t) fcode_diff == fp_type_mappings[i].offset)
+ {
+ *mode = TYPE_MODE (fp_type_mappings[i].type);
+ return int_mode_for_size (GET_MODE_SIZE (*mode) * BITS_PER_UNIT, 0)
+ .require ();
+ }
+ }
+ gcc_unreachable ();
+}
+
/* Expand the memory expression LOC and return the appropriate memory operand
for the builtin_sync operations. */
@@ -6886,9 +6926,10 @@ expand_builtin_atomic_store (machine_mode mode, tree exp)
resolved to an instruction sequence. */
static rtx
-expand_builtin_atomic_fetch_op (machine_mode mode, tree exp, rtx target,
+expand_builtin_atomic_fetch_op (machine_mode expand_mode, tree exp, rtx target,
enum rtx_code code, bool fetch_after,
- bool ignore, enum built_in_function ext_call)
+ bool ignore, enum built_in_function ext_call,
+ machine_mode load_mode = VOIDmode)
{
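+ /* EXPAND_MODE is the mode the operation itself is performed in; LOAD_MODE,
+ when not VOIDmode, is the same-size integral mode used for the atomic
+ capability checks and the compare-and-swap fallback. The two differ for
+ the floating point builtins. */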
rtx val, mem, ret;
enum memmodel model;
@@ -6898,13 +6939,13 @@ expand_builtin_atomic_fetch_op (machine_mode mode, tree exp, rtx target,
model = get_memmodel (CALL_EXPR_ARG (exp, 2));
/* Expand the operands. */
- mem = get_builtin_sync_mem (CALL_EXPR_ARG (exp, 0), mode);
- val = expand_expr_force_mode (CALL_EXPR_ARG (exp, 1), mode);
+ mem = get_builtin_sync_mem (CALL_EXPR_ARG (exp, 0), expand_mode);
+ val = expand_expr_force_mode (CALL_EXPR_ARG (exp, 1), expand_mode);
/* Only try generating instructions if inlining is turned on. */
if (flag_inline_atomics)
{
- ret = expand_atomic_fetch_op (target, mem, val, code, model, fetch_after);
+ ret = expand_atomic_fetch_op (target, mem, val, code, model, fetch_after,
+ load_mode);
if (ret)
return ret;
}
@@ -6938,12 +6979,12 @@ expand_builtin_atomic_fetch_op (machine_mode mode, tree exp, rtx target,
{
if (code == NOT)
{
- ret = expand_simple_binop (mode, AND, ret, val, NULL_RTX, true,
+ ret = expand_simple_binop (expand_mode, AND, ret, val, NULL_RTX, true,
OPTAB_LIB_WIDEN);
- ret = expand_simple_unop (mode, NOT, ret, target, true);
+ ret = expand_simple_unop (expand_mode, NOT, ret, target, true);
}
else
- ret = expand_simple_binop (mode, code, ret, val, target, true,
+ ret = expand_simple_binop (expand_mode, code, ret, val, target, true,
OPTAB_LIB_WIDEN);
}
return ret;
@@ -8779,7 +8820,7 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
if (target)
return target;
break;
-
+
case BUILT_IN_ATOMIC_FETCH_SUB_1:
case BUILT_IN_ATOMIC_FETCH_SUB_2:
case BUILT_IN_ATOMIC_FETCH_SUB_4:
@@ -8840,6 +8881,100 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
return target;
break;
+ case BUILT_IN_ATOMIC_FETCH_ADD_FPF:
+ case BUILT_IN_ATOMIC_FETCH_ADD_FP:
+ case BUILT_IN_ATOMIC_FETCH_ADD_FPL:
+ case BUILT_IN_ATOMIC_FETCH_ADD_FPF16B:
+ case BUILT_IN_ATOMIC_FETCH_ADD_FPF16:
+ case BUILT_IN_ATOMIC_FETCH_ADD_FPF32:
+ case BUILT_IN_ATOMIC_FETCH_ADD_FPF64:
+ case BUILT_IN_ATOMIC_FETCH_ADD_FPF128:
+ case BUILT_IN_ATOMIC_FETCH_ADD_FPF32X:
+ case BUILT_IN_ATOMIC_FETCH_ADD_FPF64X:
+ {
+ machine_mode int_mode
+ = get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_FETCH_ADD_N, &mode);
+ target = expand_builtin_atomic_fetch_op (mode, exp, target, PLUS, false,
+ ignore, BUILT_IN_NONE, int_mode);
+ if (target)
+ return target;
+ break;
+ }
+
+ case BUILT_IN_ATOMIC_ADD_FETCH_FPF:
+ case BUILT_IN_ATOMIC_ADD_FETCH_FP:
+ case BUILT_IN_ATOMIC_ADD_FETCH_FPL:
+ case BUILT_IN_ATOMIC_ADD_FETCH_FPF16B:
+ case BUILT_IN_ATOMIC_ADD_FETCH_FPF16:
+ case BUILT_IN_ATOMIC_ADD_FETCH_FPF32:
+ case BUILT_IN_ATOMIC_ADD_FETCH_FPF64:
+ case BUILT_IN_ATOMIC_ADD_FETCH_FPF128:
+ case BUILT_IN_ATOMIC_ADD_FETCH_FPF32X:
+ case BUILT_IN_ATOMIC_ADD_FETCH_FPF64X:
+ {
+ /* TODO I do not fall back to the FETCH_ADD library call when this fails
+ to inline, whereas the integral ADD_FETCH builtins do. It is not clear
+ to me why they make that transformation; one could *guess* that
+ FETCH_ADD is the more likely function to be implemented, except that
+ libatomic seems to implement everything if it implements anything.
+ -- An explanation of why the integral versions make this translation
+ (and hence whether these floating point versions should do the same)
+ would be welcome.
+
+ A comment in gcc.dg/atomic-noinline.c seems to imply that such a
+ translation was necessary at one point. That comment was added to the
+ testsuite file before libatomic was introduced as a GCC target library,
+ so presumably this was needed in an earlier state of the ecosystem. */
+ machine_mode int_mode
+ = get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_ADD_FETCH_N, &mode);
+ target = expand_builtin_atomic_fetch_op (mode, exp, target, PLUS, true,
+ ignore, BUILT_IN_NONE, int_mode);
+ if (target)
+ return target;
+ break;
+ }
+
+ case BUILT_IN_ATOMIC_FETCH_SUB_FPF:
+ case BUILT_IN_ATOMIC_FETCH_SUB_FP:
+ case BUILT_IN_ATOMIC_FETCH_SUB_FPL:
+ case BUILT_IN_ATOMIC_FETCH_SUB_FPF16B:
+ case BUILT_IN_ATOMIC_FETCH_SUB_FPF16:
+ case BUILT_IN_ATOMIC_FETCH_SUB_FPF32:
+ case BUILT_IN_ATOMIC_FETCH_SUB_FPF64:
+ case BUILT_IN_ATOMIC_FETCH_SUB_FPF128:
+ case BUILT_IN_ATOMIC_FETCH_SUB_FPF32X:
+ case BUILT_IN_ATOMIC_FETCH_SUB_FPF64X:
+ {
+ machine_mode int_mode
+ = get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_FETCH_SUB_N, &mode);
+ target = expand_builtin_atomic_fetch_op (mode, exp, target, MINUS, false,
+ ignore, BUILT_IN_NONE, int_mode);
+ if (target)
+ return target;
+ break;
+ }
+
+ case BUILT_IN_ATOMIC_SUB_FETCH_FPF:
+ case BUILT_IN_ATOMIC_SUB_FETCH_FP:
+ case BUILT_IN_ATOMIC_SUB_FETCH_FPL:
+ case BUILT_IN_ATOMIC_SUB_FETCH_FPF16B:
+ case BUILT_IN_ATOMIC_SUB_FETCH_FPF16:
+ case BUILT_IN_ATOMIC_SUB_FETCH_FPF32:
+ case BUILT_IN_ATOMIC_SUB_FETCH_FPF64:
+ case BUILT_IN_ATOMIC_SUB_FETCH_FPF128:
+ case BUILT_IN_ATOMIC_SUB_FETCH_FPF32X:
+ case BUILT_IN_ATOMIC_SUB_FETCH_FPF64X:
+ {
+ machine_mode int_mode
+ = get_builtin_fp_sync_mode (fcode - BUILT_IN_ATOMIC_SUB_FETCH_N, &mode);
+ target = expand_builtin_atomic_fetch_op (mode, exp, target, MINUS, true,
+ ignore, BUILT_IN_NONE, int_mode);
+ if (target)
+ return target;
+ break;
+ }
+
case BUILT_IN_ATOMIC_TEST_AND_SET:
target = expand_builtin_atomic_test_and_set (exp, target);
if (target)
@@ -7745,6 +7745,10 @@ expand_atomic_fetch_op_no_fallback (rtx target, rtx mem, rtx val,
if (result)
return result;
+ /* TODO For floating point, is there anything extra to worry about
+ w.r.t. rounding, i.e. is X + <some constant> guaranteed to be equal
+ to X - (-1 * <some constant>)? For IEEE formats negation is exact,
+ so the two should round identically, but I would not want to rely
+ on that without checking. */
/* If the fetch value can be calculated from the other variation of fetch,
try that operation. */
if (after || unused_result || optab.reverse_code != UNKNOWN)
@@ -7793,7 +7797,8 @@ expand_atomic_fetch_op_no_fallback (rtx target, rtx mem, rtx val,
AFTER is false to return the value before the operation (fetch_OP). */
rtx
expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
- enum memmodel model, bool after)
+ enum memmodel model, bool after,
+ machine_mode load_mode)
{
machine_mode mode = GET_MODE (mem);
rtx result;
@@ -7802,7 +7807,7 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
+ /* The pre-existing callers operate and load in the same mode and do not
+ pass LOAD_MODE, so treat VOIDmode as "same mode as MEM". */
+ if (load_mode == VOIDmode)
+ load_mode = mode;
+
/* If loads are not atomic for the required size and we are not called to
provide a __sync builtin, do not do anything so that we stay consistent
with atomic loads of the same size. */
- if (!can_atomic_load_p (mode) && !is_mm_sync (model))
+ if (!can_atomic_load_p (load_mode) && !is_mm_sync (model))
return NULL_RTX;
result = expand_atomic_fetch_op_no_fallback (target, mem, val, code, model,
@@ -7817,6 +7822,13 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
rtx tmp;
enum rtx_code reverse = (code == PLUS ? MINUS : PLUS);
+ /* TODO Need to double-check whether there is any floating point problem
+ with doing the reverse operation on a negated value.
+ (I don't know of a particular problem -- just have the feeling that
+ floating point transformations are tricky.)
+
+ FWIW I have the impression this is fine because GCC already optimizes
+ x + (-y) to x - y for floating point values. */
start_sequence ();
tmp = expand_simple_unop (mode, NEG, val, NULL_RTX, true);
result = expand_atomic_fetch_op_no_fallback (target, mem, tmp, reverse,
@@ -7835,7 +7847,7 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
}
/* Try the __sync libcalls only if we can't do compare-and-swap inline. */
- if (!can_compare_and_swap_p (mode, false))
+ if (!can_compare_and_swap_p (load_mode, false))
{
rtx libfunc;
bool fixup = false;
@@ -7870,11 +7882,41 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
code = orig_code;
}
- /* If nothing else has succeeded, default to a compare and swap loop. */
- if (can_compare_and_swap_p (mode, true))
+ /* If nothing else has succeeded, default to a compare and swap loop.
+
+ N.b. for modes where the compare and swap has to happen in a different
+ mode to the operation, we have to convert between the integral value
+ that the CAS loop uses and the mode that the operation is performed
+ in. This happens when load_mode != mode, e.g. where `mode` is a
+ floating point mode and `load_mode` is an integral one. */
+ if (can_compare_and_swap_p (load_mode, true))
{
+ /* The caller should have ensured this, but check anyway. */
+ gcc_assert (known_eq (GET_MODE_SIZE (mode), GET_MODE_SIZE (load_mode)));
+ poly_uint64 loadmode_bitsize = GET_MODE_SIZE (load_mode) * BITS_PER_UNIT;
rtx_insn *insn;
- rtx t0 = gen_reg_rtx (mode), t1;
+ rtx t0 = gen_reg_rtx (load_mode), t1;
+ rtx tmp = gen_reg_rtx (mode);
+ /* TODO Is there a better way than this to convert between
+ interpretations? We need bitwise interpretation because the atomic
+ memory operations are being performed on an integral register. */
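+ /* N.b. a same-size lowpart subreg (e.g. via gen_lowpart) might be
+ enough here on targets that allow float<->int subregs;
+ extract_bit_field is the more general route. */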
+ auto interpret_as_float
+ = [loadmode_bitsize, mode] (rtx target, rtx irtx) -> rtx {
+ rtx tmp = extract_bit_field (irtx, loadmode_bitsize, 0, true, target,
+ mode, mode, false, NULL);
+ if (tmp != target)
+ emit_move_insn (target, tmp);
+ return target;
+ };
+ auto interpret_as_int
+ = [loadmode_bitsize, load_mode] (rtx target, rtx frtx) -> rtx {
+ rtx tmp = extract_bit_field (frtx, loadmode_bitsize, 0, true, target,
+ load_mode, load_mode, false, NULL);
+ if (tmp != target)
+ emit_move_insn (target, tmp);
+ return target;
+ };
start_sequence ();
@@ -7885,7 +7927,12 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
target = gen_reg_rtx (mode);
/* If fetch_before, copy the value now. */
if (!after)
- emit_move_insn (target, t0);
+ {
+ if (load_mode == mode)
+ emit_move_insn (target, t0);
+ else
+ interpret_as_float (target, t0);
+ }
}
else
target = const0_rtx;
@@ -7897,18 +7944,52 @@ expand_atomic_fetch_op (rtx target, rtx mem, rtx val, enum rtx_code code,
true, OPTAB_LIB_WIDEN);
t1 = expand_simple_unop (mode, code, t1, NULL_RTX, true);
}
- else
+ else if (load_mode == mode)
t1 = expand_simple_binop (mode, code, t1, val, NULL_RTX, true,
OPTAB_LIB_WIDEN);
+ else
+ {
+ interpret_as_float (tmp, t1);
+ tmp = expand_simple_binop (mode, code, tmp, val, NULL_RTX, true,
+ OPTAB_LIB_WIDEN);
+ t1 = gen_reg_rtx (load_mode);
+ interpret_as_int (t1, tmp);
+ }
/* For after, copy the value now. */
if (!unused_result && after)
- emit_move_insn (target, t1);
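+ /* In the floating point case TMP holds the result in MODE and T1 its
+ integral image in LOAD_MODE; TARGET is in MODE. */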
+ emit_move_insn (target, load_mode == mode ? t1 : tmp);
insn = get_insns ();
end_sequence ();
+ /* Outside `expand_compare_and_swap_loop` (i.e. inside the `seq`) the
+ sequence built above does the following:
+ tmp (floating) = old_reg (integral)
+ tmp += val
+ new_reg (integral) = tmp (floating)
+ `expand_compare_and_swap_loop` then wraps the sequence it is given as
+ described at the top of its implementation:
+ cmp_reg = mem
+ label:
+ old_reg = cmp_reg;
+ tmp (floating) = old_reg (integral)
+ tmp += val
+ new_reg (integral) = tmp (floating)
+ (success, cmp_reg) = CAS(mem, old_reg, new_reg)
+ if (!success)
+ goto label;
+
+ A neater implementation would expand the MEM as an integral value
+ before it is passed into this function; then this function would not
+ have to know anything about the fact that the inner operation is a
+ floating point one.
+ - N.b. there is the question of whether we want the conversion
+ inside or outside the loop. I don't think it matters, though I
+ could easily be missing something here. */
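+ /* The CAS loop itself operates in LOAD_MODE, so use the integral view
+ of MEM. */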
+ mem = adjust_address (mem, load_mode, 0);
if (t1 != NULL && expand_compare_and_swap_loop (mem, t0, t1, insn))
- return target;
+ return target;
}
return NULL_RTX;
@@ -503,6 +503,7 @@ OPTAB_D (sync_sub_optab, "sync_sub$I$a")
OPTAB_D (sync_xor_optab, "sync_xor$I$a")
OPTAB_D (atomic_add_fetch_optab, "atomic_add_fetch$I$a")
+OPTAB_NX (atomic_add_fetch_optab, "atomic_add_fetch$F$a")
OPTAB_D (atomic_add_optab, "atomic_add$I$a")
OPTAB_D (atomic_and_fetch_optab, "atomic_and_fetch$I$a")
OPTAB_D (atomic_and_optab, "atomic_and$I$a")
@@ -511,11 +512,13 @@ OPTAB_D (atomic_bit_test_and_complement_optab, "atomic_bit_test_and_complement$I
OPTAB_D (atomic_bit_test_and_reset_optab, "atomic_bit_test_and_reset$I$a")
OPTAB_D (atomic_compare_and_swap_optab, "atomic_compare_and_swap$I$a")
OPTAB_D (atomic_exchange_optab, "atomic_exchange$I$a")
OPTAB_D (atomic_fetch_add_optab, "atomic_fetch_add$I$a")
+OPTAB_NX (atomic_fetch_add_optab, "atomic_fetch_add$F$a")
OPTAB_D (atomic_fetch_and_optab, "atomic_fetch_and$I$a")
OPTAB_D (atomic_fetch_nand_optab, "atomic_fetch_nand$I$a")
OPTAB_D (atomic_fetch_or_optab, "atomic_fetch_or$I$a")
OPTAB_D (atomic_fetch_sub_optab, "atomic_fetch_sub$I$a")
+OPTAB_NX (atomic_fetch_sub_optab, "atomic_fetch_sub$F$a")
OPTAB_D (atomic_fetch_xor_optab, "atomic_fetch_xor$I$a")
OPTAB_D (atomic_load_optab, "atomic_load$I$a")
OPTAB_D (atomic_nand_fetch_optab, "atomic_nand_fetch$I$a")
@@ -524,6 +527,7 @@ OPTAB_D (atomic_or_fetch_optab, "atomic_or_fetch$I$a")
OPTAB_D (atomic_or_optab, "atomic_or$I$a")
OPTAB_D (atomic_store_optab, "atomic_store$I$a")
OPTAB_D (atomic_sub_fetch_optab, "atomic_sub_fetch$I$a")
+OPTAB_NX (atomic_sub_fetch_optab, "atomic_sub_fetch$F$a")
OPTAB_D (atomic_sub_optab, "atomic_sub$I$a")
OPTAB_D (atomic_xor_fetch_optab, "atomic_xor_fetch$I$a")
OPTAB_D (atomic_xor_optab, "atomic_xor$I$a")
@@ -366,7 +366,7 @@ extern void expand_mem_signal_fence (enum memmodel);
rtx expand_atomic_load (rtx, rtx, enum memmodel);
rtx expand_atomic_store (rtx, rtx, enum memmodel, bool);
rtx expand_atomic_fetch_op (rtx, rtx, rtx, enum rtx_code, enum memmodel,
- bool);
+ bool, machine_mode load_mode = VOIDmode);
extern void expand_asm_reg_clobber_mem_blockage (HARD_REG_SET);