AArch64: Fix __sync_val_compare_and_swap [PR111404]
Checks

Context                                        | Check   | Description
-----------------------------------------------+---------+---------------
linaro-tcwg-bot/tcwg_gcc_build--master-arm     | success | Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm     | success | Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 | success | Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 | success | Testing passed
Commit Message
__sync_val_compare_and_swap may be used on 128-bit types and either calls the
outline atomic code or uses an inline loop. On AArch64 LDXP is only atomic if
the value is stored successfully using STXP, but the current implementations
do not perform the store if the comparison fails. In this case the value returned
is not read atomically.
Passes regress/bootstrap, OK for commit?
gcc/ChangeLog/
PR target/111404
* config/aarch64/aarch64.cc (aarch64_split_compare_and_swap):
For 128-bit store the loaded value and loop if needed.
libgcc/ChangeLog/
PR target/111404
* config/aarch64/lse.S (__aarch64_cas16_acq_rel): Execute STLXP using
either new value or loaded value.
---
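For concreteness, the affected usage pattern looks roughly like the sketch
below (a hypothetical example, not taken from the patch or the PR): the
caller relies on the 128-bit old value returned on a failed comparison
being a single atomic snapshot.

/* Hypothetical example of the affected pattern; built with gcc -O2 on
   aarch64.  */
typedef unsigned __int128 u128;

static u128 shared;   /* updated elsewhere with 128-bit atomic operations */

u128
read_shared_atomically (void)
{
  /* Compare against 0 and "swap in" 0: the comparison normally fails and
     the call is used purely to obtain an untorn copy of SHARED.  Without
     the fix, the inline LDXP loop (or __aarch64_cas16_acq_rel) can return
     the LDXP result without a matching STXP, so the two halves are not
     guaranteed to come from one atomic access.  */
  return __sync_val_compare_and_swap (&shared, (u128) 0, (u128) 0);
}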
Comments
On Wed, Sep 13, 2023 at 3:55 PM Wilco Dijkstra via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
> __sync_val_compare_and_swap may be used on 128-bit types and either calls the
> outline atomic code or uses an inline loop. On AArch64 LDXP is only atomic if
> the value is stored successfully using STXP, but the current implementations
> do not perform the store if the comparison fails. In this case the value returned
> is not read atomically.
IIRC, the previous discussions in this space revolved around the
difficulty with the store writing to read-only memory, which is why I
think we went with LDXP in this form.
Has something changed since then?
Reviewed-by: Ramana Radhakrishnan <ramana@gcc.gnu.org>
regards
Ramana
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog/
> PR target/111404
> * config/aarch64/aarch64.cc (aarch64_split_compare_and_swap):
> For 128-bit store the loaded value and loop if needed.
>
> libgcc/ChangeLog/
> PR target/111404
> * config/aarch64/lse.S (__aarch64_cas16_acq_rel): Execute STLXP using
> either new value or loaded value.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 5e8d0a0c91bc7719de2a8c5627b354cf905a4db0..c44c0b979d0cc3755c61dcf566cfddedccebf1ea 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23413,11 +23413,11 @@ aarch64_split_compare_and_swap (rtx operands[])
> mem = operands[1];
> oldval = operands[2];
> newval = operands[3];
> - is_weak = (operands[4] != const0_rtx);
> model_rtx = operands[5];
> scratch = operands[7];
> mode = GET_MODE (mem);
> model = memmodel_from_int (INTVAL (model_rtx));
> + is_weak = operands[4] != const0_rtx && mode != TImode;
>
> /* When OLDVAL is zero and we want the strong version we can emit a tighter
> loop:
> @@ -23478,6 +23478,33 @@ aarch64_split_compare_and_swap (rtx operands[])
> else
> aarch64_gen_compare_reg (NE, scratch, const0_rtx);
>
> + /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
> + store the returned value and loop if the STLXP fails. */
> + if (mode == TImode)
> + {
> + rtx_code_label *label3 = gen_label_rtx ();
> + emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
> + emit_barrier ();
> +
> + emit_label (label2);
> + aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
> +
> + if (aarch64_track_speculation)
> + {
> + /* Emit an explicit compare instruction, so that we can correctly
> + track the condition codes. */
> + rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
> + x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
> + }
> + else
> + x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
> + x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
> + gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
> + aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
> +
> + label2 = label3;
> + }
> +
> emit_label (label2);
>
> /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
> diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
> index dde3a28e07b13669533dfc5e8fac0a9a6ac33dbd..ba05047ff02b6fc5752235bffa924fc4a2f48c04 100644
> --- a/libgcc/config/aarch64/lse.S
> +++ b/libgcc/config/aarch64/lse.S
> @@ -160,6 +160,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
> #define tmp0 16
> #define tmp1 17
> #define tmp2 15
> +#define tmp3 14
> +#define tmp4 13
>
> #define BTI_C hint 34
>
> @@ -233,10 +235,11 @@ STARTFN NAME(cas)
> 0: LDXP x0, x1, [x4]
> cmp x0, x(tmp0)
> ccmp x1, x(tmp1), #0, eq
> - bne 1f
> - STXP w(tmp2), x2, x3, [x4]
> - cbnz w(tmp2), 0b
> -1: BARRIER
> + csel x(tmp2), x2, x0, eq
> + csel x(tmp3), x3, x1, eq
> + STXP w(tmp4), x(tmp2), x(tmp3), [x4]
> + cbnz w(tmp4), 0b
> + BARRIER
> ret
>
> #endif
>
Hi Ramana,
>> __sync_val_compare_and_swap may be used on 128-bit types and either calls the
>> outline atomic code or uses an inline loop. On AArch64 LDXP is only atomic if
>> the value is stored successfully using STXP, but the current implementations
>> do not perform the store if the comparison fails. In this case the value returned
>> is not read atomically.
>
> IIRC, the previous discussions in this space revolved around the
> difficulty with the store writing to readonly memory which is why I
> think we went with LDXP in this form.
That's not related to this patch - this fixes a serious atomicity bug that may
affect the Linux kernel since it uses the older sync primitives. Given that LDXP
is not atomic on its own, you have to execute the STXP even in the failure case.
Note that you can't rely on compare&swap not to write memory: load-exclusive
loops may either always write or avoid writes in the failure case if the load is
atomic. CAS instructions always write.
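As an illustration of the always-store approach, here is a rough C plus
inline-assembly sketch of the same csel-based idea as the lse.S change in
this patch (a sketch only, not the actual libgcc code):

/* Sketch only: a 128-bit CAS built from LDAXP/STLXP that executes the
   store-exclusive even when the comparison fails, writing back the loaded
   value so the LDAXP result is known to be atomic.  */
static inline unsigned __int128
cas128_always_store (unsigned __int128 *ptr,
                     unsigned __int128 expected,
                     unsigned __int128 desired)
{
  unsigned long exp_lo = (unsigned long) expected;
  unsigned long exp_hi = (unsigned long) (expected >> 64);
  unsigned long des_lo = (unsigned long) desired;
  unsigned long des_hi = (unsigned long) (desired >> 64);
  unsigned long old_lo, old_hi, st_lo, st_hi;
  unsigned int status;

  __asm__ volatile (
    "0:\n\t"
    "ldaxp  %[old_lo], %[old_hi], %[mem]\n\t"
    "cmp    %[old_lo], %[exp_lo]\n\t"
    "ccmp   %[old_hi], %[exp_hi], #0, eq\n\t"
    /* On a mismatch, store back the loaded value instead of the new one.  */
    "csel   %[st_lo], %[des_lo], %[old_lo], eq\n\t"
    "csel   %[st_hi], %[des_hi], %[old_hi], eq\n\t"
    "stlxp  %w[status], %[st_lo], %[st_hi], %[mem]\n\t"
    "cbnz   %w[status], 0b"
    : [old_lo] "=&r" (old_lo), [old_hi] "=&r" (old_hi),
      [st_lo] "=&r" (st_lo), [st_hi] "=&r" (st_hi),
      [status] "=&r" (status), [mem] "+Q" (*ptr)
    : [exp_lo] "r" (exp_lo), [exp_hi] "r" (exp_hi),
      [des_lo] "r" (des_lo), [des_hi] "r" (des_hi)
    : "memory", "cc");

  return ((unsigned __int128) old_hi << 64) | old_lo;
}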
> Has something changed from then ?
Yes, we now know that using locking atomics was a bad decision. Developers
actually require efficient and lock-free atomics. Since we didn't support them,
many applications were forced to add their own atomic implementations using
hacky inline assembler. It also resulted in a nasty ABI incompatibility between
GCC and LLVM. Yes - atomics are part of the ABI!
All that is much worse than worrying about a theoretical corner case that
can't happen in real applications - atomics only work on writeable memory
since their purpose is to synchronize reads with writes.
Cheers,
Wilco
Hi Wilco,
Thanks for your email.
On Tue, Sep 26, 2023 at 12:07 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi Ramana,
>
> >> __sync_val_compare_and_swap may be used on 128-bit types and either calls the
> >> outline atomic code or uses an inline loop. On AArch64 LDXP is only atomic if
> >> the value is stored successfully using STXP, but the current implementations
> >> do not perform the store if the comparison fails. In this case the value returned
> >> is not read atomically.
> >
> > IIRC, the previous discussions in this space revolved around the
> > difficulty with the store writing to readonly memory which is why I
> > think we went with LDXP in this form.
>
> That's not related to this patch - this fixes a serious atomicity bug that may
> affect the Linux kernel since it uses the older sync primitives. Given that LDXP
> is not atomic on its own, you have to execute the STXP even in the failure case.
> Note that you can't rely on compare&swap not to write memory: load-exclusive
> loops may either always write or avoid writes in the failure case if the load is
> atomic. CAS instructions always write.
>
I am aware of the capabilities of the architecture.
> > Has something changed from then ?
>
> Yes, we now know that using locking atomics was a bad decision. Developers
> actually require efficient and lock-free atomics. Since we didn't support them,
> many applications were forced to add their own atomic implementations using
> hacky inline assembler. It also resulted in a nasty ABI incompatibility between
> GCC and LLVM. Yes - atomics are part of the ABI!
I agree that atomics are part of the ABI.
>
> All that is much worse than worrying about a theoretical corner case that
> can't happen in real applications - atomics only work on writeable memory
> since their purpose is to synchronize reads with writes.
I remember the previous discussions and the common understanding being these:
https://gcc.gnu.org/legacy-ml/gcc/2016-06/msg00017.html
and here
https://gcc.gnu.org/legacy-ml/gcc-patches/2017-02/msg00168.html
Can you point me at any recent discussion that shows this has changed?
I can't find it in my searches. Perhaps you've had the discussion
somewhere that shows it has changed.
regards
Ramana
>
> Cheers,
> Wilco
ping
__sync_val_compare_and_swap may be used on 128-bit types and either calls the
outline atomic code or uses an inline loop. On AArch64 LDXP is only atomic if
the value is stored successfully using STXP, but the current implementations
do not perform the store if the comparison fails. In this case the value returned
is not read atomically.
Passes regress/bootstrap, OK for commit?
gcc/ChangeLog/
PR target/111404
* config/aarch64/aarch64.cc (aarch64_split_compare_and_swap):
For 128-bit store the loaded value and loop if needed.
libgcc/ChangeLog/
PR target/111404
* config/aarch64/lse.S (__aarch64_cas16_acq_rel): Execute STLXP using
either new value or loaded value.
---
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5e8d0a0c91bc7719de2a8c5627b354cf905a4db0..c44c0b979d0cc3755c61dcf566cfddedccebf1ea 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23413,11 +23413,11 @@ aarch64_split_compare_and_swap (rtx operands[])
mem = operands[1];
oldval = operands[2];
newval = operands[3];
- is_weak = (operands[4] != const0_rtx);
model_rtx = operands[5];
scratch = operands[7];
mode = GET_MODE (mem);
model = memmodel_from_int (INTVAL (model_rtx));
+ is_weak = operands[4] != const0_rtx && mode != TImode;
/* When OLDVAL is zero and we want the strong version we can emit a tighter
loop:
@@ -23478,6 +23478,33 @@ aarch64_split_compare_and_swap (rtx operands[])
else
aarch64_gen_compare_reg (NE, scratch, const0_rtx);
+ /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
+ store the returned value and loop if the STLXP fails. */
+ if (mode == TImode)
+ {
+ rtx_code_label *label3 = gen_label_rtx ();
+ emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
+ emit_barrier ();
+
+ emit_label (label2);
+ aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
+
+ if (aarch64_track_speculation)
+ {
+ /* Emit an explicit compare instruction, so that we can correctly
+ track the condition codes. */
+ rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
+ x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
+ }
+ else
+ x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
+ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+ gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
+ aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+
+ label2 = label3;
+ }
+
emit_label (label2);
/* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index dde3a28e07b13669533dfc5e8fac0a9a6ac33dbd..ba05047ff02b6fc5752235bffa924fc4a2f48c04 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -160,6 +160,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#define tmp0 16
#define tmp1 17
#define tmp2 15
+#define tmp3 14
+#define tmp4 13
#define BTI_C hint 34
@@ -233,10 +235,11 @@ STARTFN NAME(cas)
0: LDXP x0, x1, [x4]
cmp x0, x(tmp0)
ccmp x1, x(tmp1), #0, eq
- bne 1f
- STXP w(tmp2), x2, x3, [x4]
- cbnz w(tmp2), 0b
-1: BARRIER
+ csel x(tmp2), x2, x0, eq
+ csel x(tmp3), x3, x1, eq
+ STXP w(tmp4), x(tmp2), x(tmp3), [x4]
+ cbnz w(tmp4), 0b
+ BARRIER
ret
#endif
Hi Ramana,
> I remember this to be the previous discussions and common understanding.
>
> https://gcc.gnu.org/legacy-ml/gcc/2016-06/msg00017.html
>
> and here
>
> https://gcc.gnu.org/legacy-ml/gcc-patches/2017-02/msg00168.html
>
> Can you point any discussion recently that shows this has changed and
> point me at that discussion if any anywhere ? I can't find it in my
> searches . Perhaps you've had the discussion some place to show it has
> changed.
Here are some more recent discussions about atomics; e.g. this one has
good arguments from developers wanting lock-free atomics:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80878
We also had some discussion about how we could handle the read-only corner
case, either by giving a warning/error on const pointers to atomics or by
ensuring _Atomic variables are writeable:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108659
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109553
My conclusion from that is that nobody cared enough to fix this for x86
in all these years, so it's not seen as an important issue.
We've had several internal design discussions to figure out how to fix the ABI
issues. The conclusion was that this is the only possible solution that makes
GCC and LLVM compatible without breaking backwards compatibility. It also
allows use of newer atomic instructions (which people want inlined).
Cheers,
Wilco
ping
__sync_val_compare_and_swap may be used on 128-bit types and either calls the
outline atomic code or uses an inline loop. On AArch64 LDXP is only atomic if
the value is stored successfully using STXP, but the current implementations
do not perform the store if the comparison fails. In this case the value returned
is not read atomically.
Passes regress/bootstrap, OK for commit?
gcc/ChangeLog/
PR target/111404
* config/aarch64/aarch64.cc (aarch64_split_compare_and_swap):
For 128-bit store the loaded value and loop if needed.
libgcc/ChangeLog/
PR target/111404
* config/aarch64/lse.S (__aarch64_cas16_acq_rel): Execute STLXP using
either new value or loaded value.
---
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5e8d0a0c91bc7719de2a8c5627b354cf905a4db0..c44c0b979d0cc3755c61dcf566cfddedccebf1ea 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23413,11 +23413,11 @@ aarch64_split_compare_and_swap (rtx operands[])
mem = operands[1];
oldval = operands[2];
newval = operands[3];
- is_weak = (operands[4] != const0_rtx);
model_rtx = operands[5];
scratch = operands[7];
mode = GET_MODE (mem);
model = memmodel_from_int (INTVAL (model_rtx));
+ is_weak = operands[4] != const0_rtx && mode != TImode;
/* When OLDVAL is zero and we want the strong version we can emit a tighter
loop:
@@ -23478,6 +23478,33 @@ aarch64_split_compare_and_swap (rtx operands[])
else
aarch64_gen_compare_reg (NE, scratch, const0_rtx);
+ /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
+ store the returned value and loop if the STLXP fails. */
+ if (mode == TImode)
+ {
+ rtx_code_label *label3 = gen_label_rtx ();
+ emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
+ emit_barrier ();
+
+ emit_label (label2);
+ aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
+
+ if (aarch64_track_speculation)
+ {
+ /* Emit an explicit compare instruction, so that we can correctly
+ track the condition codes. */
+ rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
+ x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
+ }
+ else
+ x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
+ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+ gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
+ aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+
+ label2 = label3;
+ }
+
emit_label (label2);
/* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index dde3a28e07b13669533dfc5e8fac0a9a6ac33dbd..ba05047ff02b6fc5752235bffa924fc4a2f48c04 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -160,6 +160,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#define tmp0 16
#define tmp1 17
#define tmp2 15
+#define tmp3 14
+#define tmp4 13
#define BTI_C hint 34
@@ -233,10 +235,11 @@ STARTFN NAME(cas)
0: LDXP x0, x1, [x4]
cmp x0, x(tmp0)
ccmp x1, x(tmp1), #0, eq
- bne 1f
- STXP w(tmp2), x2, x3, [x4]
- cbnz w(tmp2), 0b
-1: BARRIER
+ csel x(tmp2), x2, x0, eq
+ csel x(tmp3), x3, x1, eq
+ STXP w(tmp4), x(tmp2), x(tmp3), [x4]
+ cbnz w(tmp4), 0b
+ BARRIER
ret
#endif
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> __sync_val_compare_and_swap may be used on 128-bit types and either calls the
> outline atomic code or uses an inline loop. On AArch64 LDXP is only atomic if
> the value is stored successfully using STXP, but the current implementations
> do not perform the store if the comparison fails. In this case the value returned
> is not read atomically.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog/
> PR target/111404
> * config/aarch64/aarch64.cc (aarch64_split_compare_and_swap):
> For 128-bit store the loaded value and loop if needed.
>
> libgcc/ChangeLog/
> PR target/111404
> * config/aarch64/lse.S (__aarch64_cas16_acq_rel): Execute STLXP using
> either new value or loaded value.
Thanks for catching this.
The new aarch64_split_compare_and_swap code looks a bit twisty.
The approach in lse.S seems more obvious. But I'm guessing you
didn't want to spend any time restructuring the pre-LSE
-mno-outline-atomics code, and I agree the patch in its current
form is more backportable.
So the patch is OK, thanks. Sorry for the delay.
I suppose we might want to backport this after it has been in trunk
for a bit.
Richard
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 5e8d0a0c91bc7719de2a8c5627b354cf905a4db0..c44c0b979d0cc3755c61dcf566cfddedccebf1ea 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23413,11 +23413,11 @@ aarch64_split_compare_and_swap (rtx operands[])
> mem = operands[1];
> oldval = operands[2];
> newval = operands[3];
> - is_weak = (operands[4] != const0_rtx);
> model_rtx = operands[5];
> scratch = operands[7];
> mode = GET_MODE (mem);
> model = memmodel_from_int (INTVAL (model_rtx));
> + is_weak = operands[4] != const0_rtx && mode != TImode;
>
> /* When OLDVAL is zero and we want the strong version we can emit a tighter
> loop:
> @@ -23478,6 +23478,33 @@ aarch64_split_compare_and_swap (rtx operands[])
> else
> aarch64_gen_compare_reg (NE, scratch, const0_rtx);
>
> + /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
> + store the returned value and loop if the STLXP fails. */
> + if (mode == TImode)
> + {
> + rtx_code_label *label3 = gen_label_rtx ();
> + emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
> + emit_barrier ();
> +
> + emit_label (label2);
> + aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
> +
> + if (aarch64_track_speculation)
> + {
> + /* Emit an explicit compare instruction, so that we can correctly
> + track the condition codes. */
> + rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
> + x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
> + }
> + else
> + x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
> + x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
> + gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
> + aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
> +
> + label2 = label3;
> + }
> +
> emit_label (label2);
>
> /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
> diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
> index dde3a28e07b13669533dfc5e8fac0a9a6ac33dbd..ba05047ff02b6fc5752235bffa924fc4a2f48c04 100644
> --- a/libgcc/config/aarch64/lse.S
> +++ b/libgcc/config/aarch64/lse.S
> @@ -160,6 +160,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
> #define tmp0 16
> #define tmp1 17
> #define tmp2 15
> +#define tmp3 14
> +#define tmp4 13
>
> #define BTI_C hint 34
>
> @@ -233,10 +235,11 @@ STARTFN NAME(cas)
> 0: LDXP x0, x1, [x4]
> cmp x0, x(tmp0)
> ccmp x1, x(tmp1), #0, eq
> - bne 1f
> - STXP w(tmp2), x2, x3, [x4]
> - cbnz w(tmp2), 0b
> -1: BARRIER
> + csel x(tmp2), x2, x0, eq
> + csel x(tmp3), x3, x1, eq
> + STXP w(tmp4), x(tmp2), x(tmp3), [x4]
> + cbnz w(tmp4), 0b
> + BARRIER
> ret
>
> #endif
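A standalone way to exercise the property being fixed might look like the
following (a hypothetical stress test, not part of the GCC testsuite or of
this patch): one thread publishes 128-bit values whose halves always
match, another reads them through a failing __sync_val_compare_and_swap
and aborts on any torn result.

/* Hypothetical test, names invented for illustration.
   Compile on aarch64 with: gcc -O2 -pthread torn128.c  */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef unsigned __int128 u128;

static u128 shared;

static void *
writer (void *arg)
{
  (void) arg;
  u128 old = 0;
  for (unsigned long i = 1;; i++)
    {
      /* Publish a value whose low and high halves are always equal.  */
      u128 v = ((u128) i << 64) | i;
      u128 prev;
      /* Single writer, so this CAS loop normally succeeds first try.  */
      while ((prev = __sync_val_compare_and_swap (&shared, old, v)) != old)
        old = prev;
      old = v;
    }
  return NULL;
}

int
main (void)
{
  pthread_t t;
  pthread_create (&t, NULL, writer, NULL);

  for (unsigned long i = 0; i < 100000000UL; i++)
    {
      /* The comparison (almost) always fails; the returned old value must
         still be a single atomic 128-bit snapshot.  */
      u128 old = __sync_val_compare_and_swap (&shared, (u128) 0, (u128) 0);
      if ((unsigned long) old != (unsigned long) (old >> 64))
        {
          fprintf (stderr, "torn 128-bit read: lo=%lu hi=%lu\n",
                   (unsigned long) old, (unsigned long) (old >> 64));
          abort ();
        }
    }
  return 0;
}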
Hi Richard,
Thanks for the review, now committed.
> The new aarch64_split_compare_and_swap code looks a bit twisty.
> The approach in lse.S seems more obvious. But I'm guessing you
> didn't want to spend any time restructuring the pre-LSE
> -mno-outline-atomics code, and I agree the patch in its current
> form is more backportable.
Indeed this code needs cleaning up - all the complex speculation stuff
should be behind a simple interface. I was thinking of emitting CSEL
here but it would require adding new TI mode patterns or manually
splitting into low/high parts and emitting CSEL.
> I suppose we might want to backport this after it has been in trunk
> for a bit.
Yes that was my plan.
Cheers,
Wilco