libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

Message ID PAWPR08MB898223DE66773176183922A9834EA@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Series libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Testing passed

Commit Message

Wilco Dijkstra June 2, 2023, 5:28 p.m. UTC
  Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries, gives better performance than locking atomics and is what
most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not supported.
This results in an implicit store which is invisible to software as long as the given
address is writeable (which will be true when using atomics in actual code).
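
As an illustrative sketch (not part of the patch): the user code affected is
GCC's __atomic builtins on a 16-byte type, whose out-of-line calls land in
libatomic.  The type alias and function names below are invented for the
example.

  /* Illustrative only.  On baseline ARMv8.0 these calls reach the new
     LL/SC entry points libat_load_16 and libat_store_16.  */
  typedef unsigned __int128 u128;

  u128
  read_counter (u128 *p)
  {
    /* Without LSE2 this becomes an LDXP/STXP loop, so the location must be
       writable even though this is logically only a load.  */
    return __atomic_load_n (p, __ATOMIC_ACQUIRE);
  }

  void
  publish (u128 *p, u128 v)
  {
    __atomic_store_n (p, v, __ATOMIC_RELEASE);
  }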

A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.

Passes regress, OK for commit?

libatomic/
        PR target/110061
        * config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
        * config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
        State we have lock-free atomics.

---
  

Comments

Wilco Dijkstra June 16, 2023, 12:17 p.m. UTC | #1
ping

From: Wilco Dijkstra
Sent: 02 June 2023 18:28
To: GCC Patches <gcc-patches@gcc.gnu.org>
Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061] 
 

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries, gives better performance than locking atomics and is what
most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not supported.
This results in an implicit store which is invisible to software as long as the given
address is writeable (which will be true when using atomics in actual code).

A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.

Passes regress, OK for commit?

libatomic/
        PR target/110061
        * config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
        * config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
        State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,21 @@
    <http://www.gnu.org/licenses/>.  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries and gives better
+   performance than locking atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat_<op>_16 entry points are ARMv8.0.
+   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
+
+
         .arch   armv8-a+lse
 
 #define ENTRY(name)             \
@@ -37,6 +52,10 @@ name:                                \
         .cfi_endproc;           \
         .size name, .-name;
 
+#define ALIAS(alias,name)      \
+       .global alias;          \
+       .set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +89,24 @@ name:                                \
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+       mov     x5, x0
+       cbnz    w1, 2f
+
+       /* RELAXED.  */
+1:     ldxp    res0, res1, [x5]
+       stxp    w4, res0, res1, [x5]
+       cbnz    w4, 1b
+       ret
+
+       /* ACQUIRE/CONSUME/SEQ_CST.  */
+2:     ldaxp   res0, res1, [x5]
+       stxp    w4, res0, res1, [x5]
+       cbnz    w4, 2b
+       ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
         cbnz    w1, 1f
 
@@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+       cbnz    w4, 2f
+
+       /* RELAXED.  */
+1:     ldxp    xzr, tmp0, [x0]
+       stxp    w4, in0, in1, [x0]
+       cbnz    w4, 1b
+       ret
+
+       /* RELEASE/SEQ_CST.  */
+2:     ldxp    xzr, tmp0, [x0]
+       stlxp   w4, in0, in1, [x0]
+       cbnz    w4, 2b
+       ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
         cbnz    w4, 1f
 
@@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
         ret
 
         /* RELEASE/SEQ_CST.  */
-1:     ldaxp   xzr, tmp0, [x0]
+1:     ldxp    xzr, tmp0, [x0]
         stlxp   w4, in0, in1, [x0]
         cbnz    w4, 1b
         ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
         stxp    w4, in0, in1, [x5]
         cbnz    w4, 3b
         ret
-4:
-       cmp     w4, RELEASE
-       b.ne    6f
 
-       /* RELEASE.  */
-5:     ldxp    res0, res1, [x5]
+       /* RELEASE/ACQ_REL/SEQ_CST.  */
+4:     ldaxp   res0, res1, [x5]
         stlxp   w4, in0, in1, [x5]
-       cbnz    w4, 5b
+       cbnz    w4, 4b
         ret
+END (libat_exchange_16)
 
-       /* ACQ_REL/SEQ_CST.  */
-6:     ldaxp   res0, res1, [x5]
-       stlxp   w4, in0, in1, [x5]
-       cbnz    w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+       ldp     exp0, exp1, [x1]
+       cbz     w4, 3f
+       cmp     w4, RELEASE
+       b.hs    4f
+
+       /* ACQUIRE/CONSUME.  */
+1:     ldaxp   tmp0, tmp1, [x0]
+       cmp     tmp0, exp0
+       ccmp    tmp1, exp1, 0, eq
+       bne     2f
+       stxp    w4, in0, in1, [x0]
+       cbnz    w4, 1b
+       mov     x0, 1
         ret
-END (libat_exchange_16_i1)
+
+2:     stp     tmp0, tmp1, [x1]
+       mov     x0, 0
+       ret
+
+       /* RELAXED.  */
+3:     ldxp    tmp0, tmp1, [x0]
+       cmp     tmp0, exp0
+       ccmp    tmp1, exp1, 0, eq
+       bne     2b
+       stxp    w4, in0, in1, [x0]
+       cbnz    w4, 3b
+       mov     x0, 1
+       ret
+
+       /* RELEASE/ACQ_REL/SEQ_CST.  */
+4:     ldaxp   tmp0, tmp1, [x0]
+       cmp     tmp0, exp0
+       ccmp    tmp1, exp1, 0, eq
+       bne     2b
+       stlxp   w4, in0, in1, [x0]
+       cbnz    w4, 4b
+       mov     x0, 1
+       ret
+END (libat_compare_exchange_16)
 
 
 ENTRY (libat_compare_exchange_16_i1)
@@ -180,7 +267,7 @@ ENTRY (libat_compare_exchange_16_i1)
 END (libat_compare_exchange_16_i1)
 
 
-ENTRY (libat_fetch_add_16_i1)
+ENTRY (libat_fetch_add_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -199,10 +286,10 @@ ENTRY (libat_fetch_add_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_add_16_i1)
+END (libat_fetch_add_16)
 
 
-ENTRY (libat_add_fetch_16_i1)
+ENTRY (libat_add_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -221,10 +308,10 @@ ENTRY (libat_add_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_add_fetch_16_i1)
+END (libat_add_fetch_16)
 
 
-ENTRY (libat_fetch_sub_16_i1)
+ENTRY (libat_fetch_sub_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -243,10 +330,10 @@ ENTRY (libat_fetch_sub_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_sub_16_i1)
+END (libat_fetch_sub_16)
 
 
-ENTRY (libat_sub_fetch_16_i1)
+ENTRY (libat_sub_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -265,10 +352,10 @@ ENTRY (libat_sub_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_sub_fetch_16_i1)
+END (libat_sub_fetch_16)
 
 
-ENTRY (libat_fetch_or_16_i1)
+ENTRY (libat_fetch_or_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -287,10 +374,10 @@ ENTRY (libat_fetch_or_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_or_16_i1)
+END (libat_fetch_or_16)
 
 
-ENTRY (libat_or_fetch_16_i1)
+ENTRY (libat_or_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -309,10 +396,10 @@ ENTRY (libat_or_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_or_fetch_16_i1)
+END (libat_or_fetch_16)
 
 
-ENTRY (libat_fetch_and_16_i1)
+ENTRY (libat_fetch_and_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -331,10 +418,10 @@ ENTRY (libat_fetch_and_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_and_16_i1)
+END (libat_fetch_and_16)
 
 
-ENTRY (libat_and_fetch_16_i1)
+ENTRY (libat_and_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -353,10 +440,10 @@ ENTRY (libat_and_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_and_fetch_16_i1)
+END (libat_and_fetch_16)
 
 
-ENTRY (libat_fetch_xor_16_i1)
+ENTRY (libat_fetch_xor_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -375,10 +462,10 @@ ENTRY (libat_fetch_xor_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_xor_16_i1)
+END (libat_fetch_xor_16)
 
 
-ENTRY (libat_xor_fetch_16_i1)
+ENTRY (libat_xor_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -397,10 +484,10 @@ ENTRY (libat_xor_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_xor_fetch_16_i1)
+END (libat_xor_fetch_16)
 
 
-ENTRY (libat_fetch_nand_16_i1)
+ENTRY (libat_fetch_nand_16)
         mov     x5, x0
         mvn     in0, in0
         mvn     in1, in1
@@ -421,10 +508,10 @@ ENTRY (libat_fetch_nand_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_nand_16_i1)
+END (libat_fetch_nand_16)
 
 
-ENTRY (libat_nand_fetch_16_i1)
+ENTRY (libat_nand_fetch_16)
         mov     x5, x0
         mvn     in0, in0
         mvn     in1, in1
@@ -445,21 +532,38 @@ ENTRY (libat_nand_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_nand_fetch_16_i1)
+END (libat_nand_fetch_16)
 
 
-ENTRY (libat_test_and_set_16_i1)
-       mov     w2, 1
-       cbnz    w1, 2f
-
-       /* RELAXED.  */
-       swpb    w0, w2, [x0]
-       ret
+/* __atomic_test_and_set is always inlined, so this entry is unused and
+   only required for completeness.  */
+ENTRY (libat_test_and_set_16)
 
-       /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:     swpalb  w0, w2, [x0]
+       /* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+       mov     x5, x0
+1:     ldaxrb  w0, [x5]
+       stlxrb  w4, w2, [x5]
+       cbnz    w4, 1b
         ret
-END (libat_test_and_set_16_i1)
+END (libat_test_and_set_16)
+
+
+/* Alias entry points which are the same in baseline and LSE2.  */
+
+ALIAS (libat_exchange_16_i1, libat_exchange_16)
+ALIAS (libat_fetch_add_16_i1, libat_fetch_add_16)
+ALIAS (libat_add_fetch_16_i1, libat_add_fetch_16)
+ALIAS (libat_fetch_sub_16_i1, libat_fetch_sub_16)
+ALIAS (libat_sub_fetch_16_i1, libat_sub_fetch_16)
+ALIAS (libat_fetch_or_16_i1, libat_fetch_or_16)
+ALIAS (libat_or_fetch_16_i1, libat_or_fetch_16)
+ALIAS (libat_fetch_and_16_i1, libat_fetch_and_16)
+ALIAS (libat_and_fetch_16_i1, libat_and_fetch_16)
+ALIAS (libat_fetch_xor_16_i1, libat_fetch_xor_16)
+ALIAS (libat_xor_fetch_16_i1, libat_xor_fetch_16)
+ALIAS (libat_fetch_nand_16_i1, libat_fetch_nand_16)
+ALIAS (libat_nand_fetch_16_i1, libat_nand_fetch_16)
+ALIAS (libat_test_and_set_16_i1, libat_test_and_set_16)
 
 
 /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code.  */
diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
index bea26825b4f75bb8ff348ab4b5fc45f4a5bd561e..851c78c01cd643318aaa52929ce4550266238b79 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -35,10 +35,19 @@
 #endif
 #define IFUNC_NCOND(N)  (1)
 
-#if N == 16 && IFUNC_ALT != 0
+#endif /* HAVE_IFUNC */
+
+/* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */
+#if N == 16
 # define DONE 1
 #endif
 
-#endif /* HAVE_IFUNC */
+/* State we have lock-free 128-bit atomics.  */
+#undef FAST_ATOMIC_LDST_16
+#define FAST_ATOMIC_LDST_16            1
+#undef MAYBE_HAVE_ATOMIC_CAS_16
+#define MAYBE_HAVE_ATOMIC_CAS_16       1
+#undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
+#define MAYBE_HAVE_ATOMIC_EXCHANGE_16  1
 
 #include_next <host-config.h>
  
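(Not part of the patch -- a minimal usage sketch.)  A 128-bit compare-exchange
through the builtins ends up in the new libat_compare_exchange_16 above (or
the _i1 variant when LSE2 is available); the success memory order selects
between its RELAXED, ACQUIRE/CONSUME and RELEASE/ACQ_REL/SEQ_CST paths.  The
names here are invented for the example.

  typedef unsigned __int128 u128;

  int
  try_swap (u128 *p, u128 *expected, u128 desired)
  {
    /* ACQ_REL on success takes the LDAXP/STLXP path; on failure the current
       value is written back to *expected and 0 is returned.  */
    return __atomic_compare_exchange_n (p, expected, desired, /* weak */ 0,
                                        __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
  }
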
Wilco Dijkstra July 5, 2023, 5:19 p.m. UTC | #2
ping

Wilco Dijkstra Aug. 4, 2023, 3:07 p.m. UTC | #3
ping

Wilco Dijkstra Sept. 13, 2023, 2:57 p.m. UTC | #4
ping

Wilco Dijkstra Oct. 16, 2023, 12:50 p.m. UTC | #5
ping

Wilco Dijkstra Nov. 6, 2023, 12:13 p.m. UTC | #6
ping

From: Wilco Dijkstra
Sent: 02 June 2023 18:28
To: GCC Patches <gcc-patches@gcc.gnu.org>
Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061] 
 

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries, gives better performance than locking atomics and is what
most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not supported.
This results in an implicit store which is invisible to software as long as the given
address is writeable (which will be true when using atomics in actual code).

A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.

Passes regress, OK for commit?

libatomic/
        PR target/110061
        config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
        config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
        State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,21 @@
    <http://www.gnu.org/licenses/>.  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries and gives better
+   performance than locking atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat_<op>_16 entry points are ARMv8.0.
+   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
+
+
         .arch   armv8-a+lse
 
 #define ENTRY(name)             \
@@ -37,6 +52,10 @@ name:                                \
         .cfi_endproc;           \
         .size name, .-name;
 
+#define ALIAS(alias,name)      \
+       .global alias;          \
+       .set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +89,24 @@ name:                                \
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+       mov     x5, x0
+       cbnz    w1, 2f
+
+       /* RELAXED.  */
+1:     ldxp    res0, res1, [x5]
+       stxp    w4, res0, res1, [x5]
+       cbnz    w4, 1b
+       ret
+
+       /* ACQUIRE/CONSUME/SEQ_CST.  */
+2:     ldaxp   res0, res1, [x5]
+       stxp    w4, res0, res1, [x5]
+       cbnz    w4, 2b
+       ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
         cbnz    w1, 1f
 
@@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+       cbnz    w4, 2f
+
+       /* RELAXED.  */
+1:     ldxp    xzr, tmp0, [x0]
+       stxp    w4, in0, in1, [x0]
+       cbnz    w4, 1b
+       ret
+
+       /* RELEASE/SEQ_CST.  */
+2:     ldxp    xzr, tmp0, [x0]
+       stlxp   w4, in0, in1, [x0]
+       cbnz    w4, 2b
+       ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
         cbnz    w4, 1f
 
@@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
         ret
 
         /* RELEASE/SEQ_CST.  */
-1:     ldaxp   xzr, tmp0, [x0]
+1:     ldxp    xzr, tmp0, [x0]
         stlxp   w4, in0, in1, [x0]
         cbnz    w4, 1b
         ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
         stxp    w4, in0, in1, [x5]
         cbnz    w4, 3b
         ret
-4:
-       cmp     w4, RELEASE
-       b.ne    6f
 
-       /* RELEASE.  */
-5:     ldxp    res0, res1, [x5]
+       /* RELEASE/ACQ_REL/SEQ_CST.  */
+4:     ldaxp   res0, res1, [x5]
         stlxp   w4, in0, in1, [x5]
-       cbnz    w4, 5b
+       cbnz    w4, 4b
         ret
+END (libat_exchange_16)
 
-       /* ACQ_REL/SEQ_CST.  */
-6:     ldaxp   res0, res1, [x5]
-       stlxp   w4, in0, in1, [x5]
-       cbnz    w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+       ldp     exp0, exp1, [x1]
+       cbz     w4, 3f
+       cmp     w4, RELEASE
+       b.hs    4f
+
+       /* ACQUIRE/CONSUME.  */
+1:     ldaxp   tmp0, tmp1, [x0]
+       cmp     tmp0, exp0
+       ccmp    tmp1, exp1, 0, eq
+       bne     2f
+       stxp    w4, in0, in1, [x0]
+       cbnz    w4, 1b
+       mov     x0, 1
         ret
-END (libat_exchange_16_i1)
+
+2:     stp     tmp0, tmp1, [x1]
+       mov     x0, 0
+       ret
+
+       /* RELAXED.  */
+3:     ldxp    tmp0, tmp1, [x0]
+       cmp     tmp0, exp0
+       ccmp    tmp1, exp1, 0, eq
+       bne     2b
+       stxp    w4, in0, in1, [x0]
+       cbnz    w4, 3b
+       mov     x0, 1
+       ret
+
+       /* RELEASE/ACQ_REL/SEQ_CST.  */
+4:     ldaxp   tmp0, tmp1, [x0]
+       cmp     tmp0, exp0
+       ccmp    tmp1, exp1, 0, eq
+       bne     2b
+       stlxp   w4, in0, in1, [x0]
+       cbnz    w4, 4b
+       mov     x0, 1
+       ret
+END (libat_compare_exchange_16)
 
 
 ENTRY (libat_compare_exchange_16_i1)
@@ -180,7 +267,7 @@ ENTRY (libat_compare_exchange_16_i1)
 END (libat_compare_exchange_16_i1)
 
 
-ENTRY (libat_fetch_add_16_i1)
+ENTRY (libat_fetch_add_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -199,10 +286,10 @@ ENTRY (libat_fetch_add_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_add_16_i1)
+END (libat_fetch_add_16)
 
 
-ENTRY (libat_add_fetch_16_i1)
+ENTRY (libat_add_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -221,10 +308,10 @@ ENTRY (libat_add_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_add_fetch_16_i1)
+END (libat_add_fetch_16)
 
 
-ENTRY (libat_fetch_sub_16_i1)
+ENTRY (libat_fetch_sub_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -243,10 +330,10 @@ ENTRY (libat_fetch_sub_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_sub_16_i1)
+END (libat_fetch_sub_16)
 
 
-ENTRY (libat_sub_fetch_16_i1)
+ENTRY (libat_sub_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -265,10 +352,10 @@ ENTRY (libat_sub_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_sub_fetch_16_i1)
+END (libat_sub_fetch_16)
 
 
-ENTRY (libat_fetch_or_16_i1)
+ENTRY (libat_fetch_or_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -287,10 +374,10 @@ ENTRY (libat_fetch_or_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_or_16_i1)
+END (libat_fetch_or_16)
 
 
-ENTRY (libat_or_fetch_16_i1)
+ENTRY (libat_or_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -309,10 +396,10 @@ ENTRY (libat_or_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_or_fetch_16_i1)
+END (libat_or_fetch_16)
 
 
-ENTRY (libat_fetch_and_16_i1)
+ENTRY (libat_fetch_and_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -331,10 +418,10 @@ ENTRY (libat_fetch_and_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_and_16_i1)
+END (libat_fetch_and_16)
 
 
-ENTRY (libat_and_fetch_16_i1)
+ENTRY (libat_and_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -353,10 +440,10 @@ ENTRY (libat_and_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_and_fetch_16_i1)
+END (libat_and_fetch_16)
 
 
-ENTRY (libat_fetch_xor_16_i1)
+ENTRY (libat_fetch_xor_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -375,10 +462,10 @@ ENTRY (libat_fetch_xor_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_xor_16_i1)
+END (libat_fetch_xor_16)
 
 
-ENTRY (libat_xor_fetch_16_i1)
+ENTRY (libat_xor_fetch_16)
         mov     x5, x0
         cbnz    w4, 2f
 
@@ -397,10 +484,10 @@ ENTRY (libat_xor_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_xor_fetch_16_i1)
+END (libat_xor_fetch_16)
 
 
-ENTRY (libat_fetch_nand_16_i1)
+ENTRY (libat_fetch_nand_16)
         mov     x5, x0
         mvn     in0, in0
         mvn     in1, in1
@@ -421,10 +508,10 @@ ENTRY (libat_fetch_nand_16_i1)
         stlxp   w4, tmp0, tmp1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_fetch_nand_16_i1)
+END (libat_fetch_nand_16)
 
 
-ENTRY (libat_nand_fetch_16_i1)
+ENTRY (libat_nand_fetch_16)
         mov     x5, x0
         mvn     in0, in0
         mvn     in1, in1
@@ -445,21 +532,38 @@ ENTRY (libat_nand_fetch_16_i1)
         stlxp   w4, res0, res1, [x5]
         cbnz    w4, 2b
         ret
-END (libat_nand_fetch_16_i1)
+END (libat_nand_fetch_16)
 
 
-ENTRY (libat_test_and_set_16_i1)
-       mov     w2, 1
-       cbnz    w1, 2f
-
-       /* RELAXED.  */
-       swpb    w0, w2, [x0]
-       ret
+/* __atomic_test_and_set is always inlined, so this entry is unused and
+   only required for completeness.  */
+ENTRY (libat_test_and_set_16)
 
-       /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:     swpalb  w0, w2, [x0]
+       /* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+       mov     x5, x0
+1:     ldaxrb  w0, [x5]
+       stlxrb  w4, w2, [x5]
+       cbnz    w4, 1b
         ret
-END (libat_test_and_set_16_i1)
+END (libat_test_and_set_16)
+
+
+/* Alias entry points which are the same in baseline and LSE2.  */
+
+ALIAS (libat_exchange_16_i1, libat_exchange_16)
+ALIAS (libat_fetch_add_16_i1, libat_fetch_add_16)
+ALIAS (libat_add_fetch_16_i1, libat_add_fetch_16)
+ALIAS (libat_fetch_sub_16_i1, libat_fetch_sub_16)
+ALIAS (libat_sub_fetch_16_i1, libat_sub_fetch_16)
+ALIAS (libat_fetch_or_16_i1, libat_fetch_or_16)
+ALIAS (libat_or_fetch_16_i1, libat_or_fetch_16)
+ALIAS (libat_fetch_and_16_i1, libat_fetch_and_16)
+ALIAS (libat_and_fetch_16_i1, libat_and_fetch_16)
+ALIAS (libat_fetch_xor_16_i1, libat_fetch_xor_16)
+ALIAS (libat_xor_fetch_16_i1, libat_xor_fetch_16)
+ALIAS (libat_fetch_nand_16_i1, libat_fetch_nand_16)
+ALIAS (libat_nand_fetch_16_i1, libat_nand_fetch_16)
+ALIAS (libat_test_and_set_16_i1, libat_test_and_set_16)
 
 
 /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code.  */
diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
index bea26825b4f75bb8ff348ab4b5fc45f4a5bd561e..851c78c01cd643318aaa52929ce4550266238b79 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -35,10 +35,19 @@
 #endif
 #define IFUNC_NCOND(N)  (1)
 
-#if N == 16 && IFUNC_ALT != 0
+#endif /* HAVE_IFUNC */
+
+/* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */
+#if N == 16
 # define DONE 1
 #endif
 
-#endif /* HAVE_IFUNC */
+/* State we have lock-free 128-bit atomics.  */
+#undef FAST_ATOMIC_LDST_16
+#define FAST_ATOMIC_LDST_16            1
+#undef MAYBE_HAVE_ATOMIC_CAS_16
+#define MAYBE_HAVE_ATOMIC_CAS_16       1
+#undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
+#define MAYBE_HAVE_ATOMIC_EXCHANGE_16  1
 
 #include_next <host-config.h>
  
Richard Sandiford Nov. 29, 2023, 11:46 p.m. UTC | #7
Not my specialist subject, but here goes anyway:

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> ping
>
> From: Wilco Dijkstra
> Sent: 02 June 2023 18:28
> To: GCC Patches <gcc-patches@gcc.gnu.org>
> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: [PATCH] libatomic: Enable lock-free 128-bit atomics on AArch64 [PR110061]
>
>
> Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
> existing binaries, gives better performance than locking atomics and is what
> most users expect.

Please add a justification for why it's backwards compatible, rather
than just stating that it's so.

> Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not supported.
> This results in an implicit store which is invisible to software as long as the given
> address is writeable (which will be true when using atomics in actual code).

Thanks for adding this.  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95722
suggests that it's still an open question whether this is a correct thing
to do, but it sounds from Joseph's comment that he isn't sure whether
atomic loads from read-only data are valid.

Linus's comment in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70490
suggests that a reasonable compromise might be to use a storing
implementation but not advertise that it is lock-free.  Also,
the comment above libat_is_lock_free says:

/* Note that this can return that a size/alignment is not lock-free even if
   all the operations that we use to implement the respective accesses provide
   lock-free forward progress as specified in C++14:  Users likely expect
   "lock-free" to also mean "fast", which is why we do not return true if, for
   example, we implement loads with this size/alignment using a CAS.  */

We don't use a CAS for the fallbacks, but like you say, we do use a
load/store exclusive loop.  So did you consider not doing this:

> +/* State we have lock-free 128-bit atomics.  */
> +#undef FAST_ATOMIC_LDST_16
> +#define FAST_ATOMIC_LDST_16            1

?

Otherwise it looks reasonable to me, for whatever that's worth, but:

> A simple test on an old Cortex-A72 showed 2.7x speedup of 128-bit atomics.
>
> Passes regress, OK for commit?
>
> libatomic/
>         PR target/110061
>         config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
>         config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
>         State we have lock-free atomics.
>
> ---
>
> diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
> index 05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf 100644
> --- a/libatomic/config/linux/aarch64/atomic_16.S
> +++ b/libatomic/config/linux/aarch64/atomic_16.S
> @@ -22,6 +22,21 @@
>     <http://www.gnu.org/licenses/>.  */
>
>
> +/* AArch64 128-bit lock-free atomic implementation.
> +
> +   128-bit atomics are now lock-free for all AArch64 architecture versions.
> +   This is backwards compatible with existing binaries and gives better
> +   performance than locking atomics.
> +
> +   128-bit atomic loads use a exclusive loop if LSE2 is not supported.
> +   This results in an implicit store which is invisible to software as long
> +   as the given address is writeable.  Since all other atomics have explicit
> +   writes, this will be true when using atomics in actual code.
> +
> +   The libat_<op>_16 entry points are ARMv8.0.
> +   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
> +
> +
>          .arch   armv8-a+lse
>
>  #define ENTRY(name)             \
> @@ -37,6 +52,10 @@ name:                                \
>          .cfi_endproc;           \
>          .size name, .-name;
>
> +#define ALIAS(alias,name)      \
> +       .global alias;          \
> +       .set alias, name;
> +
>  #define res0 x0
>  #define res1 x1
>  #define in0  x2
> @@ -70,6 +89,24 @@ name:                                \
>  #define SEQ_CST 5
>
>
> +ENTRY (libat_load_16)
> +       mov     x5, x0
> +       cbnz    w1, 2f
> +
> +       /* RELAXED.  */
> +1:     ldxp    res0, res1, [x5]
> +       stxp    w4, res0, res1, [x5]
> +       cbnz    w4, 1b
> +       ret
> +
> +       /* ACQUIRE/CONSUME/SEQ_CST.  */
> +2:     ldaxp   res0, res1, [x5]
> +       stxp    w4, res0, res1, [x5]
> +       cbnz    w4, 2b
> +       ret
> +END (libat_load_16)
> +
> +
>  ENTRY (libat_load_16_i1)
>          cbnz    w1, 1f
>
> @@ -93,6 +130,23 @@ ENTRY (libat_load_16_i1)
>  END (libat_load_16_i1)
>
>
> +ENTRY (libat_store_16)
> +       cbnz    w4, 2f
> +
> +       /* RELAXED.  */
> +1:     ldxp    xzr, tmp0, [x0]
> +       stxp    w4, in0, in1, [x0]
> +       cbnz    w4, 1b
> +       ret
> +
> +       /* RELEASE/SEQ_CST.  */
> +2:     ldxp    xzr, tmp0, [x0]
> +       stlxp   w4, in0, in1, [x0]
> +       cbnz    w4, 2b
> +       ret
> +END (libat_store_16)
> +
> +
>  ENTRY (libat_store_16_i1)
>          cbnz    w4, 1f
>
> @@ -101,14 +155,14 @@ ENTRY (libat_store_16_i1)
>          ret
>
>          /* RELEASE/SEQ_CST.  */
> -1:     ldaxp   xzr, tmp0, [x0]
> +1:     ldxp    xzr, tmp0, [x0]
>          stlxp   w4, in0, in1, [x0]
>          cbnz    w4, 1b
>          ret
>  END (libat_store_16_i1)
>
>
> -ENTRY (libat_exchange_16_i1)
> +ENTRY (libat_exchange_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -126,22 +180,55 @@ ENTRY (libat_exchange_16_i1)
>          stxp    w4, in0, in1, [x5]
>          cbnz    w4, 3b
>          ret
> -4:
> -       cmp     w4, RELEASE
> -       b.ne    6f
>
> -       /* RELEASE.  */
> -5:     ldxp    res0, res1, [x5]
> +       /* RELEASE/ACQ_REL/SEQ_CST.  */
> +4:     ldaxp   res0, res1, [x5]
>          stlxp   w4, in0, in1, [x5]
> -       cbnz    w4, 5b
> +       cbnz    w4, 4b
>          ret
> +END (libat_exchange_16)

Please explain (here and in the commit message) why you're adding
acquire semantics to the RELEASE case.

Thanks,
Richard

>
> -       /* ACQ_REL/SEQ_CST.  */
> -6:     ldaxp   res0, res1, [x5]
> -       stlxp   w4, in0, in1, [x5]
> -       cbnz    w4, 6b
> +
> +ENTRY (libat_compare_exchange_16)
> +       ldp     exp0, exp1, [x1]
> +       cbz     w4, 3f
> +       cmp     w4, RELEASE
> +       b.hs    4f
> +
> +       /* ACQUIRE/CONSUME.  */
> +1:     ldaxp   tmp0, tmp1, [x0]
> +       cmp     tmp0, exp0
> +       ccmp    tmp1, exp1, 0, eq
> +       bne     2f
> +       stxp    w4, in0, in1, [x0]
> +       cbnz    w4, 1b
> +       mov     x0, 1
>          ret
> -END (libat_exchange_16_i1)
> +
> +2:     stp     tmp0, tmp1, [x1]
> +       mov     x0, 0
> +       ret
> +
> +       /* RELAXED.  */
> +3:     ldxp    tmp0, tmp1, [x0]
> +       cmp     tmp0, exp0
> +       ccmp    tmp1, exp1, 0, eq
> +       bne     2b
> +       stxp    w4, in0, in1, [x0]
> +       cbnz    w4, 3b
> +       mov     x0, 1
> +       ret
> +
> +       /* RELEASE/ACQ_REL/SEQ_CST.  */
> +4:     ldaxp   tmp0, tmp1, [x0]
> +       cmp     tmp0, exp0
> +       ccmp    tmp1, exp1, 0, eq
> +       bne     2b
> +       stlxp   w4, in0, in1, [x0]
> +       cbnz    w4, 4b
> +       mov     x0, 1
> +       ret
> +END (libat_compare_exchange_16)
>
>
>  ENTRY (libat_compare_exchange_16_i1)
> @@ -180,7 +267,7 @@ ENTRY (libat_compare_exchange_16_i1)
>  END (libat_compare_exchange_16_i1)
>
>
> -ENTRY (libat_fetch_add_16_i1)
> +ENTRY (libat_fetch_add_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -199,10 +286,10 @@ ENTRY (libat_fetch_add_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_add_16_i1)
> +END (libat_fetch_add_16)
>
>
> -ENTRY (libat_add_fetch_16_i1)
> +ENTRY (libat_add_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -221,10 +308,10 @@ ENTRY (libat_add_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_add_fetch_16_i1)
> +END (libat_add_fetch_16)
>
>
> -ENTRY (libat_fetch_sub_16_i1)
> +ENTRY (libat_fetch_sub_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -243,10 +330,10 @@ ENTRY (libat_fetch_sub_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_sub_16_i1)
> +END (libat_fetch_sub_16)
>
>
> -ENTRY (libat_sub_fetch_16_i1)
> +ENTRY (libat_sub_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -265,10 +352,10 @@ ENTRY (libat_sub_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_sub_fetch_16_i1)
> +END (libat_sub_fetch_16)
>
>
> -ENTRY (libat_fetch_or_16_i1)
> +ENTRY (libat_fetch_or_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -287,10 +374,10 @@ ENTRY (libat_fetch_or_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_or_16_i1)
> +END (libat_fetch_or_16)
>
>
> -ENTRY (libat_or_fetch_16_i1)
> +ENTRY (libat_or_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -309,10 +396,10 @@ ENTRY (libat_or_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_or_fetch_16_i1)
> +END (libat_or_fetch_16)
>
>
> -ENTRY (libat_fetch_and_16_i1)
> +ENTRY (libat_fetch_and_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -331,10 +418,10 @@ ENTRY (libat_fetch_and_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_and_16_i1)
> +END (libat_fetch_and_16)
>
>
> -ENTRY (libat_and_fetch_16_i1)
> +ENTRY (libat_and_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -353,10 +440,10 @@ ENTRY (libat_and_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_and_fetch_16_i1)
> +END (libat_and_fetch_16)
>
>
> -ENTRY (libat_fetch_xor_16_i1)
> +ENTRY (libat_fetch_xor_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -375,10 +462,10 @@ ENTRY (libat_fetch_xor_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_xor_16_i1)
> +END (libat_fetch_xor_16)
>
>
> -ENTRY (libat_xor_fetch_16_i1)
> +ENTRY (libat_xor_fetch_16)
>          mov     x5, x0
>          cbnz    w4, 2f
>
> @@ -397,10 +484,10 @@ ENTRY (libat_xor_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_xor_fetch_16_i1)
> +END (libat_xor_fetch_16)
>
>
> -ENTRY (libat_fetch_nand_16_i1)
> +ENTRY (libat_fetch_nand_16)
>          mov     x5, x0
>          mvn     in0, in0
>          mvn     in1, in1
> @@ -421,10 +508,10 @@ ENTRY (libat_fetch_nand_16_i1)
>          stlxp   w4, tmp0, tmp1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_fetch_nand_16_i1)
> +END (libat_fetch_nand_16)
>
>
> -ENTRY (libat_nand_fetch_16_i1)
> +ENTRY (libat_nand_fetch_16)
>          mov     x5, x0
>          mvn     in0, in0
>          mvn     in1, in1
> @@ -445,21 +532,38 @@ ENTRY (libat_nand_fetch_16_i1)
>          stlxp   w4, res0, res1, [x5]
>          cbnz    w4, 2b
>          ret
> -END (libat_nand_fetch_16_i1)
> +END (libat_nand_fetch_16)
>
>
> -ENTRY (libat_test_and_set_16_i1)
> -       mov     w2, 1
> -       cbnz    w1, 2f
> -
> -       /* RELAXED.  */
> -       swpb    w0, w2, [x0]
> -       ret
> +/* __atomic_test_and_set is always inlined, so this entry is unused and
> +   only required for completeness.  */
> +ENTRY (libat_test_and_set_16)
>
> -       /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
> -2:     swpalb  w0, w2, [x0]
> +       /* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
> +       mov     x5, x0
> +1:     ldaxrb  w0, [x5]
> +       stlxrb  w4, w2, [x5]
> +       cbnz    w4, 1b
>          ret
> -END (libat_test_and_set_16_i1)
> +END (libat_test_and_set_16)
> +
> +
> +/* Alias entry points which are the same in baseline and LSE2.  */
> +
> +ALIAS (libat_exchange_16_i1, libat_exchange_16)
> +ALIAS (libat_fetch_add_16_i1, libat_fetch_add_16)
> +ALIAS (libat_add_fetch_16_i1, libat_add_fetch_16)
> +ALIAS (libat_fetch_sub_16_i1, libat_fetch_sub_16)
> +ALIAS (libat_sub_fetch_16_i1, libat_sub_fetch_16)
> +ALIAS (libat_fetch_or_16_i1, libat_fetch_or_16)
> +ALIAS (libat_or_fetch_16_i1, libat_or_fetch_16)
> +ALIAS (libat_fetch_and_16_i1, libat_fetch_and_16)
> +ALIAS (libat_and_fetch_16_i1, libat_and_fetch_16)
> +ALIAS (libat_fetch_xor_16_i1, libat_fetch_xor_16)
> +ALIAS (libat_xor_fetch_16_i1, libat_xor_fetch_16)
> +ALIAS (libat_fetch_nand_16_i1, libat_fetch_nand_16)
> +ALIAS (libat_nand_fetch_16_i1, libat_nand_fetch_16)
> +ALIAS (libat_test_and_set_16_i1, libat_test_and_set_16)
>
>
>  /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code.  */
> diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
> index bea26825b4f75bb8ff348ab4b5fc45f4a5bd561e..851c78c01cd643318aaa52929ce4550266238b79 100644
> --- a/libatomic/config/linux/aarch64/host-config.h
> +++ b/libatomic/config/linux/aarch64/host-config.h
> @@ -35,10 +35,19 @@
>  #endif
>  #define IFUNC_NCOND(N)  (1)
>
> -#if N == 16 && IFUNC_ALT != 0
> +#endif /* HAVE_IFUNC */
> +
> +/* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */
> +#if N == 16
>  # define DONE 1
>  #endif
>
> -#endif /* HAVE_IFUNC */
> +/* State we have lock-free 128-bit atomics.  */
> +#undef FAST_ATOMIC_LDST_16
> +#define FAST_ATOMIC_LDST_16            1
> +#undef MAYBE_HAVE_ATOMIC_CAS_16
> +#define MAYBE_HAVE_ATOMIC_CAS_16       1
> +#undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
> +#define MAYBE_HAVE_ATOMIC_EXCHANGE_16  1
>
>  #include_next <host-config.h>
  
Wilco Dijkstra Dec. 4, 2023, 6:22 p.m. UTC | #8
Hi Richard,

>> Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
>> existing binaries, gives better performance than locking atomics and is what
>> most users expect.
>
> Please add a justification for why it's backwards compatible, rather
> than just stating that it's so.

This isn't any different from the LSE2 support, which also switches some CPUs to
lock-free implementations. This is basically switching the rest. It trivially follows
from the fact that GCC always calls libatomic so that you switch all atomics in a
process. I'll add that to the description.
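
For example (a quick sketch of my own, not part of the patch), a 16-byte
atomic access is never inlined and always ends up in libatomic:

  __int128 v;

  __int128
  load_v (void)
  {
    /* GCC emits a call to __atomic_load_16 in libatomic rather than
       inlining the load, so the whole process shares one implementation
       selected at runtime via the ifunc.  */
    return __atomic_load_n (&v, __ATOMIC_ACQUIRE);
  }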

Note the compatibility story is even better than this. We are also compatible
with LLVM and future GCC versions which may inline these sequences.

> Thanks for adding this.  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95722
> suggests that it's still an open question whether this is a correct thing
> to do, but it sounds from Joseph's comment that he isn't sure whether
> atomic loads from read-only data are valid.

Yes it's not useful to do an atomic read if it is a read-only value... It should
be feasible to mark atomic types as mutable to force them to .data (see eg.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108659 and
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109553).
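
For example (a hypothetical snippet illustrating the problem, not taken from
those PRs), a const-qualified 128-bit object may currently be placed in a
read-only section, and the exclusive-loop load would then fault on its
implicit store:

  static const __int128 cfg = 42;   /* may end up in .rodata today */

  __int128
  read_cfg (void)
  {
    return __atomic_load_n (&cfg, __ATOMIC_RELAXED);
  }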

> Linus's comment in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70490
> suggests that a reasonable compromise might be to use a storing
> implementation but not advertise that it is lock-free.  Also,
> the comment above libat_is_lock_free says:
>
> /* Note that this can return that a size/alignment is not lock-free even if
>    all the operations that we use to implement the respective accesses provide
>    lock-free forward progress as specified in C++14:  Users likely expect
>    "lock-free" to also mean "fast", which is why we do not return true if, for
>    example, we implement loads with this size/alignment using a CAS.  */

I don't believe lying about being lock-free like that is a good idea. When
you use a faster lock-free implementation, you want to tell users about it
(so they aren't forced to use nasty inline assembler hacks for example).

> We don't use a CAS for the fallbacks, but like you say, we do use a
> load/store exclusive loop.  So did you consider not doing this:

> +/* State we have lock-free 128-bit atomics.  */
> +#undef FAST_ATOMIC_LDST_16
> +#define FAST_ATOMIC_LDST_16            1

That would result in __atomic_is_lock_free incorrectly returning false.
Note that __atomic_always_lock_free remains false for 128-bit since there
is no inlining in the compiler, but __atomic_is_lock_free should be true.
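
A sketch of the expected behaviour on a baseline ARMv8.0 system (my example,
not a testcase from the patch):

  __int128 v;

  void
  query (int *always, int *now)
  {
    /* Compile-time answer: 0, since GCC does not inline 16-byte atomics.  */
    *always = __atomic_always_lock_free (sizeof (v), &v);
    /* Runtime answer from libatomic: 1 once the defines above are set.  */
    *now = __atomic_is_lock_free (sizeof (v), &v);
  }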

> -       /* RELEASE.  */
> -5:     ldxp    res0, res1, [x5]
> +       /* RELEASE/ACQ_REL/SEQ_CST.  */
> +4:     ldaxp   res0, res1, [x5]
>          stlxp   w4, in0, in1, [x5]
> -       cbnz    w4, 5b
> +       cbnz    w4, 4b
>          ret
> +END (libat_exchange_16)

> Please explain (here and in the commit message) why you're adding
> acquire semantics to the RELEASE case.

That merges the RELEASE case with ACQ_REL/SEQ_CST to keep the code
short and simple, as is done elsewhere in the file. I've added a note in the commit msg.

Cheers,
Wilco

Here is v2 - this also incorporates the PR111404 fix to compare-exchange:

Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
existing binaries (GCC always calls into libatomic for these, so all 128-bit
atomic uses in a process are switched), gives better performance than locking
atomics and is what most users expect.

Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not supported.
This results in an implicit store which is invisible to software as long as the
given address is writeable (which will be true when using atomics in actual code).

Passes regress, OK for commit?

libatomic/
        config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
        (libat_exchange_16): Merge RELEASE and ACQ_REL/SEQ_CST cases.
        config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
        State we have lock-free atomics.

---

diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 05439ce394b9653c9bcb582761ff7aaa7c8f9643..a099037179b3f1210145baea02a9d43418629813 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,22 @@
    <http://www.gnu.org/licenses/>.  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries (as we swap all uses
+   of 128-bit atomics via an ifunc) and gives better performance than locking
+   atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat_<op>_16 entry points are ARMv8.0.
+   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
+
+
 	.arch	armv8-a+lse
 
 #define ENTRY(name)		\
@@ -37,6 +53,10 @@ name:				\
 	.cfi_endproc;		\
 	.size name, .-name;
 
+#define ALIAS(alias,name)	\
+	.global alias;		\
+	.set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +90,24 @@ name:				\
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+	mov	x5, x0
+	cbnz	w1, 2f
+
+	/* RELAXED.  */
+1:	ldxp	res0, res1, [x5]
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
+	ret
+
+	/* ACQUIRE/CONSUME/SEQ_CST.  */
+2:	ldaxp	res0, res1, [x5]
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
 	cbnz	w1, 1f
 
@@ -93,6 +131,23 @@ ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+	cbnz	w4, 2f
+
+	/* RELAXED.  */
+1:	ldxp	xzr, tmp0, [x0]
+	stxp	w4, in0, in1, [x0]
+	cbnz	w4, 1b
+	ret
+
+	/* RELEASE/SEQ_CST.  */
+2:	ldxp	xzr, tmp0, [x0]
+	stlxp	w4, in0, in1, [x0]
+	cbnz	w4, 2b
+	ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
 	cbnz	w4, 1f
 
@@ -101,14 +156,14 @@ ENTRY (libat_store_16_i1)
 	ret
 
 	/* RELEASE/SEQ_CST.  */
-1:	ldaxp	xzr, tmp0, [x0]
+1:	ldxp	xzr, tmp0, [x0]
 	stlxp	w4, in0, in1, [x0]
 	cbnz	w4, 1b
 	ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -126,22 +181,60 @@ ENTRY (libat_exchange_16_i1)
 	stxp	w4, in0, in1, [x5]
 	cbnz	w4, 3b
 	ret
-4:
-	cmp	w4, RELEASE
-	b.ne	6f
 
-	/* RELEASE.  */
-5:	ldxp	res0, res1, [x5]
+	/* RELEASE/ACQ_REL/SEQ_CST.  */
+4:	ldaxp	res0, res1, [x5]
 	stlxp	w4, in0, in1, [x5]
-	cbnz	w4, 5b
+	cbnz	w4, 4b
 	ret
+END (libat_exchange_16)
 
-	/* ACQ_REL/SEQ_CST.  */
-6:	ldaxp	res0, res1, [x5]
-	stlxp	w4, in0, in1, [x5]
-	cbnz	w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+	ldp	exp0, exp1, [x1]
+	cbz	w4, 3f
+	cmp	w4, RELEASE
+	b.hs	5f
+
+	/* ACQUIRE/CONSUME.  */
+1:	ldaxp	tmp0, tmp1, [x0]
+	cmp	tmp0, exp0
+	ccmp	tmp1, exp1, 0, eq
+	csel	tmp0, in0, tmp0, eq
+	csel	tmp1, in1, tmp1, eq
+	stxp	w4, tmp0, tmp1, [x0]
+	cbnz	w4, 1b
+	beq	2f
+	stp     tmp0, tmp1, [x1]
+2:	cset	x0, eq
+	ret
+
+	/* RELAXED.  */
+3:	ldxp	tmp0, tmp1, [x0]
+	cmp	tmp0, exp0
+	ccmp	tmp1, exp1, 0, eq
+	csel	tmp0, in0, tmp0, eq
+	csel	tmp1, in1, tmp1, eq
+	stxp	w4, tmp0, tmp1, [x0]
+	cbnz	w4, 3b
+	beq	4f
+	stp	tmp0, tmp1, [x1]
+4:	cset	x0, eq
+	ret
+
+	/* RELEASE/ACQ_REL/SEQ_CST.  */
+5:	ldaxp	tmp0, tmp1, [x0]
+	cmp	tmp0, exp0
+	ccmp	tmp1, exp1, 0, eq
+	csel	tmp0, in0, tmp0, eq
+	csel	tmp1, in1, tmp1, eq
+	stlxp	w4, tmp0, tmp1, [x0]
+	cbnz	w4, 5b
+	beq	6f
+	stp	tmp0, tmp1, [x1]
+6:	cset	x0, eq
 	ret
-END (libat_exchange_16_i1)
+END (libat_compare_exchange_16)
 
 
 ENTRY (libat_compare_exchange_16_i1)
@@ -180,7 +273,7 @@ ENTRY (libat_compare_exchange_16_i1)
 END (libat_compare_exchange_16_i1)
 
 
-ENTRY (libat_fetch_add_16_i1)
+ENTRY (libat_fetch_add_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -199,10 +292,10 @@ ENTRY (libat_fetch_add_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_add_16_i1)
+END (libat_fetch_add_16)
 
 
-ENTRY (libat_add_fetch_16_i1)
+ENTRY (libat_add_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -221,10 +314,10 @@ ENTRY (libat_add_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_add_fetch_16_i1)
+END (libat_add_fetch_16)
 
 
-ENTRY (libat_fetch_sub_16_i1)
+ENTRY (libat_fetch_sub_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -243,10 +336,10 @@ ENTRY (libat_fetch_sub_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_sub_16_i1)
+END (libat_fetch_sub_16)
 
 
-ENTRY (libat_sub_fetch_16_i1)
+ENTRY (libat_sub_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -265,10 +358,10 @@ ENTRY (libat_sub_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_sub_fetch_16_i1)
+END (libat_sub_fetch_16)
 
 
-ENTRY (libat_fetch_or_16_i1)
+ENTRY (libat_fetch_or_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -287,10 +380,10 @@ ENTRY (libat_fetch_or_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_or_16_i1)
+END (libat_fetch_or_16)
 
 
-ENTRY (libat_or_fetch_16_i1)
+ENTRY (libat_or_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -309,10 +402,10 @@ ENTRY (libat_or_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_or_fetch_16_i1)
+END (libat_or_fetch_16)
 
 
-ENTRY (libat_fetch_and_16_i1)
+ENTRY (libat_fetch_and_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -331,10 +424,10 @@ ENTRY (libat_fetch_and_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_and_16_i1)
+END (libat_fetch_and_16)
 
 
-ENTRY (libat_and_fetch_16_i1)
+ENTRY (libat_and_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -353,10 +446,10 @@ ENTRY (libat_and_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_and_fetch_16_i1)
+END (libat_and_fetch_16)
 
 
-ENTRY (libat_fetch_xor_16_i1)
+ENTRY (libat_fetch_xor_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -375,10 +468,10 @@ ENTRY (libat_fetch_xor_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_xor_16_i1)
+END (libat_fetch_xor_16)
 
 
-ENTRY (libat_xor_fetch_16_i1)
+ENTRY (libat_xor_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -397,10 +490,10 @@ ENTRY (libat_xor_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_xor_fetch_16_i1)
+END (libat_xor_fetch_16)
 
 
-ENTRY (libat_fetch_nand_16_i1)
+ENTRY (libat_fetch_nand_16)
 	mov	x5, x0
 	mvn	in0, in0
 	mvn	in1, in1
@@ -421,10 +514,10 @@ ENTRY (libat_fetch_nand_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_nand_16_i1)
+END (libat_fetch_nand_16)
 
 
-ENTRY (libat_nand_fetch_16_i1)
+ENTRY (libat_nand_fetch_16)
 	mov	x5, x0
 	mvn	in0, in0
 	mvn	in1, in1
@@ -445,21 +538,38 @@ ENTRY (libat_nand_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_nand_fetch_16_i1)
+END (libat_nand_fetch_16)
 
 
-ENTRY (libat_test_and_set_16_i1)
-	mov	w2, 1
-	cbnz	w1, 2f
-
-	/* RELAXED.  */
-	swpb	w0, w2, [x0]
-	ret
+/* __atomic_test_and_set is always inlined, so this entry is unused and
+   only required for completeness.  */
+ENTRY (libat_test_and_set_16)
 
-	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:	swpalb	w0, w2, [x0]
+	/* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+	mov	x5, x0
+1:	ldaxrb	w0, [x5]
+	stlxrb	w4, w2, [x5]
+	cbnz	w4, 1b
 	ret
-END (libat_test_and_set_16_i1)
+END (libat_test_and_set_16)
+
+
+/* Alias entry points which are the same in baseline and LSE2.  */
+
+ALIAS (libat_exchange_16_i1, libat_exchange_16)
+ALIAS (libat_fetch_add_16_i1, libat_fetch_add_16)
+ALIAS (libat_add_fetch_16_i1, libat_add_fetch_16)
+ALIAS (libat_fetch_sub_16_i1, libat_fetch_sub_16)
+ALIAS (libat_sub_fetch_16_i1, libat_sub_fetch_16)
+ALIAS (libat_fetch_or_16_i1, libat_fetch_or_16)
+ALIAS (libat_or_fetch_16_i1, libat_or_fetch_16)
+ALIAS (libat_fetch_and_16_i1, libat_fetch_and_16)
+ALIAS (libat_and_fetch_16_i1, libat_and_fetch_16)
+ALIAS (libat_fetch_xor_16_i1, libat_fetch_xor_16)
+ALIAS (libat_xor_fetch_16_i1, libat_xor_fetch_16)
+ALIAS (libat_fetch_nand_16_i1, libat_fetch_nand_16)
+ALIAS (libat_nand_fetch_16_i1, libat_nand_fetch_16)
+ALIAS (libat_test_and_set_16_i1, libat_test_and_set_16)
 
 
 /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code.  */
diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
index 9747accd88f5881d0496f9f8104149e74bbbe268..265b6ebd82fd7cb7fd36554d4da82ef54b3b99b5 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -35,11 +35,20 @@
 #endif
 #define IFUNC_NCOND(N)	(1)
 
-#if N == 16 && IFUNC_ALT != 0
+#endif /* HAVE_IFUNC */
+
+/* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */
+#if N == 16
 # define DONE 1
 #endif
 
-#endif /* HAVE_IFUNC */
+/* State we have lock-free 128-bit atomics.  */
+#undef FAST_ATOMIC_LDST_16
+#define FAST_ATOMIC_LDST_16		1
+#undef MAYBE_HAVE_ATOMIC_CAS_16
+#define MAYBE_HAVE_ATOMIC_CAS_16	1
+#undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
+#define MAYBE_HAVE_ATOMIC_EXCHANGE_16	1
 
 #ifdef HWCAP_USCAT
  
Richard Sandiford Dec. 4, 2023, 9:48 p.m. UTC | #9
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>>> Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
>>> existing binaries, gives better performance than locking atomics and is what
>>> most users expect.
>>
>> Please add a justification for why it's backwards compatible, rather
>> than just stating that it's so.
>
> This isn't any different than the LSE2 support which also switches some CPUs to
> lock-free implementations. This is basically switching the rest. It trivially follows
> from the fact that GCC always calls libatomic so that you switch all atomics in a
> process. I'll add that to the description.

So I guess there's an implicit assumption that we don't support people
linking libatomic statically into a DSO and hiding the symbols.  Obviously
not great practice, but I wouldn't be 100% surprised if someone's done
that...

> Note the compatibility story is even better than this. We are also compatible
> with LLVM and future GCC versions which may inline these sequences.
>
>> Thanks for adding this.  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95722
>> suggests that it's still an open question whether this is a correct thing
>> to do, but it sounds from Joseph's comment that he isn't sure whether
>> atomic loads from read-only data are valid.
>
> Yes it's not useful to do an atomic read if it is a read-only value... It should
> be feasible to mark atomic types as mutable to force them to .data (see eg.
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108659 and
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109553).
>
>> Linus's comment in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70490
>> suggests that a reasonable compromise might be to use a storing
>> implementation but not advertise that it is lock-free.  Also,
>> the comment above libat_is_lock_free says:
>>
>> /* Note that this can return that a size/alignment is not lock-free even if
>>    all the operations that we use to implement the respective accesses provide
>>    lock-free forward progress as specified in C++14:  Users likely expect
>>    "lock-free" to also mean "fast", which is why we do not return true if, for
>>    example, we implement loads with this size/alignment using a CAS.  */
>
> I don't believe lying about being lock-free like that is a good idea. When
> you use a faster lock-free implementation, you want to tell users about it
> (so they aren't forced to use nasty inline assembler hacks for example).
>
>> We don't use a CAS for the fallbacks, but like you say, we do use a
>> load/store exclusive loop.  So did you consider not doing this:
>
>> +/* State we have lock-free 128-bit atomics.  */
>> +#undef FAST_ATOMIC_LDST_16
>> +#define FAST_ATOMIC_LDST_16            1
>
> That would result in __atomic_is_lock_free incorrectly returning false.
> Note that __atomic_always_lock_free remains false for 128-bit since there
> is no inlining in the compiler, but __atomic_is_lock_free should be true.

I don't think it's so much lying/being incorrect as admitting that there
are shades of grey.  And the sources above suggest that there's no
consensus around saying that a load/store exclusive implementation of
an atomic load should advertise itself as lock-free.  x86 has:

/* Since load and store are implemented with CAS, they are not fast.  */
# undef FAST_ATOMIC_LDST_16
# define FAST_ATOMIC_LDST_16		0

I'm not comfortable with rejecting the opinions above based on my
limited knowledge of the area.

So the patch is OK from my POV without the FAST_ATOMIC_LDST_16
definition.  Please get approval from someone else if you want to
keep the definition, either as part of the initial patch or a separate
follow-on.

To be clear, I'd be happy with something that makes __atomic_is_lock_free
return true for LSE2, if that doesn't already happen (looks like it
doesn't happen?).

Thanks,
Richard

>> -       /* RELEASE.  */
>> -5:     ldxp    res0, res1, [x5]
>> +       /* RELEASE/ACQ_REL/SEQ_CST.  */
>> +4:     ldaxp   res0, res1, [x5]
>>          stlxp   w4, in0, in1, [x5]
>> -       cbnz    w4, 5b
>> +       cbnz    w4, 4b
>>          ret
>> +END (libat_exchange_16)
>
>> Please explain (here and in the commit message) why you're adding
>> acquire semantics to the RELEASE case.
>
> That merges the RELEASE with ACQ_REL/SEQ_CST cases to keep the code
> short and simple like much of the code. I've added a note in the commit msg.
>
> Cheers,
> Wilco
>
> Here is v2 - this also incorporates the PR111404 fix to compare-exchange:
>
> Enable lock-free 128-bit atomics on AArch64.  This is backwards compatible with
> existing binaries (as for these GCC always calls into libatomic, so all 128-bit
> atomic uses in  a process are switched), gives better performance than locking
> atomics and is what most users expect.
>
> Note 128-bit atomic loads use a load/store exclusive loop if LSE2 is not supported.
> This results in an implicit store which is invisible to software as long as the
> given address is writeable (which will be true when using atomics in actual code).
>
> Passes regress, OK for commit?
>
> libatomic/
>         config/linux/aarch64/atomic_16.S: Implement lock-free ARMv8.0 atomics.
>         (libat_exchange_16): Merge RELEASE and ACQ_REL/SEQ_CST cases.
>         config/linux/aarch64/host-config.h: Use atomic_16.S for baseline v8.0.
>         State we have lock-free atomics.
>
> ---
>
> diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
> index 05439ce394b9653c9bcb582761ff7aaa7c8f9643..a099037179b3f1210145baea02a9d43418629813 100644
> --- a/libatomic/config/linux/aarch64/atomic_16.S
> +++ b/libatomic/config/linux/aarch64/atomic_16.S
> @@ -22,6 +22,22 @@
>     <http://www.gnu.org/licenses/>.  */
>
>
> +/* AArch64 128-bit lock-free atomic implementation.
> +
> +   128-bit atomics are now lock-free for all AArch64 architecture versions.
> +   This is backwards compatible with existing binaries (as we swap all uses
> +   of 128-bit atomics via an ifunc) and gives better performance than locking
> +   atomics.
> +
> +   128-bit atomic loads use a exclusive loop if LSE2 is not supported.
> +   This results in an implicit store which is invisible to software as long
> +   as the given address is writeable.  Since all other atomics have explicit
> +   writes, this will be true when using atomics in actual code.
> +
> +   The libat_<op>_16 entry points are ARMv8.0.
> +   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
> +
> +
>         .arch   armv8-a+lse
>
>  #define ENTRY(name)            \
> @@ -37,6 +53,10 @@ name:                                \
>         .cfi_endproc;           \
>         .size name, .-name;
>
> +#define ALIAS(alias,name)      \
> +       .global alias;          \
> +       .set alias, name;
> +
>  #define res0 x0
>  #define res1 x1
>  #define in0  x2
> @@ -70,6 +90,24 @@ name:                                \
>  #define SEQ_CST 5
>
>
> +ENTRY (libat_load_16)
> +       mov     x5, x0
> +       cbnz    w1, 2f
> +
> +       /* RELAXED.  */
> +1:     ldxp    res0, res1, [x5]
> +       stxp    w4, res0, res1, [x5]
> +       cbnz    w4, 1b
> +       ret
> +
> +       /* ACQUIRE/CONSUME/SEQ_CST.  */
> +2:     ldaxp   res0, res1, [x5]
> +       stxp    w4, res0, res1, [x5]
> +       cbnz    w4, 2b
> +       ret
> +END (libat_load_16)
> +
> +
>  ENTRY (libat_load_16_i1)
>         cbnz    w1, 1f
>
> @@ -93,6 +131,23 @@ ENTRY (libat_load_16_i1)
>  END (libat_load_16_i1)
>
>
> +ENTRY (libat_store_16)
> +       cbnz    w4, 2f
> +
> +       /* RELAXED.  */
> +1:     ldxp    xzr, tmp0, [x0]
> +       stxp    w4, in0, in1, [x0]
> +       cbnz    w4, 1b
> +       ret
> +
> +       /* RELEASE/SEQ_CST.  */
> +2:     ldxp    xzr, tmp0, [x0]
> +       stlxp   w4, in0, in1, [x0]
> +       cbnz    w4, 2b
> +       ret
> +END (libat_store_16)
> +
> +
>  ENTRY (libat_store_16_i1)
>         cbnz    w4, 1f
>
> @@ -101,14 +156,14 @@ ENTRY (libat_store_16_i1)
>         ret
>
>         /* RELEASE/SEQ_CST.  */
> -1:     ldaxp   xzr, tmp0, [x0]
> +1:     ldxp    xzr, tmp0, [x0]
>         stlxp   w4, in0, in1, [x0]
>         cbnz    w4, 1b
>         ret
>  END (libat_store_16_i1)
>
>
> -ENTRY (libat_exchange_16_i1)
> +ENTRY (libat_exchange_16)
>         mov     x5, x0
>         cbnz    w4, 2f
>
> @@ -126,22 +181,60 @@ ENTRY (libat_exchange_16_i1)
>         stxp    w4, in0, in1, [x5]
>         cbnz    w4, 3b
>         ret
> -4:
> -       cmp     w4, RELEASE
> -       b.ne    6f
>
> -       /* RELEASE.  */
> -5:     ldxp    res0, res1, [x5]
> +       /* RELEASE/ACQ_REL/SEQ_CST.  */
> +4:     ldaxp   res0, res1, [x5]
>         stlxp   w4, in0, in1, [x5]
> -       cbnz    w4, 5b
> +       cbnz    w4, 4b
>         ret
> +END (libat_exchange_16)
>
> -       /* ACQ_REL/SEQ_CST.  */
> -6:     ldaxp   res0, res1, [x5]
> -       stlxp   w4, in0, in1, [x5]
> -       cbnz    w4, 6b
> +
> +ENTRY (libat_compare_exchange_16)
> +       ldp     exp0, exp1, [x1]
> +       cbz     w4, 3f
> +       cmp     w4, RELEASE
> +       b.hs    5f
> +
> +       /* ACQUIRE/CONSUME.  */
> +1:     ldaxp   tmp0, tmp1, [x0]
> +       cmp     tmp0, exp0
> +       ccmp    tmp1, exp1, 0, eq
> +       csel    tmp0, in0, tmp0, eq
> +       csel    tmp1, in1, tmp1, eq
> +       stxp    w4, tmp0, tmp1, [x0]
> +       cbnz    w4, 1b
> +       beq     2f
> +       stp     tmp0, tmp1, [x1]
> +2:     cset    x0, eq
> +       ret
> +
> +       /* RELAXED.  */
> +3:     ldxp    tmp0, tmp1, [x0]
> +       cmp     tmp0, exp0
> +       ccmp    tmp1, exp1, 0, eq
> +       csel    tmp0, in0, tmp0, eq
> +       csel    tmp1, in1, tmp1, eq
> +       stxp    w4, tmp0, tmp1, [x0]
> +       cbnz    w4, 3b
> +       beq     4f
> +       stp     tmp0, tmp1, [x1]
> +4:     cset    x0, eq
> +       ret
> +
> +       /* RELEASE/ACQ_REL/SEQ_CST.  */
> +5:     ldaxp   tmp0, tmp1, [x0]
> +       cmp     tmp0, exp0
> +       ccmp    tmp1, exp1, 0, eq
> +       csel    tmp0, in0, tmp0, eq
> +       csel    tmp1, in1, tmp1, eq
> +       stlxp   w4, tmp0, tmp1, [x0]
> +       cbnz    w4, 5b
> +       beq     6f
> +       stp     tmp0, tmp1, [x1]
> +6:     cset    x0, eq
>         ret
> -END (libat_exchange_16_i1)
> +END (libat_compare_exchange_16)
>
>
>  ENTRY (libat_compare_exchange_16_i1)
> @@ -180,7 +273,7 @@ ENTRY (libat_compare_exchange_16_i1)
>  END (libat_compare_exchange_16_i1)
>
>
> -ENTRY (libat_fetch_add_16_i1)
> +ENTRY (libat_fetch_add_16)
>         mov     x5, x0
>         cbnz    w4, 2f
>
> @@ -199,10 +292,10 @@ ENTRY (libat_fetch_add_16_i1)
>         stlxp   w4, tmp0, tmp1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_fetch_add_16_i1)
> +END (libat_fetch_add_16)
>
>
> -ENTRY (libat_add_fetch_16_i1)
> +ENTRY (libat_add_fetch_16)
>         mov     x5, x0
>         cbnz    w4, 2f
>
> @@ -221,10 +314,10 @@ ENTRY (libat_add_fetch_16_i1)
>         stlxp   w4, res0, res1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_add_fetch_16_i1)
> +END (libat_add_fetch_16)
>
>
> -ENTRY (libat_fetch_sub_16_i1)
> +ENTRY (libat_fetch_sub_16)
>         mov     x5, x0
>         cbnz    w4, 2f
>
> @@ -243,10 +336,10 @@ ENTRY (libat_fetch_sub_16_i1)
>         stlxp   w4, tmp0, tmp1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_fetch_sub_16_i1)
> +END (libat_fetch_sub_16)
>
>
> -ENTRY (libat_sub_fetch_16_i1)
> +ENTRY (libat_sub_fetch_16)
>         mov     x5, x0
>         cbnz    w4, 2f
>
> @@ -265,10 +358,10 @@ ENTRY (libat_sub_fetch_16_i1)
>         stlxp   w4, res0, res1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_sub_fetch_16_i1)
> +END (libat_sub_fetch_16)
>
>
> -ENTRY (libat_fetch_or_16_i1)
> +ENTRY (libat_fetch_or_16)
>         mov     x5, x0
>         cbnz    w4, 2f
>
> @@ -287,10 +380,10 @@ ENTRY (libat_fetch_or_16_i1)
>         stlxp   w4, tmp0, tmp1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_fetch_or_16_i1)
> +END (libat_fetch_or_16)
>
>
> -ENTRY (libat_or_fetch_16_i1)
> +ENTRY (libat_or_fetch_16)
>         mov     x5, x0
>         cbnz    w4, 2f
>
> @@ -309,10 +402,10 @@ ENTRY (libat_or_fetch_16_i1)
>         stlxp   w4, res0, res1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_or_fetch_16_i1)
> +END (libat_or_fetch_16)
>
>
> -ENTRY (libat_fetch_and_16_i1)
> +ENTRY (libat_fetch_and_16)
>         mov     x5, x0
>         cbnz    w4, 2f
>
> @@ -331,10 +424,10 @@ ENTRY (libat_fetch_and_16_i1)
>         stlxp   w4, tmp0, tmp1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_fetch_and_16_i1)
> +END (libat_fetch_and_16)
>
>
> -ENTRY (libat_and_fetch_16_i1)
> +ENTRY (libat_and_fetch_16)
>         mov     x5, x0
>         cbnz    w4, 2f
>
> @@ -353,10 +446,10 @@ ENTRY (libat_and_fetch_16_i1)
>         stlxp   w4, res0, res1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_and_fetch_16_i1)
> +END (libat_and_fetch_16)
>
>
> -ENTRY (libat_fetch_xor_16_i1)
> +ENTRY (libat_fetch_xor_16)
>         mov     x5, x0
>         cbnz    w4, 2f
>
> @@ -375,10 +468,10 @@ ENTRY (libat_fetch_xor_16_i1)
>         stlxp   w4, tmp0, tmp1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_fetch_xor_16_i1)
> +END (libat_fetch_xor_16)
>
>
> -ENTRY (libat_xor_fetch_16_i1)
> +ENTRY (libat_xor_fetch_16)
>         mov     x5, x0
>         cbnz    w4, 2f
>
> @@ -397,10 +490,10 @@ ENTRY (libat_xor_fetch_16_i1)
>         stlxp   w4, res0, res1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_xor_fetch_16_i1)
> +END (libat_xor_fetch_16)
>
>
> -ENTRY (libat_fetch_nand_16_i1)
> +ENTRY (libat_fetch_nand_16)
>         mov     x5, x0
>         mvn     in0, in0
>         mvn     in1, in1
> @@ -421,10 +514,10 @@ ENTRY (libat_fetch_nand_16_i1)
>         stlxp   w4, tmp0, tmp1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_fetch_nand_16_i1)
> +END (libat_fetch_nand_16)
>
>
> -ENTRY (libat_nand_fetch_16_i1)
> +ENTRY (libat_nand_fetch_16)
>         mov     x5, x0
>         mvn     in0, in0
>         mvn     in1, in1
> @@ -445,21 +538,38 @@ ENTRY (libat_nand_fetch_16_i1)
>         stlxp   w4, res0, res1, [x5]
>         cbnz    w4, 2b
>         ret
> -END (libat_nand_fetch_16_i1)
> +END (libat_nand_fetch_16)
>
>
> -ENTRY (libat_test_and_set_16_i1)
> -       mov     w2, 1
> -       cbnz    w1, 2f
> -
> -       /* RELAXED.  */
> -       swpb    w0, w2, [x0]
> -       ret
> +/* __atomic_test_and_set is always inlined, so this entry is unused and
> +   only required for completeness.  */
> +ENTRY (libat_test_and_set_16)
>
> -       /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
> -2:     swpalb  w0, w2, [x0]
> +       /* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
> +       mov     x5, x0
> +1:     ldaxrb  w0, [x5]
> +       stlxrb  w4, w2, [x5]
> +       cbnz    w4, 1b
>         ret
> -END (libat_test_and_set_16_i1)
> +END (libat_test_and_set_16)
> +
> +
> +/* Alias entry points which are the same in baseline and LSE2.  */
> +
> +ALIAS (libat_exchange_16_i1, libat_exchange_16)
> +ALIAS (libat_fetch_add_16_i1, libat_fetch_add_16)
> +ALIAS (libat_add_fetch_16_i1, libat_add_fetch_16)
> +ALIAS (libat_fetch_sub_16_i1, libat_fetch_sub_16)
> +ALIAS (libat_sub_fetch_16_i1, libat_sub_fetch_16)
> +ALIAS (libat_fetch_or_16_i1, libat_fetch_or_16)
> +ALIAS (libat_or_fetch_16_i1, libat_or_fetch_16)
> +ALIAS (libat_fetch_and_16_i1, libat_fetch_and_16)
> +ALIAS (libat_and_fetch_16_i1, libat_and_fetch_16)
> +ALIAS (libat_fetch_xor_16_i1, libat_fetch_xor_16)
> +ALIAS (libat_xor_fetch_16_i1, libat_xor_fetch_16)
> +ALIAS (libat_fetch_nand_16_i1, libat_fetch_nand_16)
> +ALIAS (libat_nand_fetch_16_i1, libat_nand_fetch_16)
> +ALIAS (libat_test_and_set_16_i1, libat_test_and_set_16)
>
>
>  /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code.  */
> diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
> index 9747accd88f5881d0496f9f8104149e74bbbe268..265b6ebd82fd7cb7fd36554d4da82ef54b3b99b5 100644
> --- a/libatomic/config/linux/aarch64/host-config.h
> +++ b/libatomic/config/linux/aarch64/host-config.h
> @@ -35,11 +35,20 @@
>  #endif
>  #define IFUNC_NCOND(N) (1)
>
> -#if N == 16 && IFUNC_ALT != 0
> +#endif /* HAVE_IFUNC */
> +
> +/* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */
> +#if N == 16
>  # define DONE 1
>  #endif
>
> -#endif /* HAVE_IFUNC */
> +/* State we have lock-free 128-bit atomics.  */
> +#undef FAST_ATOMIC_LDST_16
> +#define FAST_ATOMIC_LDST_16            1
> +#undef MAYBE_HAVE_ATOMIC_CAS_16
> +#define MAYBE_HAVE_ATOMIC_CAS_16       1
> +#undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
> +#define MAYBE_HAVE_ATOMIC_EXCHANGE_16  1
>
>  #ifdef HWCAP_USCAT
  

Patch

diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 05439ce394b9653c9bcb582761ff7aaa7c8f9643..0485c284117edf54f41959d2fab9341a9567b1cf 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,21 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 
+/* AArch64 128-bit lock-free atomic implementation.
+
+   128-bit atomics are now lock-free for all AArch64 architecture versions.
+   This is backwards compatible with existing binaries and gives better
+   performance than locking atomics.
+
+   128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+   This results in an implicit store which is invisible to software as long
+   as the given address is writeable.  Since all other atomics have explicit
+   writes, this will be true when using atomics in actual code.
+
+   The libat_<op>_16 entry points are ARMv8.0.
+   The libat_<op>_16_i1 entry points are used when LSE2 is available.  */
+
+
 	.arch	armv8-a+lse
 
 #define ENTRY(name)		\
@@ -37,6 +52,10 @@  name:				\
 	.cfi_endproc;		\
 	.size name, .-name;
 
+#define ALIAS(alias,name)	\
+	.global alias;		\
+	.set alias, name;
+
 #define res0 x0
 #define res1 x1
 #define in0  x2
@@ -70,6 +89,24 @@  name:				\
 #define SEQ_CST 5
 
 
+ENTRY (libat_load_16)
+	mov	x5, x0
+	cbnz	w1, 2f
+
+	/* RELAXED.  */
+1:	ldxp	res0, res1, [x5]
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
+	ret
+
+	/* ACQUIRE/CONSUME/SEQ_CST.  */
+2:	ldaxp	res0, res1, [x5]
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
+	ret
+END (libat_load_16)
+
+
 ENTRY (libat_load_16_i1)
 	cbnz	w1, 1f
 
@@ -93,6 +130,23 @@  ENTRY (libat_load_16_i1)
 END (libat_load_16_i1)
 
 
+ENTRY (libat_store_16)
+	cbnz	w4, 2f
+
+	/* RELAXED.  */
+1:	ldxp	xzr, tmp0, [x0]
+	stxp	w4, in0, in1, [x0]
+	cbnz	w4, 1b
+	ret
+
+	/* RELEASE/SEQ_CST.  */
+2:	ldxp	xzr, tmp0, [x0]
+	stlxp	w4, in0, in1, [x0]
+	cbnz	w4, 2b
+	ret
+END (libat_store_16)
+
+
 ENTRY (libat_store_16_i1)
 	cbnz	w4, 1f
 
@@ -101,14 +155,14 @@  ENTRY (libat_store_16_i1)
 	ret
 
 	/* RELEASE/SEQ_CST.  */
-1:	ldaxp	xzr, tmp0, [x0]
+1:	ldxp	xzr, tmp0, [x0]
 	stlxp	w4, in0, in1, [x0]
 	cbnz	w4, 1b
 	ret
 END (libat_store_16_i1)
 
 
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -126,22 +180,55 @@  ENTRY (libat_exchange_16_i1)
 	stxp	w4, in0, in1, [x5]
 	cbnz	w4, 3b
 	ret
-4:
-	cmp	w4, RELEASE
-	b.ne	6f
 
-	/* RELEASE.  */
-5:	ldxp	res0, res1, [x5]
+	/* RELEASE/ACQ_REL/SEQ_CST.  */
+4:	ldaxp	res0, res1, [x5]
 	stlxp	w4, in0, in1, [x5]
-	cbnz	w4, 5b
+	cbnz	w4, 4b
 	ret
+END (libat_exchange_16)
 
-	/* ACQ_REL/SEQ_CST.  */
-6:	ldaxp	res0, res1, [x5]
-	stlxp	w4, in0, in1, [x5]
-	cbnz	w4, 6b
+
+ENTRY (libat_compare_exchange_16)
+	ldp	exp0, exp1, [x1]
+	cbz	w4, 3f
+	cmp	w4, RELEASE
+	b.hs	4f
+
+	/* ACQUIRE/CONSUME.  */
+1:	ldaxp	tmp0, tmp1, [x0]
+	cmp	tmp0, exp0
+	ccmp	tmp1, exp1, 0, eq
+	bne	2f
+	stxp	w4, in0, in1, [x0]
+	cbnz	w4, 1b
+	mov	x0, 1
 	ret
-END (libat_exchange_16_i1)
+
+2:	stp	tmp0, tmp1, [x1]
+	mov	x0, 0
+	ret
+
+	/* RELAXED.  */
+3:	ldxp	tmp0, tmp1, [x0]
+	cmp	tmp0, exp0
+	ccmp	tmp1, exp1, 0, eq
+	bne	2b
+	stxp	w4, in0, in1, [x0]
+	cbnz	w4, 3b
+	mov	x0, 1
+	ret
+
+	/* RELEASE/ACQ_REL/SEQ_CST.  */
+4:	ldaxp	tmp0, tmp1, [x0]
+	cmp	tmp0, exp0
+	ccmp	tmp1, exp1, 0, eq
+	bne	2b
+	stlxp	w4, in0, in1, [x0]
+	cbnz	w4, 4b
+	mov	x0, 1
+	ret
+END (libat_compare_exchange_16)
 
 
 ENTRY (libat_compare_exchange_16_i1)
@@ -180,7 +267,7 @@  ENTRY (libat_compare_exchange_16_i1)
 END (libat_compare_exchange_16_i1)
 
 
-ENTRY (libat_fetch_add_16_i1)
+ENTRY (libat_fetch_add_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -199,10 +286,10 @@  ENTRY (libat_fetch_add_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_add_16_i1)
+END (libat_fetch_add_16)
 
 
-ENTRY (libat_add_fetch_16_i1)
+ENTRY (libat_add_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -221,10 +308,10 @@  ENTRY (libat_add_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_add_fetch_16_i1)
+END (libat_add_fetch_16)
 
 
-ENTRY (libat_fetch_sub_16_i1)
+ENTRY (libat_fetch_sub_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -243,10 +330,10 @@  ENTRY (libat_fetch_sub_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_sub_16_i1)
+END (libat_fetch_sub_16)
 
 
-ENTRY (libat_sub_fetch_16_i1)
+ENTRY (libat_sub_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -265,10 +352,10 @@  ENTRY (libat_sub_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_sub_fetch_16_i1)
+END (libat_sub_fetch_16)
 
 
-ENTRY (libat_fetch_or_16_i1)
+ENTRY (libat_fetch_or_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -287,10 +374,10 @@  ENTRY (libat_fetch_or_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_or_16_i1)
+END (libat_fetch_or_16)
 
 
-ENTRY (libat_or_fetch_16_i1)
+ENTRY (libat_or_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -309,10 +396,10 @@  ENTRY (libat_or_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_or_fetch_16_i1)
+END (libat_or_fetch_16)
 
 
-ENTRY (libat_fetch_and_16_i1)
+ENTRY (libat_fetch_and_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -331,10 +418,10 @@  ENTRY (libat_fetch_and_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_and_16_i1)
+END (libat_fetch_and_16)
 
 
-ENTRY (libat_and_fetch_16_i1)
+ENTRY (libat_and_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -353,10 +440,10 @@  ENTRY (libat_and_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_and_fetch_16_i1)
+END (libat_and_fetch_16)
 
 
-ENTRY (libat_fetch_xor_16_i1)
+ENTRY (libat_fetch_xor_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -375,10 +462,10 @@  ENTRY (libat_fetch_xor_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_xor_16_i1)
+END (libat_fetch_xor_16)
 
 
-ENTRY (libat_xor_fetch_16_i1)
+ENTRY (libat_xor_fetch_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
@@ -397,10 +484,10 @@  ENTRY (libat_xor_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_xor_fetch_16_i1)
+END (libat_xor_fetch_16)
 
 
-ENTRY (libat_fetch_nand_16_i1)
+ENTRY (libat_fetch_nand_16)
 	mov	x5, x0
 	mvn	in0, in0
 	mvn	in1, in1
@@ -421,10 +508,10 @@  ENTRY (libat_fetch_nand_16_i1)
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_fetch_nand_16_i1)
+END (libat_fetch_nand_16)
 
 
-ENTRY (libat_nand_fetch_16_i1)
+ENTRY (libat_nand_fetch_16)
 	mov	x5, x0
 	mvn	in0, in0
 	mvn	in1, in1
@@ -445,21 +532,38 @@  ENTRY (libat_nand_fetch_16_i1)
 	stlxp	w4, res0, res1, [x5]
 	cbnz	w4, 2b
 	ret
-END (libat_nand_fetch_16_i1)
+END (libat_nand_fetch_16)
 
 
-ENTRY (libat_test_and_set_16_i1)
-	mov	w2, 1
-	cbnz	w1, 2f
-
-	/* RELAXED.  */
-	swpb	w0, w2, [x0]
-	ret
+/* __atomic_test_and_set is always inlined, so this entry is unused and
+   only required for completeness.  */
+ENTRY (libat_test_and_set_16)
 
-	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:	swpalb	w0, w2, [x0]
+	/* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+	mov	x5, x0
+1:	ldaxrb	w0, [x5]
+	stlxrb	w4, w2, [x5]
+	cbnz	w4, 1b
 	ret
-END (libat_test_and_set_16_i1)
+END (libat_test_and_set_16)
+
+
+/* Alias entry points which are the same in baseline and LSE2.  */
+
+ALIAS (libat_exchange_16_i1, libat_exchange_16)
+ALIAS (libat_fetch_add_16_i1, libat_fetch_add_16)
+ALIAS (libat_add_fetch_16_i1, libat_add_fetch_16)
+ALIAS (libat_fetch_sub_16_i1, libat_fetch_sub_16)
+ALIAS (libat_sub_fetch_16_i1, libat_sub_fetch_16)
+ALIAS (libat_fetch_or_16_i1, libat_fetch_or_16)
+ALIAS (libat_or_fetch_16_i1, libat_or_fetch_16)
+ALIAS (libat_fetch_and_16_i1, libat_fetch_and_16)
+ALIAS (libat_and_fetch_16_i1, libat_and_fetch_16)
+ALIAS (libat_fetch_xor_16_i1, libat_fetch_xor_16)
+ALIAS (libat_xor_fetch_16_i1, libat_xor_fetch_16)
+ALIAS (libat_fetch_nand_16_i1, libat_fetch_nand_16)
+ALIAS (libat_nand_fetch_16_i1, libat_nand_fetch_16)
+ALIAS (libat_test_and_set_16_i1, libat_test_and_set_16)
 
 
 /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code.  */
diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
index bea26825b4f75bb8ff348ab4b5fc45f4a5bd561e..851c78c01cd643318aaa52929ce4550266238b79 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -35,10 +35,19 @@ 
 #endif
 #define IFUNC_NCOND(N)	(1)
 
-#if N == 16 && IFUNC_ALT != 0
+#endif /* HAVE_IFUNC */
+
+/* All 128-bit atomic functions are defined in aarch64/atomic_16.S.  */
+#if N == 16
 # define DONE 1
 #endif
 
-#endif /* HAVE_IFUNC */
+/* State we have lock-free 128-bit atomics.  */
+#undef FAST_ATOMIC_LDST_16
+#define FAST_ATOMIC_LDST_16		1
+#undef MAYBE_HAVE_ATOMIC_CAS_16
+#define MAYBE_HAVE_ATOMIC_CAS_16	1
+#undef MAYBE_HAVE_ATOMIC_EXCHANGE_16
+#define MAYBE_HAVE_ATOMIC_EXCHANGE_16	1
 
 #include_next <host-config.h>
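
A small usage sketch of the writeability caveat in the new atomic_16.S header
comment (illustrative names, not part of the patch): without LSE2 a 128-bit
atomic load is an LDXP/STXP loop, i.e. it performs an implicit store of the
value it just read, so the object being loaded must live in writable memory.

#include <stdio.h>

/* Writable object: the implicit store made by the exclusive-loop load is
   harmless and invisible to other observers.  */
static unsigned __int128 counter;

unsigned __int128
read_counter (void)
{
  return __atomic_load_n (&counter, __ATOMIC_ACQUIRE);
}

int
main (void)
{
  __atomic_store_n (&counter, ((unsigned __int128) 1 << 64) | 2,
                    __ATOMIC_RELEASE);
  printf ("low half: %llu\n", (unsigned long long) read_counter ());
  return 0;
}

The same __atomic_load_n on a const object placed in .rodata could fault on a
non-LSE2 core, which is why the comment qualifies the behaviour with "as long
as the given address is writeable".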