aarch64: Optimized implementation of pthread_spin_lock and unlock
Commit Message
An optimized spin_lock algorithm is implemented that waits for a
random time when multiple threads try to store to the spinlock at
the same time. This staggers the operations of the different
threads, reducing bus conflicts and further improving overall
performance, which benefits aarch64 in particular given its
multi-core designs.

In addition, an assembly version of spin_unlock is also implemented.
---
sysdeps/aarch64/nptl/pthread_spin_lock.S | 103 +++++++++++++++++++++++++++++
sysdeps/aarch64/nptl/pthread_spin_unlock.S | 34 ++++++++++
2 files changed, 137 insertions(+)
create mode 100644 sysdeps/aarch64/nptl/pthread_spin_lock.S
create mode 100644 sysdeps/aarch64/nptl/pthread_spin_unlock.S
Comments
On 28/10/2019 13:39, Xuelei Zhang wrote:
> An optimized spin_lock algorithm is implemented that waits for a
> random time when multiple threads try to store to the spinlock at
> the same time. This staggers the operations of the different
> threads, reducing bus conflicts and further improving overall
> performance, which benefits aarch64 in particular given its
> multi-core designs.
>
> In addition, an assembly version of spin_unlock is also implemented.
this has many issues
spinlock benchmarking is non-trivial (workload dependent,
uarch dependent and memory system dependent) and you need
to demonstrate that the new code is significantly better
than the generic c code, otherwise we don't want to
maintain asm in the aarch64 port.
the generic c code may gain improvements that outperform
whatever asm implementation (e.g. numa aware spinlock)
and allows compiling the code to different targets
(e.g. armv8.1-a+lse).
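for reference, the generic algorithm is roughly this shape in c11
atomics (a sketch, not glibc's exact source); built with
-march=armv8.1-a+lse the compiler can turn the cas into a single lse
instruction:

#include <stdatomic.h>

static int
generic_spin_lock (atomic_int *lock)
{
  int val = 0;
  /* Try to grab the lock once with an acquire CAS.  */
  if (atomic_compare_exchange_weak_explicit (lock, &val, 1,
                                             memory_order_acquire,
                                             memory_order_relaxed))
    return 0;
  do
    {
      /* Spin read-only while the lock is held so the waiters do not
         keep stealing the cache line from each other.  */
      do
        val = atomic_load_explicit (lock, memory_order_relaxed);
      while (val != 0);
    }
  while (!atomic_compare_exchange_weak_explicit (lock, &val, 1,
                                                 memory_order_acquire,
                                                 memory_order_relaxed));
  return 0;
}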
the arm arm has a spin lock implementation in the "barrier
litmus tests" section using wfe and sevl; if i had to add
asm i'd probably start with that.
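for reference, that wfe/sevl acquire loop looks roughly like this
wrapped in gcc inline asm (a sketch; labels and register choices are
illustrative, not the arm arm's exact listing):

#include <stdint.h>

static void
wfe_spin_lock (uint32_t *lock)
{
  uint32_t tmp;
  __asm__ __volatile__ (
    "        sevl\n"                   /* arm the first wfe so it falls through */
    "1:      wfe\n"                    /* sleep until an event (or spuriously) */
    "2:      ldaxr   %w0, [%1]\n"      /* acquire load; sets the exclusive monitor */
    "        cbnz    %w0, 1b\n"        /* still held: the unlocking store clears
                                          the monitor and wakes the wfe */
    "        stxr    %w0, %w2, [%1]\n" /* try to claim the lock */
    "        cbnz    %w0, 2b\n"        /* lost the race: retry the load */
    : "=&r" (tmp)
    : "r" (lock), "r" (1)
    : "memory");
}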
> +ENTRY (pthread_spin_lock)
> + DELOUSE (0)
> +
> + mov w1, #0x1
> +
> +L(spin):
> + prfm pldl1strm, [x0]
> + /* A count used to select the waiting time.
> + The larger the value, the more contended the lock,
> + and the longer the wait. */
> + add w3, w3, #0x1
w3 is not initialized; i would not call that "wait for a
random time", since the lock contention is most likely
between the exact same code path executing in different
threads with identical w3 at function entry.
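for comparison, randomized backoff normally keeps per-thread prng
state so that identical code paths still pick different delays; a
rough c sketch (all names and constants are illustrative):

#include <stdatomic.h>
#include <stdint.h>

static __thread uint32_t backoff_rng;  /* per-thread xorshift state */

static uint32_t
next_backoff (void)
{
  uint32_t x = backoff_rng;
  if (x == 0)
    /* Lazily seed from a per-thread TLS address so each thread gets a
       different sequence; force the seed nonzero.  */
    x = (uint32_t) (uintptr_t) &backoff_rng | 1;
  x ^= x << 13;
  x ^= x >> 17;
  x ^= x << 5;
  backoff_rng = x;
  return x;
}

static void
backoff_spin_lock (atomic_int *lock)
{
  unsigned int cap = 64;  /* delay cap; doubles on each failed attempt */
  int expected = 0;
  while (!atomic_compare_exchange_weak_explicit (lock, &expected, 1,
                                                 memory_order_acquire,
                                                 memory_order_relaxed))
    {
      unsigned int delay = next_backoff () % cap;
      while (delay-- > 0)
        __asm__ __volatile__ ("yield");
      if (cap < 8192)
        cap <<= 1;
      expected = 0;  /* the failed CAS overwrote it with the lock value */
    }
}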
> + ldaxr w2, [x0]
> + cbnz w2, L(spin)
> + stxr w2, w1, [x0]
> + cbnz w2, 1f
> + b L(end)
> +
> +L(end):
> + mov w0, #0x0
> + ret
> +
> + /* Set the loop count of L(wait) between 1000 and 7000 cycles;
> + each 1000 cycles waits roughly 1~2us. */
> + .p2align 4
> +7:
> + mov w6, #0x1b58
> + b L(wait_init)
> +6:
> + mov w6, #0x1770
> + b L(wait_init)
> +5:
> + mov w6, #0x1388
> + b L(wait_init)
> +4:
> + mov w6, #0x0fa0
> + b L(wait_init)
> +3:
> + mov w6, #0x0bb8
> + b L(wait_init)
> +2:
> + mov w6, #0x07d0
> + b L(wait_init)
> +1:
> + mov w6, #0x03e8
> + b L(wait_init)
> +
> +L(wait_init):
> + mov w5, #0x0
> +L(wait):
> + add w5, w5, #0x1
> + cmp w5, w6
> + b.lt L(wait) /* Wait ends when w5 equals w6. */
> +
> +L(stxr_try):
> + ldr w2, [x0]
> + cbz w2, L(spin)
> + and w3, w3, #0x07
> + /* Select one of 8 waiting times based on
> + the lower three bits of w3. */
> + cmp w3, #0x01
> + beq 1b
> + cmp w3, #0x02
> + beq 2b
> + cmp w3, #0x03
> + beq 3b
> + cmp w3, #0x04
> + beq 4b
> + cmp w3, #0x05
> + beq 5b
> + cmp w3, #0x06
> + beq 6b
> + cmp w3, #0x07
> + beq 7b
> +
> + wfe /* w3 low bits = 0: the most contended case, so wait ~50 us */
> + b L(stxr_try)
> +
> +
> +END (pthread_spin_lock)
> +libc_hidden_builtin_def (pthread_spin_lock)
> +weak_alias (pthread_spin_lock, index)
that weak_alias is wrong.
> +ENTRY (pthread_spin_unlock)
> + DELOUSE (0)
> + stlr wzr, [x0]
note: current glibc code would generate
dmb ish
str wzr, [x0]
instead of stlr, which is faster on some cores,
but has weaker ordering guarantees.
> + mov w0, #0x0
> + ret
> +
> +END (pthread_spin_unlock)
> +libc_hidden_builtin_def (pthread_spin_unlock)
> +weak_alias (pthread_spin_unlock, index)
this weak_alias is wrong too.
On Mon, Oct 28, 2019 at 6:39 AM Xuelei Zhang <zhangxuelei4@huawei.com> wrote:
>
> +L(wait_init):
> + mov w5, #0x0
> +L(wait):
> + add w5, w5, #0x1
> + cmp w5, w6
> + b.lt L(wait) /* Wait ends when w5 equals w6. */
I could see this loop even taking less than one cycle per iteration
with a micro-op loop cache.
But really my bet is w3 will be 0 99% of the time, so it won't be
that random after all.
If this had the ability to become a ticket lock, then you could use
the difference between the ticket and the next ticket for the loop
time. But since the ABI is set in stone (that is the unlocked value
needs to be all zeros), it does not have the ability to become a
ticket spin lock.
Maybe one way is to use the lower bits of the thread ID. I don't know
if that would be fast enough to get, though. Maybe you could do
popcount on a TLS address; though that takes a few cycles due to
using the vector registers (no scalar popcount instruction).
Thanks,
Andrew Pinski
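For illustration, deriving a backoff slot from the thread ID along
these lines might look like the following rough sketch (the names are
illustrative, and whether it is cheap enough is exactly the open
question):

#include <stdint.h>
#include <pthread.h>

static unsigned int
backoff_slot (void)
{
  /* On glibc pthread_self () is the thread descriptor address, which
     is strongly aligned, so fold higher bits down before taking the
     low three.  */
  uintptr_t p = (uintptr_t) pthread_self ();
  p ^= p >> 6;
  p ^= p >> 12;
  return (unsigned int) (p & 0x7);  /* 8 slots, like the patch's w3 & 7 */
}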
On Mon, Oct 28, 2019 at 6:39 AM Xuelei Zhang <zhangxuelei4@huawei.com> wrote:
>
> +ENTRY (pthread_spin_unlock)
> + DELOUSE (0)
> + stlr wzr, [x0]
> + mov w0, #0x0
Actually the C version of pthread_spin_unlock is implemented this way
already and provides exactly this same assembly.
Why do you need an assembly version?
Thanks,
Andrew Pinski
PS for reference:
./nptl/pthread_spin_unlock.o: file format elf64-littleaarch64
Disassembly of section .text:
0000000000000000 <pthread_spin_unlock>:
0: 889ffc1f stlr wzr, [x0]
4: 52800000 mov w0, #0x0 // #0
8: d65f03c0 ret
On 28/10/2019 17:53, Andrew Pinski wrote:
> On Mon, Oct 28, 2019 at 6:39 AM Xuelei Zhang <zhangxuelei4@huawei.com> wrote:
>> +ENTRY (pthread_spin_unlock)
>> + DELOUSE (0)
>> + stlr wzr, [x0]
>> + mov w0, #0x0
>
> Actually the C version of pthread_spin_unlock is implemented this way
> already and provides exactly this same assembly.
> Why do you need an assembly version?
if lock is in asm, then unlock must be in asm too.
we don't want to reason about synchronization
between asm and c code as their memory model is
defined differently and we don't want to rely
on a particular implementation in the compiler.
and the generic c code may change independently
(e.g. spin_unlock used to be barrier + store
but changed at some point; such a change could easily
break the synchronization without us noticing)
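in c11 terms the two formulations are (a sketch of the codegen
difference, assuming current gcc behaviour on aarch64):

#include <stdatomic.h>

static void
unlock_release (atomic_int *lock)
{
  /* compiles to a single stlr on aarch64 */
  atomic_store_explicit (lock, 0, memory_order_release);
}

static void
unlock_barrier (atomic_int *lock)
{
  /* compiles to dmb ish followed by a plain str */
  atomic_thread_fence (memory_order_seq_cst);
  atomic_store_explicit (lock, 0, memory_order_relaxed);
}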
i'd prefer to improve the generic c code if
possible instead of using asm.
Hi,
> i'd prefer to improve the generic c code if
> possible instead of using asm.
Agreed, there isn't really anything performance critical that assembler
could speed up. It's more about what strategy to use if there is any
contention.
But we really need representative benchmarks first. Without that it's
impossible to decide whether a new spinlock implementation is faster.
Cheers,
Wilco
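For reference, a minimal contended-lock microbenchmark of this kind
could look like the sketch below; the parameters are illustrative, and
a representative evaluation would also vary critical-section length,
thread placement and core count.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static pthread_spinlock_t lock;
static volatile long counter;
static const long iters = 1000000;

static void *
worker (void *arg)
{
  (void) arg;
  for (long i = 0; i < iters; i++)
    {
      pthread_spin_lock (&lock);
      counter++;                      /* tiny critical section */
      pthread_spin_unlock (&lock);
    }
  return NULL;
}

int
main (int argc, char **argv)
{
  int nthreads = argc > 1 ? atoi (argv[1]) : 4;
  if (nthreads < 1 || nthreads > 64)
    nthreads = 4;
  pthread_t threads[64];
  pthread_spin_init (&lock, PTHREAD_PROCESS_PRIVATE);

  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (int i = 0; i < nthreads; i++)
    pthread_create (&threads[i], NULL, worker, NULL);
  for (int i = 0; i < nthreads; i++)
    pthread_join (threads[i], NULL);
  clock_gettime (CLOCK_MONOTONIC, &t1);

  double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
  printf ("%d threads: %.2fM lock/unlock pairs per second\n",
          nthreads, nthreads * iters / secs / 1e6);
  return 0;
}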