CAS instruction is expensive. From the x86 CPU's point of view, getting
a cache line for writing is more expensive than reading. See Appendix
A.2 Spinlock in:
https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
The full compare and swap will grab the cache line exclusive and cause
excessive cache line bouncing.
1. Change low level locks to do an atomic load and skip CAS if compare
may fail to reduce cache line bouncing on contended locks.
2. In __lll_lock, replace atomic_compare_and_exchange_bool_acq with
atomic_compare_and_exchange_val_acq and pass down the result to
__lll_lock_wait and __lll_lock_wait_private to avoid the redundant load
there.
---
nptl/lowlevellock.c | 12 ++++++------
sysdeps/nptl/lowlevellock.h | 33 +++++++++++++++++++++++----------
2 files changed, 29 insertions(+), 16 deletions(-)
@@ -22,30 +22,30 @@
#include <stap-probe.h>
void
-__lll_lock_wait_private (int *futex)
+__lll_lock_wait_private (int *futex, int futex_value)
{
- if (atomic_load_relaxed (futex) == 2)
+ if (futex_value == 2)
goto futex;
while (atomic_exchange_acquire (futex, 2) != 0)
{
futex:
- LIBC_PROBE (lll_lock_wait_private, 1, futex);
+ LIBC_PROBE (lll_lock_wait_private, 2, futex, futex_value);
futex_wait ((unsigned int *) futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */
}
}
libc_hidden_def (__lll_lock_wait_private)
void
-__lll_lock_wait (int *futex, int private)
+__lll_lock_wait (int *futex, int futex_value, int private)
{
- if (atomic_load_relaxed (futex) == 2)
+ if (futex_value == 2)
goto futex;
while (atomic_exchange_acquire (futex, 2) != 0)
{
futex:
- LIBC_PROBE (lll_lock_wait, 1, futex);
+ LIBC_PROBE (lll_lock_wait, 2, futex, futex_value);
futex_wait ((unsigned int *) futex, 2, private); /* Wait if *futex == 2. */
}
}
@@ -66,7 +66,12 @@
0. Otherwise leave lock unchanged and return non-zero to indicate that the
lock was not acquired. */
#define __lll_trylock(lock) \
- __glibc_unlikely (atomic_compare_and_exchange_bool_acq ((lock), 1, 0))
+ (__extension__ ({ \
+ __typeof (*(lock)) __lock_value = atomic_load_relaxed (lock); \
+ (__lock_value != 0 \
+ || __glibc_unlikely (atomic_compare_and_exchange_bool_acq ((lock), \
+ 1, 0)));\
+ }))
#define lll_trylock(lock) \
__lll_trylock (&(lock))
@@ -74,11 +79,16 @@
return 0. Otherwise leave lock unchanged and return non-zero to indicate
that the lock was not acquired. */
#define lll_cond_trylock(lock) \
- __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0))
-
-extern void __lll_lock_wait_private (int *futex);
+ (__extension__ ({ \
+ __typeof (lock) __lock_value = atomic_load_relaxed (&(lock)); \
+ (__lock_value != 0 \
+ || __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock),\
+ 2, 0)));\
+ }))
+
+extern void __lll_lock_wait_private (int *futex, int futex_value);
libc_hidden_proto (__lll_lock_wait_private)
-extern void __lll_lock_wait (int *futex, int private);
+extern void __lll_lock_wait (int *futex, int futex_value, int private);
libc_hidden_proto (__lll_lock_wait)
/* This is an expression rather than a statement even though its value is
@@ -95,13 +105,15 @@ libc_hidden_proto (__lll_lock_wait)
((void) \
({ \
int *__futex = (futex); \
- if (__glibc_unlikely \
- (atomic_compare_and_exchange_bool_acq (__futex, 1, 0))) \
+ int __futex_value = atomic_load_relaxed (futex); \
+ if (__futex_value != 0 \
+ || ((__futex_value = atomic_compare_and_exchange_val_acq \
+ (__futex, 1, 0) != 0))) \
{ \
if (__builtin_constant_p (private) && (private) == LLL_PRIVATE) \
- __lll_lock_wait_private (__futex); \
+ __lll_lock_wait_private (futex, __futex_value); \
else \
- __lll_lock_wait (__futex, private); \
+ __lll_lock_wait (futex, __futex_value, private); \
} \
}))
#define lll_lock(futex, private) \
@@ -120,7 +132,8 @@ libc_hidden_proto (__lll_lock_wait)
({ \
int *__futex = (futex); \
if (__glibc_unlikely (atomic_exchange_acq (__futex, 2) != 0)) \
- __lll_lock_wait (__futex, private); \
+ __lll_lock_wait (__futex, atomic_load_relaxed (__futex), \
+ private); \
}))
#define lll_cond_lock(futex, private) __lll_cond_lock (&(futex), private)