diff mbox series

[v5,1/3] Reduce CAS in low level locks [BZ #28537]

Message ID 20211110184153.2269857-2-hjl.tools@gmail.com
State Superseded
Headers show
Series Optimize CAS [BZ #28537] | expand

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

H.J. Lu Nov. 10, 2021, 6:41 p.m. UTC
The CAS instruction is expensive.  From the x86 CPU's point of view,
acquiring a cache line for writing is more expensive than acquiring it
for reading.  See Appendix A.2 Spinlock in:

https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf

A full compare-and-swap acquires the cache line in exclusive state and
causes excessive cache line bouncing.

1. Change low level locks to do an atomic load first and skip the CAS
when the compare is expected to fail, which reduces cache line bouncing
on contended locks.
2. In __lll_lock, replace atomic_compare_and_exchange_bool_acq with
atomic_compare_and_exchange_val_acq and pass down the result to
__lll_lock_wait and __lll_lock_wait_private to avoid the redundant load
there.
---
 nptl/lowlevellock.c         | 12 ++++++------
 sysdeps/nptl/lowlevellock.h | 33 +++++++++++++++++++++++----------
 2 files changed, 29 insertions(+), 16 deletions(-)
diff mbox series

Patch

diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c
index 8c740529c4..d1965c01ca 100644
--- a/nptl/lowlevellock.c
+++ b/nptl/lowlevellock.c
@@ -22,30 +22,30 @@ 
 #include <stap-probe.h>
 
 void
-__lll_lock_wait_private (int *futex)
+__lll_lock_wait_private (int *futex, int futex_value)
 {
-  if (atomic_load_relaxed (futex) == 2)
+  if (futex_value == 2)
     goto futex;
 
   while (atomic_exchange_acquire (futex, 2) != 0)
     {
     futex:
-      LIBC_PROBE (lll_lock_wait_private, 1, futex);
+      LIBC_PROBE (lll_lock_wait_private, 2, futex, futex_value);
       futex_wait ((unsigned int *) futex, 2, LLL_PRIVATE); /* Wait if *futex == 2.  */
     }
 }
 libc_hidden_def (__lll_lock_wait_private)
 
 void
-__lll_lock_wait (int *futex, int private)
+__lll_lock_wait (int *futex, int futex_value, int private)
 {
-  if (atomic_load_relaxed (futex) == 2)
+  if (futex_value == 2)
     goto futex;
 
   while (atomic_exchange_acquire (futex, 2) != 0)
     {
     futex:
-      LIBC_PROBE (lll_lock_wait, 1, futex);
+      LIBC_PROBE (lll_lock_wait, 2, futex, futex_value);
       futex_wait ((unsigned int *) futex, 2, private); /* Wait if *futex == 2.  */
     }
 }
diff --git a/sysdeps/nptl/lowlevellock.h b/sysdeps/nptl/lowlevellock.h
index 4d95114ed3..05260eb706 100644
--- a/sysdeps/nptl/lowlevellock.h
+++ b/sysdeps/nptl/lowlevellock.h
@@ -66,7 +66,12 @@ 
    0.  Otherwise leave lock unchanged and return non-zero to indicate that the
    lock was not acquired.  */
 #define __lll_trylock(lock)	\
-  __glibc_unlikely (atomic_compare_and_exchange_bool_acq ((lock), 1, 0))
+  (__extension__ ({							\
+    __typeof (*(lock)) __lock_value = atomic_load_relaxed (lock);	\
+    (__lock_value != 0							\
+     || __glibc_unlikely (atomic_compare_and_exchange_bool_acq ((lock),	\
+								1, 0)));\
+  }))
 #define lll_trylock(lock)	\
    __lll_trylock (&(lock))
 
@@ -74,11 +79,16 @@ 
    return 0.  Otherwise leave lock unchanged and return non-zero to indicate
    that the lock was not acquired.  */
 #define lll_cond_trylock(lock)	\
-  __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0))
-
-extern void __lll_lock_wait_private (int *futex);
+  (__extension__ ({							\
+    __typeof (lock) __lock_value = atomic_load_relaxed (&(lock));	\
+    (__lock_value != 0							\
+     || __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock),\
+								2, 0)));\
+  }))
+
+extern void __lll_lock_wait_private (int *futex, int futex_value);
 libc_hidden_proto (__lll_lock_wait_private)
-extern void __lll_lock_wait (int *futex, int private);
+extern void __lll_lock_wait (int *futex, int futex_value, int private);
 libc_hidden_proto (__lll_lock_wait)
 
 /* This is an expression rather than a statement even though its value is
@@ -95,13 +105,15 @@  libc_hidden_proto (__lll_lock_wait)
   ((void)                                                               \
    ({                                                                   \
      int *__futex = (futex);                                            \
-     if (__glibc_unlikely                                               \
-         (atomic_compare_and_exchange_bool_acq (__futex, 1, 0)))        \
+     int __futex_value = atomic_load_relaxed (futex);                   \
+     if (__futex_value != 0                                             \
+         || ((__futex_value = atomic_compare_and_exchange_val_acq       \
+              (__futex, 1, 0) != 0)))                                   \
        {                                                                \
          if (__builtin_constant_p (private) && (private) == LLL_PRIVATE) \
-           __lll_lock_wait_private (__futex);                           \
+           __lll_lock_wait_private (futex, __futex_value);              \
          else                                                           \
-           __lll_lock_wait (__futex, private);                          \
+           __lll_lock_wait (futex, __futex_value, private);             \
        }                                                                \
    }))
 #define lll_lock(futex, private)	\
@@ -120,7 +132,8 @@  libc_hidden_proto (__lll_lock_wait)
    ({                                                                   \
      int *__futex = (futex);                                            \
      if (__glibc_unlikely (atomic_exchange_acq (__futex, 2) != 0))      \
-       __lll_lock_wait (__futex, private);                              \
+       __lll_lock_wait (__futex, atomic_load_relaxed (__futex),         \
+			private); \
    }))
 #define lll_cond_lock(futex, private) __lll_cond_lock (&(futex), private)